View Javadoc
1   /*
2    * ====================================================================
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *   http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing,
14   * software distributed under the License is distributed on an
15   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16   * KIND, either express or implied.  See the License for the
17   * specific language governing permissions and limitations
18   * under the License.
19   * ====================================================================
20   *
21   * This software consists of voluntary contributions made by many
22   * individuals on behalf of the Apache Software Foundation.  For more
23   * information on the Apache Software Foundation, please see
24   * <http://www.apache.org/>.
25   *
26   */
27  
28  package org.apache.hc.client5.http.psl;
29  
30  import java.io.InputStream;
31  import java.io.InputStreamReader;
32  import java.net.URL;
33  import java.nio.charset.StandardCharsets;
34  import java.util.List;
35  
36  import org.junit.jupiter.api.Assertions;
37  import org.junit.jupiter.api.BeforeEach;
38  import org.junit.jupiter.api.Test;
39  
40  class TestPublicSuffixMatcher {
41  
42      private static final String SOURCE_FILE = "suffixlistmatcher.txt";
43      private static final String PUBLIC_SUFFIX_LIST_FILE = "org/publicsuffix/list/effective_tld_names.dat";
44  
45      private PublicSuffixMatcher matcher;
46      private PublicSuffixMatcher pslMatcher;
47  
48      /**
49       * Create a matcher using the public suffix list file provided by publicsuffix.org (Mozilla).
50       *
51       * This test uses a copy of https://publicsuffix.org/list/effective_tld_names.dat in
52       * src/main/resources/org/publicsuffix/list/effective_tld_names.dat
53       */
54      @BeforeEach
55      void setUp() throws Exception {
56          final ClassLoader classLoader = getClass().getClassLoader();
57          // Create a matcher using a custom crafted public suffix list file
58          try (InputStream in = classLoader.getResourceAsStream(SOURCE_FILE)) {
59              Assertions.assertNotNull(in, SOURCE_FILE);
60              final List<PublicSuffixList> lists = PublicSuffixListParser.INSTANCE.parseByType(new InputStreamReader(in, StandardCharsets.UTF_8));
61              matcher = new PublicSuffixMatcher(lists);
62          }
63          final URL publicSuffixListUrl = classLoader.getResource(PUBLIC_SUFFIX_LIST_FILE);
64          Assertions.assertNotNull(publicSuffixListUrl, PUBLIC_SUFFIX_LIST_FILE);
65          pslMatcher = PublicSuffixMatcherLoader.load(publicSuffixListUrl);
66      }
67  
68      @Test
69      void testGetDomainRootAnyType() {
70          // ICANN
71          Assertions.assertEquals(null, matcher.getDomainRoot("com"));
72          Assertions.assertEquals("blah.com", matcher.getDomainRoot("blah.com"));
73          Assertions.assertEquals("foo.com", matcher.getDomainRoot("foo.com"));
74          Assertions.assertEquals(null, matcher.getDomainRoot("blah.foo.com"));
75          Assertions.assertEquals(null, matcher.getDomainRoot("booh.foo.com"));
76          Assertions.assertEquals("blah.blah.foo.com", matcher.getDomainRoot("blah.blah.foo.com"));
77  
78          Assertions.assertEquals(null, matcher.getDomainRoot("kioto.jp"));
79          Assertions.assertEquals(null, matcher.getDomainRoot("tokyo.jp"));
80          Assertions.assertEquals(null, matcher.getDomainRoot("blah.tokyo.jp"));
81          Assertions.assertEquals(null, matcher.getDomainRoot("booh.tokyo.jp"));
82          Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp"));
83          Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp"));
84          Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.ac.jp"));
85          Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp"));
86          Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp"));
87  
88          // Private
89          Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX"));
90          Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX"));
91          Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX"));
92          Assertions.assertEquals(null, matcher.getDomainRoot("appspot.com"));
93          Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com"));
94          // Too short
95          Assertions.assertNull(matcher.getDomainRoot("jp"));
96          Assertions.assertNull(matcher.getDomainRoot("ac.jp"));
97          Assertions.assertNull(matcher.getDomainRoot("any.tokyo.jp"));
98          // Unknown
99          Assertions.assertEquals(null, matcher.getDomainRoot("garbage"));
100         Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("garbage.garbage"));
101         Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage"));
102         Assertions.assertEquals("garbage.garbage", matcher.getDomainRoot("*.garbage.garbage.garbage"));
103 
104         Assertions.assertEquals(null, matcher.getDomainRoot("*.compute-1.amazonaws.com"));
105         Assertions.assertEquals(null, matcher.getDomainRoot("blah.compute-1.amazonaws.com"));
106         Assertions.assertEquals("blah.blah.compute-1.amazonaws.com", matcher.getDomainRoot("blah.blah.compute-1.amazonaws.com"));
107     }
108 
109     @Test
110     void testGetDomainRootOnlyPRIVATE() {
111         // Private
112         Assertions.assertEquals("example.xx", matcher.getDomainRoot("example.XX", DomainType.PRIVATE));
113         Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.example.XX", DomainType.PRIVATE));
114         Assertions.assertEquals("example.xx", matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.PRIVATE));
115         Assertions.assertEquals("example.appspot.com", matcher.getDomainRoot("example.appspot.com"));
116         // Too short
117         Assertions.assertNull(matcher.getDomainRoot("jp", DomainType.PRIVATE));
118         Assertions.assertNull(matcher.getDomainRoot("ac.jp", DomainType.PRIVATE));
119         Assertions.assertNull(matcher.getDomainRoot("any.tokyo.jp", DomainType.PRIVATE));
120         // ICANN
121         Assertions.assertNull(matcher.getDomainRoot("metro.tokyo.jp", DomainType.PRIVATE));
122         Assertions.assertNull(matcher.getDomainRoot("blah.blah.tokyo.jp", DomainType.PRIVATE));
123         Assertions.assertNull(matcher.getDomainRoot("blah.blah.ac.jp", DomainType.PRIVATE));
124         // Unknown
125         Assertions.assertNull(matcher.getDomainRoot("garbage", DomainType.PRIVATE));
126         Assertions.assertNull(matcher.getDomainRoot("garbage.garbage", DomainType.PRIVATE));
127         Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage", DomainType.PRIVATE));
128         Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage.garbage", DomainType.PRIVATE));
129     }
130 
131     @Test
132     void testGetDomainRootOnlyICANN() {
133         // Private
134         Assertions.assertNull(matcher.getDomainRoot("example.XX", DomainType.ICANN));
135         Assertions.assertNull(matcher.getDomainRoot("www.example.XX", DomainType.ICANN));
136         Assertions.assertNull(matcher.getDomainRoot("www.blah.blah.example.XX", DomainType.ICANN));
137         // Too short
138         Assertions.assertNull(matcher.getDomainRoot("xx", DomainType.ICANN));
139         Assertions.assertNull(matcher.getDomainRoot("jp", DomainType.ICANN));
140         Assertions.assertNull(matcher.getDomainRoot("ac.jp", DomainType.ICANN));
141         Assertions.assertNull(matcher.getDomainRoot("any.tokyo.jp", DomainType.ICANN));
142         // ICANN
143         Assertions.assertEquals("metro.tokyo.jp", matcher.getDomainRoot("metro.tokyo.jp", DomainType.ICANN));
144         Assertions.assertEquals("blah.blah.tokyo.jp", matcher.getDomainRoot("blah.blah.tokyo.jp", DomainType.ICANN));
145         Assertions.assertEquals("blah.ac.jp", matcher.getDomainRoot("blah.blah.ac.jp", DomainType.ICANN));
146         // Unknown
147         Assertions.assertNull(matcher.getDomainRoot("garbage", DomainType.ICANN));
148         Assertions.assertNull(matcher.getDomainRoot("garbage.garbage", DomainType.ICANN));
149         Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage", DomainType.ICANN));
150         Assertions.assertNull(matcher.getDomainRoot("*.garbage.garbage.garbage", DomainType.ICANN));
151     }
152 
153     @Test
154     void testMatch() {
155         Assertions.assertTrue(matcher.matches(".jp"));
156         Assertions.assertTrue(matcher.matches(".ac.jp"));
157         Assertions.assertTrue(matcher.matches(".any.tokyo.jp"));
158         Assertions.assertTrue(matcher.matches(".xx"));
159         Assertions.assertTrue(matcher.matches(".appspot.com"));
160         // exception
161         Assertions.assertFalse(matcher.matches(".metro.tokyo.jp"));
162     }
163 
164     @Test
165     void testMatchUnicode() {
166         Assertions.assertTrue(matcher.matches(".h\u00E5.no")); // \u00E5 is <aring>
167         Assertions.assertTrue(matcher.matches(".xn--h-2fa.no"));
168         Assertions.assertTrue(matcher.matches(".h\u00E5.no"));
169         Assertions.assertTrue(matcher.matches(".xn--h-2fa.no"));
170     }
171 
172     private void checkPublicSuffix(final String input, final String expected) {
173         Assertions.assertEquals(expected, pslMatcher.getDomainRoot(input));
174     }
175 
176     //see https://github.com/publicsuffix/list/blob/master/tests/test_psl.txt
177     @Test
178     void testGetDomainRootPublicSuffixList() {
179          // null input.
180         checkPublicSuffix(null, null);
181         // Mixed case.
182         checkPublicSuffix("COM", null);
183         checkPublicSuffix("example.COM", "example.com");
184         checkPublicSuffix("WwW.example.COM", "example.com");
185         // Leading dot.
186         checkPublicSuffix(".com", null);
187         checkPublicSuffix(".example", null);
188         checkPublicSuffix(".example.com", null);
189         checkPublicSuffix(".example.example", null);
190         // Unlisted TLD.
191         checkPublicSuffix("example", null);
192         checkPublicSuffix("example.example", "example.example");
193         checkPublicSuffix("b.example.example", "example.example");
194         checkPublicSuffix("a.b.example.example", "example.example");
195         // Listed, but non-Internet, TLD.
196         //checkPublicSuffix("local", null);
197         //checkPublicSuffix("example.local", null);
198         //checkPublicSuffix("b.example.local", null);
199         //checkPublicSuffix("a.b.example.local", null);
200         // TLD with only 1 rule.
201         checkPublicSuffix("biz", null);
202         checkPublicSuffix("domain.biz", "domain.biz");
203         checkPublicSuffix("b.domain.biz", "domain.biz");
204         checkPublicSuffix("a.b.domain.biz", "domain.biz");
205         // TLD with some 2-level rules.
206         checkPublicSuffix("com", null);
207         checkPublicSuffix("example.com", "example.com");
208         checkPublicSuffix("b.example.com", "example.com");
209         checkPublicSuffix("a.b.example.com", "example.com");
210         checkPublicSuffix("uk.com", null);
211         checkPublicSuffix("example.uk.com", "example.uk.com");
212         checkPublicSuffix("b.example.uk.com", "example.uk.com");
213         checkPublicSuffix("a.b.example.uk.com", "example.uk.com");
214         checkPublicSuffix("test.ac", "test.ac");
215         // TLD with only 1 (wildcard) rule.
216         checkPublicSuffix("mm", null);
217         checkPublicSuffix("c.mm", null);
218         checkPublicSuffix("b.c.mm", "b.c.mm");
219         checkPublicSuffix("a.b.c.mm", "b.c.mm");
220         // More complex TLD.
221         checkPublicSuffix("jp", null);
222         checkPublicSuffix("test.jp", "test.jp");
223         checkPublicSuffix("www.test.jp", "test.jp");
224         checkPublicSuffix("ac.jp", null);
225         checkPublicSuffix("test.ac.jp", "test.ac.jp");
226         checkPublicSuffix("www.test.ac.jp", "test.ac.jp");
227         checkPublicSuffix("kyoto.jp", null);
228         checkPublicSuffix("test.kyoto.jp", "test.kyoto.jp");
229         checkPublicSuffix("ide.kyoto.jp", null);
230         checkPublicSuffix("b.ide.kyoto.jp", "b.ide.kyoto.jp");
231         checkPublicSuffix("a.b.ide.kyoto.jp", "b.ide.kyoto.jp");
232         checkPublicSuffix("c.kobe.jp", null);
233         checkPublicSuffix("b.c.kobe.jp", "b.c.kobe.jp");
234         checkPublicSuffix("a.b.c.kobe.jp", "b.c.kobe.jp");
235         checkPublicSuffix("city.kobe.jp", "city.kobe.jp");
236         checkPublicSuffix("www.city.kobe.jp", "city.kobe.jp");
237         // TLD with a wildcard rule and exceptions.
238         checkPublicSuffix("ck", null);
239         checkPublicSuffix("test.ck", null);
240         checkPublicSuffix("b.test.ck", "b.test.ck");
241         checkPublicSuffix("a.b.test.ck", "b.test.ck");
242         checkPublicSuffix("www.ck", "www.ck");
243         checkPublicSuffix("www.www.ck", "www.ck");
244         // US K12.
245         checkPublicSuffix("us", null);
246         checkPublicSuffix("test.us", "test.us");
247         checkPublicSuffix("www.test.us", "test.us");
248         checkPublicSuffix("ak.us", null);
249         checkPublicSuffix("test.ak.us", "test.ak.us");
250         checkPublicSuffix("www.test.ak.us", "test.ak.us");
251         checkPublicSuffix("k12.ak.us", null);
252         checkPublicSuffix("test.k12.ak.us", "test.k12.ak.us");
253         checkPublicSuffix("www.test.k12.ak.us", "test.k12.ak.us");
254         // IDN labels.
255         checkPublicSuffix("食狮.com.cn", "食狮.com.cn");
256         checkPublicSuffix("食狮.公司.cn", "食狮.公司.cn");
257         checkPublicSuffix("www.食狮.公司.cn", "食狮.公司.cn");
258         checkPublicSuffix("shishi.公司.cn", "shishi.公司.cn");
259         checkPublicSuffix("公司.cn", null);
260         checkPublicSuffix("食狮.中国", "食狮.中国");
261         checkPublicSuffix("www.食狮.中国", "食狮.中国");
262         checkPublicSuffix("shishi.中国", "shishi.中国");
263         checkPublicSuffix("中国", null);
264         // Same as above, but punycoded.
265         checkPublicSuffix("xn--85x722f.com.cn", "xn--85x722f.com.cn");
266         checkPublicSuffix("xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
267         checkPublicSuffix("www.xn--85x722f.xn--55qx5d.cn", "xn--85x722f.xn--55qx5d.cn");
268         checkPublicSuffix("shishi.xn--55qx5d.cn", "shishi.xn--55qx5d.cn");
269         checkPublicSuffix("xn--55qx5d.cn", null);
270         checkPublicSuffix("xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
271         checkPublicSuffix("www.xn--85x722f.xn--fiqs8s", "xn--85x722f.xn--fiqs8s");
272         checkPublicSuffix("shishi.xn--fiqs8s", "shishi.xn--fiqs8s");
273         checkPublicSuffix("xn--fiqs8s", null);
274     }
275 
276 }