From f6bcec9200f48fcf758cc349cf9480c947259c7e Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 25 Oct 2017 15:28:03 +0200 Subject: [PATCH 1/5] NUTCH-1806 Delegate processing of URL domains to crawler commons - add unit test for URLs without host/domain (cf. NUTCH-2450) --- src/test/org/apache/nutch/util/TestURLUtil.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index eaaf7d0c77..ad636048a1 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -71,6 +71,11 @@ public void testGetDomainName() throws Exception { // test non-ascii url = new URL("http://www.example.商業.tw"); Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url)); + + // test URL without host/authority + url = new URL("file:/path/index.html"); + Assert.assertNotNull(URLUtil.getDomainName(url)); + Assert.assertEquals("", URLUtil.getDomainName(url)); } @Test From bc2ae7e0c8d59c03ea8fe31a7043ed3bb9e6abb4 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 29 Apr 2024 12:06:14 +0200 Subject: [PATCH 2/5] NUTCH-1806 Delegate processing of URL domains to crawler commons - add unit tests for host names with trailing dot ("www.apache.org.") --- src/test/org/apache/nutch/util/TestURLUtil.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index ad636048a1..e074b6329e 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -32,6 +32,10 @@ public void testGetDomainName() throws Exception { url = new URL("http://lucene.apache.org/nutch"); Assert.assertEquals("apache.org", URLUtil.getDomainName(url)); + // hostname with trailing dot + url = new URL("https://lucene.apache.org./nutch"); + Assert.assertEquals("apache.org", URLUtil.getDomainName(url)); + url = 
new URL("http://en.wikipedia.org/wiki/Java_coffee"); Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url)); @@ -85,6 +89,10 @@ public void testGetDomainSuffix() throws Exception { url = new URL("http://lucene.apache.org/nutch"); Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain()); + // hostname with trailing dot + url = new URL("https://lucene.apache.org./nutch"); + Assert.assertEquals("org", URLUtil.getDomainName(url)); + url = new URL("http://140.211.11.130/foundation/contributing.html"); Assert.assertNull(URLUtil.getDomainSuffix(url)); From e0fa35729b6d046f3c056872bc84433bd5a94d77 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 27 Apr 2024 13:18:55 +0200 Subject: [PATCH 3/5] NUTCH-1806 Delegate processing of URL domains to crawler commons - use methods from crawler-commons' EffectiveTldFinder in URLUtil replacing classes and methods from the org.apache.nutch.util.domain package - adapt and extend unit tests - add tests for URLUtil.getTopLevelDomainName(url) - changes to the public suffix list since 2014 ("xyz" is now a public suffix / ICANN suffix) - minor API changes - URLUtil.getDomainName(url) returns the host name in case no valid public suffix is found - for Unicode suffixes and TLDs the methods URLUtil.getDomainSuffix(url) resp. 
URLUtil.getTopLevelDomainName(url) now return the ASCII representation - complete Javadoc --- src/java/org/apache/nutch/util/URLUtil.java | 213 ++++++++++-------- .../org/apache/nutch/util/TestURLUtil.java | 76 +++++-- 2 files changed, 181 insertions(+), 108 deletions(-) diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index 9ec0d35a8b..b439e5f402 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -23,8 +23,7 @@ import java.util.Locale; import java.util.regex.Pattern; -import org.apache.nutch.util.domain.DomainSuffix; -import org.apache.nutch.util.domain.DomainSuffixes; +import crawlercommons.domains.EffectiveTldFinder; /** Utility class for URL analysis */ public class URLUtil { @@ -85,72 +84,89 @@ static URL fixPureQueryTargets(URL base, String target) .compile("(\\d{1,3}\\.){3}(\\d{1,3})"); /** - * Get the domain name of the url. The domain name of a url is the - * substring of the url's hostname, w/o subdomain names. As an example
+ * Get the domain name of the URL. The domain name of a URL is the substring + * of the URL's hostname, w/o subdomain names. As an example
* - * getDomainName(new URL(http://lucene.apache.org/)) + * getDomainName(new URL("https://lucene.apache.org/")) *
* will return
* apache.org - * @param url A input {@link URL} to extract the domain from + * + * Special cases: + *
    + *
  • if the hostname does not end in a valid domain suffix, the entire + * hostname is returned.
  • + *
  • for URLs without a hostname, an empty string is returned.
  • + *
+ * + * Valid domain suffixes are taken from the + * https://publicsuffix.org/list/public_suffix_list.dat and are compared + * using + * crawler-commons' EffectiveTldFinder. Only ICANN domain suffixes are + * used. Because EffectiveTldFinder loads the public suffix list as file + * "effective_tld_names.dat" from the Java classpath, it's possible to use + * a specific version of the public suffix list (e.g., the most recent one) by + * placing the public suffix list with the name "effective_tld_names.dat" in + * Nutch's conf/ folder. + * + * See {@link EffectiveTldFinder#getAssignedDomain(String, boolean, boolean)} + * + * @param url + * input {@link URL} to extract the domain from * @return the domain name string - * */ + */ public static String getDomainName(URL url) { - DomainSuffixes tlds = DomainSuffixes.getInstance(); String host = url.getHost(); - // it seems that java returns hostnames ending with . - if (host.endsWith(".")) + + // strip trailing dot in host names + if (host.length() > 0 && host.charAt(host.length() - 1) == '.') { host = host.substring(0, host.length() - 1); - if (IP_PATTERN.matcher(host).matches()) - return host; - - int index = 0; - String candidate = host; - for (; index >= 0;) { - index = candidate.indexOf('.'); - String subCandidate = candidate.substring(index + 1); - if (tlds.isDomainSuffix(subCandidate)) { - return candidate; - } - candidate = subCandidate; } - return candidate; + return EffectiveTldFinder.getAssignedDomain(host, false, true); } /** - * Returns the domain name of the url. The domain name of a url is the - * substring of the url's hostname, w/o subdomain names. As an example
+ * Returns the domain name of the URL. The domain name of a URL is the + * substring of the URL's hostname, w/o subdomain names. As an example
* - * getDomainName(conf, new http://lucene.apache.org/) + * getDomainName("https://lucene.apache.org/") *
* will return
- * apache.org - * @param url A input url string to extract the domain from + * apache.org + * + * See {@link #getDomainName(URL)} for more information. + * + * @param url + * input URL string to extract the domain from * @return the domain name - * @throws MalformedURLException if the input url is malformed + * @throws MalformedURLException + * if the input URL is malformed */ public static String getDomainName(String url) throws MalformedURLException { return getDomainName(new URL(url)); } /** - * Returns the top level domain name of the url. The top level domain name of - * a url is the substring of the url's hostname, w/o subdomain names. As an + * Returns the top-level domain name of the URL. The top level domain name of + * a URL is the substring of the URL's hostname, w/o subdomain names. As an * example
* - * getTopLevelDomainName(conf, new http://lucene.apache.org/) + * getTopLevelDomainName(new URL("https://www.example.co.uk/")) *
* will return
- * org + * uk * - * @param url A input {@link URL} to extract the top - * level domain name from - * @return the top level domain name - * @throws MalformedURLException if the input url is malformed + * In case of internationalized top-level domains, the ASCII representation is + * returned. + * + * @param url + * input {@link URL} to extract the top level domain name from + * @return the top level domain name or the empty string if there is none */ - public static String getTopLevelDomainName(URL url) - throws MalformedURLException { - String suffix = getDomainSuffix(url).toString(); + public static String getTopLevelDomainName(URL url) { + String suffix = getDomainSuffix(url); int idx = suffix.lastIndexOf("."); if (idx != -1) { return suffix.substring(idx + 1); @@ -160,19 +176,23 @@ public static String getTopLevelDomainName(URL url) } /** - * Returns the top level domain name of the url. The top level domain name of - * a url is the substring of the url's hostname, w/o subdomain names. As an + * Returns the top-level domain name of the URL. The top level domain name of + * a URL is the substring of the URL's hostname, w/o subdomain names. As an * example
* - * getTopLevelDomainName(conf, new http://lucene.apache.org/) + * getTopLevelDomainName("https://www.example.co.uk/") *
* will return
- * org + * uk + * + * In case of internationalized top-level domains, the ASCII representation is + * returned. * - * @param url A input url string to extract the top - * level domain name from - * @return the top level domain name - * @throws MalformedURLException if the input url is malformed + * @param url + * input URL string to extract the top level domain name from + * @return the top level domain name or the empty string if there is none + * @throws MalformedURLException + * if the input URL is malformed */ public static String getTopLevelDomainName(String url) throws MalformedURLException { @@ -180,12 +200,16 @@ public static String getTopLevelDomainName(String url) } /** - * Returns whether the given urls have the same domain name. As an example,
- * isSameDomain(new URL("http://lucene.apache.org") - * , new URL("http://people.apache.org/")) - *
will return true.
- * @param url1 first {@link URL} to compare domain name - * @param url2 second {@link URL} to compare domain name + * Returns whether the given URLs have the same domain name. As an example, + *
+ * isSameDomain(new URL("http://lucene.apache.org") + * , new URL("http://people.apache.org/")) + *
will return true. + * + * @param url1 + * first {@link URL} to compare domain name + * @param url2 + * second {@link URL} to compare domain name * * @return true if the domain names are equal */ @@ -194,14 +218,19 @@ public static boolean isSameDomainName(URL url1, URL url2) { } /** - * Returns whether the given urls have the same domain name. As an example,
- * isSameDomain("http://lucene.apache.org" - * ,"http://people.apache.org/") - *
will return true.
- * @param url1 first url string to compare domain name - * @param url2 second url string to compare domain name + * Returns whether the given URLs have the same domain name. As an example, + *
+ * isSameDomain("http://lucene.apache.org" + * ,"http://people.apache.org/") + *
will return true. + * + * @param url1 + * first URL string to compare domain name + * @param url2 + * second URL string to compare domain name * @return true if the domain names are equal - * @throws MalformedURLException if either of the input urls are malformed + * @throws MalformedURLException + * if any of the input URLs are malformed */ public static boolean isSameDomainName(String url1, String url2) throws MalformedURLException { @@ -209,39 +238,48 @@ public static boolean isSameDomainName(String url1, String url2) } /** - * Returns the {@link DomainSuffix} corresponding to the last public part of - * the hostname - * @param url a {@link URL} to extract the domain suffix from - * @return a {@link org.apache.nutch.util.domain.DomainSuffix} + * Returns the public suffix corresponding to the last public part of the + * hostname. + * + * In case of internationalized domain suffixes, the ASCII representation is + * returned. For the URL https://www.taiuru.māori.nz/ the suffix + * xn--mori-qsa.nz is returned. 
+ * + * @param url + * a {@link URL} to extract the domain suffix from + * @return the domain suffix or the empty string if there is none */ - public static DomainSuffix getDomainSuffix(URL url) { - DomainSuffixes tlds = DomainSuffixes.getInstance(); + public static String getDomainSuffix(URL url) { String host = url.getHost(); - if (IP_PATTERN.matcher(host).matches()) - return null; - int index = 0; - String candidate = host; - for (; index >= 0;) { - index = candidate.indexOf('.'); - String subCandidate = candidate.substring(index + 1); - DomainSuffix d = tlds.get(subCandidate); - if (d != null) { - return d; - } - candidate = subCandidate; + // strip trailing dot in host names + if (host.length() > 0 && host.charAt(host.length() - 1) == '.') { + host = host.substring(0, host.length() - 1); } - return null; + + EffectiveTldFinder.EffectiveTLD suffix = EffectiveTldFinder.getEffectiveTLD(host, true); + if (suffix != null) { + return suffix.getDomain(); + } + + return ""; } /** - * Returns the {@link DomainSuffix} corresponding to the last public part of - * the hostname - * @param url a {@link URL} to extract the domain suffix from - * @return a {@link org.apache.nutch.util.domain.DomainSuffix} - * @throws MalformedURLException if the input url string is malformed + * Returns the domain suffix corresponding to the last public part of the + * hostname. + * + * In case of internationalized domain suffixes, the ASCII representation is + * returned. For the URL https://www.taiuru.māori.nz/ the suffix + * xn--mori-qsa.nz is returned. 
+ * + * @param url + * a {@link URL} to extract the domain suffix from + * @return the domain suffix or the empty string if there is none + * @throws MalformedURLException + * if the input URL string is malformed */ - public static DomainSuffix getDomainSuffix(String url) + public static String getDomainSuffix(String url) throws MalformedURLException { return getDomainSuffix(new URL(url)); } @@ -422,8 +460,7 @@ public static String chooseRepr(String src, String dst, boolean temp) { } /** - * Returns the lowercased hostname for the URL or null if the URL is not well-formed - * formed. + * Returns the lowercased hostname for the URL or null if the URL is not well-formed. * * @param url * The URL to check. diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index e074b6329e..36724c32f6 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -51,15 +51,19 @@ public void testGetDomainName() throws Exception { url = new URL("http://www.example.co.uk.com"); Assert.assertEquals("uk.com", URLUtil.getDomainName(url)); - // "nn" is not a tld + // "nn" is not a public suffix url = new URL("http://example.com.nn"); - Assert.assertEquals("nn", URLUtil.getDomainName(url)); + Assert.assertEquals("example.com.nn", URLUtil.getDomainName(url)); url = new URL("http://"); Assert.assertEquals("", URLUtil.getDomainName(url)); + /* + * "xyz" is an ICANN suffix since 2014, see + * https://www.iana.org/domains/root/db/xyz.html + */ url = new URL("http://www.edu.tr.xyz"); - Assert.assertEquals("xyz", URLUtil.getDomainName(url)); + Assert.assertEquals("tr.xyz", URLUtil.getDomainName(url)); url = new URL("http://www.example.c.se"); Assert.assertEquals("example.c.se", URLUtil.getDomainName(url)); @@ -87,54 +91,86 @@ public void testGetDomainSuffix() throws Exception { URL url = null; url = new URL("http://lucene.apache.org/nutch"); - Assert.assertEquals("org", 
URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("org", URLUtil.getDomainSuffix(url)); // hostname with trailing dot url = new URL("https://lucene.apache.org./nutch"); - Assert.assertEquals("org", URLUtil.getDomainName(url)); + Assert.assertEquals("org", URLUtil.getDomainSuffix(url)); url = new URL("http://140.211.11.130/foundation/contributing.html"); - Assert.assertNull(URLUtil.getDomainSuffix(url)); + Assert.assertEquals("", URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.co.uk:8080/index.html"); - Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url)); url = new URL("http://com"); - Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("com", URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.co.uk.com"); - Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("com", URLUtil.getDomainSuffix(url)); - // "nn" is not a tld + // "nn" is not a public suffix url = new URL("http://example.com.nn"); - Assert.assertNull(URLUtil.getDomainSuffix(url)); + Assert.assertEquals("", URLUtil.getDomainSuffix(url)); url = new URL("http://"); - Assert.assertNull(URLUtil.getDomainSuffix(url)); + Assert.assertEquals("", URLUtil.getDomainSuffix(url)); + /* + * "xyz" is an ICANN suffix since 2014, see + * https://www.iana.org/domains/root/db/xyz.html + */ url = new URL("http://www.edu.tr.xyz"); - Assert.assertNull(URLUtil.getDomainSuffix(url)); + Assert.assertEquals("xyz", URLUtil.getDomainSuffix(url)); url = new URL("http://subdomain.example.edu.tr"); - Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url)); url = new URL("http://subdomain.example.presse.fr"); - Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("fr", URLUtil.getDomainSuffix(url)); url = new 
URL("http://subdomain.example.presse.tr"); - Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("tr", URLUtil.getDomainSuffix(url)); // plc.co.im is listed as a domain suffix url = new URL("http://www.example.plc.co.im"); - Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url)); // 2000.hu is listed as a domain suffix url = new URL("http://www.example.2000.hu"); - Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); // test non-ascii url = new URL("http://www.example.商業.tw"); - Assert.assertEquals("商業.tw", URLUtil.getDomainSuffix(url).getDomain()); + Assert.assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); + } + + @Test + public void testGetTopLevelDomain() throws Exception { + URL url = null; + + url = new URL("http://lucene.apache.org/nutch"); + Assert.assertEquals("org", URLUtil.getTopLevelDomainName(url)); + + // hostname with trailing dot + url = new URL("https://lucene.apache.org./nutch"); + Assert.assertEquals("org", URLUtil.getTopLevelDomainName(url)); + + url = new URL("http://140.211.11.130/foundation/contributing.html"); + Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + + url = new URL("http://www.example.co.uk:8080/index.html"); + Assert.assertEquals("uk", URLUtil.getTopLevelDomainName(url)); + + // "nn" is not a public suffix + url = new URL("http://example.com.nn"); + Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + + url = new URL("http://"); + Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + + url = new URL("http://nic.삼성/"); + Assert.assertEquals("xn--cg4bki", URLUtil.getTopLevelDomainName(url)); } @Test @@ -283,7 +319,7 @@ public void testToASCII() throws Exception { @Test public void testFileProtocol() throws Exception { - // keep one single slash NUTCH-XXX + // keep one single slash 
NUTCH-1483 Assert.assertEquals("file:/path/file.html", URLUtil.toASCII("file:/path/file.html")); Assert.assertEquals("file:/path/file.html", From d43f5793fd0e9681f8bfc12b046de93e3bcf6fa6 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Mon, 29 Apr 2024 14:55:02 +0200 Subject: [PATCH 4/5] NUTCH-1806 Delegate processing of URL domains to crawler commons NUTCH-1942 Remove TopLevelDomain - update DomainStatistics, TLDIndexingFilter and domain URL filters to use the updated methods in URLUtil - remove TLDScoringFilter - remove package org.apache.nutch.util.domain - move DomainStatistics to org.apache.nutch.util - remove configuration files of domain utils --- conf/domain-suffixes.xml.template | 4428 ----------------- conf/domain-suffixes.xsd | 130 - default.properties | 1 - src/bin/nutch | 2 +- .../util/{domain => }/DomainStatistics.java | 7 +- .../nutch/util/domain/DomainSuffix.java | 78 - .../nutch/util/domain/DomainSuffixes.java | 91 - .../util/domain/DomainSuffixesReader.java | 164 - .../nutch/util/domain/TopLevelDomain.java | 66 - .../nutch/util/domain/package-info.java | 28 - .../nutch/indexer/tld/TLDIndexingFilter.java | 13 +- .../nutch/scoring/tld/TLDScoringFilter.java | 60 - .../nutch/scoring/tld/package-info.java | 19 - .../urlfilter/domain/DomainURLFilter.java | 9 +- .../DomainDenylistURLFilter.java | 9 +- 15 files changed, 16 insertions(+), 5089 deletions(-) delete mode 100644 conf/domain-suffixes.xml.template delete mode 100644 conf/domain-suffixes.xsd rename src/java/org/apache/nutch/util/{domain => }/DomainStatistics.java (97%) delete mode 100644 src/java/org/apache/nutch/util/domain/DomainSuffix.java delete mode 100644 src/java/org/apache/nutch/util/domain/DomainSuffixes.java delete mode 100644 src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java delete mode 100644 src/java/org/apache/nutch/util/domain/TopLevelDomain.java delete mode 100644 src/java/org/apache/nutch/util/domain/package-info.java delete mode 100644 
src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java delete mode 100644 src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java diff --git a/conf/domain-suffixes.xml.template b/conf/domain-suffixes.xml.template deleted file mode 100644 index 096309b90a..0000000000 --- a/conf/domain-suffixes.xml.template +++ /dev/null @@ -1,4428 +0,0 @@ - - - - - - - - - - - - INFRASTRUCTURE - - (from http://en.wikipedia.org/wiki/.root) - vrsn-end-of-zone-marker-dummy-record.root is a domain name - listed in the DNS root zone as a diagnostic marker, whose - presence demonstrates the root zone was not truncated upon - loading by a root nameserver. It could be argued it represents - a top-level domain of .root, although technically no such - delegation exists. - - - - - INFRASTRUCTURE - - (from http://en.wikipedia.org/wiki/.arpa) .arpa is an Internet - top-level domain (TLD) used exclusively for - Internet-infrastructure purposes. It does not function as a - normal TLD where websites are registered, but rather as a - meta-TLD used to look up addresses, and for other purposes. 
- - - - - - - - SPONSORED - for the air transport industry - - - - UNSPONSORED - for business use - - - - SPONSORED - for Catalan language/culture - - - - UNSPONSORED - - for commercial organizations, but unrestricted - - - - - SPONSORED - for cooperatives - - - - UNSPONSORED - 1.0 - - for post-secondary educational establishments - - - - - UNSPONSORED - - for governments and their agencies in the United States - - - - - UNSPONSORED - - for informational sites, but unrestricted - - - - - UNSPONSORED - - for international organizations established by treaty - - - - - SPONSORED - for employment-related sites - - - - UNSPONSORED - for the US military - - - - SPONSORED - for sites catering to mobile devices - - - - SPONSORED - for museums - - - - UNSPONSORED - for families and individuals - - - - UNSPONSORED - - originally for network infrastructures, now unrestricted - - - - - UNSPONSORED - - originally for organizations not clearly falling within the - other gTLDs, now unrestricted - - - - - SPONSORED - for certain professions - - - - SPONSORED - - for travel agents, airlines, hoteliers, tourism bureaus, etc. - - - - - - - STARTUP - for the Asian community - - - - PROPOSED - for postal services - - - - STARTUP - - for services involving connections between the telephone - network and the Internet - - - - - PROPOSED - for geographically related sites - - - - PROPOSED - for Galicia, a country within Spain - - - - PROPOSED - for Wales, a country within the UK - - - - PROPOSED - for Scotland, a country within the UK - - - - PROPOSED - for websites designed for children - - - - PROPOSED - for websites designed for children - - - - PROPOSED - http://en.wikipedia.org/wiki/.mail - - - - PROPOSED - For Web sites of all sorts - - - - PROPOSED - For Adult entertainment sites - - - - - DELETED - - for NATO sites and operations. 
Replaced by .int - - - - - - PSEUDO_DOMAIN - - identifying a hostname not connected directly to the Internet, - but a bitnet network - - - - - PSEUDO_DOMAIN - - identifying a hostname not connected directly to the Internet, - but a csnet network - - - - - PSEUDO_DOMAIN - - identifying a hostname not connected directly to the Internet, - but a bitnet network - - - - - PSEUDO_DOMAIN - - .local is a pseudo top-level domain used by Apple, Inc.'s - Bonjour protocol. - - - - - PSEUDO_DOMAIN - alias of .local - - - - PSEUDO_DOMAIN - - designates an anonymous or pseudonymous address reachable via - the Tor network. - - - - - - - Ascension Island - - - - Andorra - - - - United Arab Emirates - - - - Afghanistan - - - - Antigua and Barbuda - - - - Anguilla - - - - Albania - - - - Armenia - - - - Netherlands Antilles - - - - Angola - - - - Antarctica - - - - Argentina - - - - American Samoa - - - - Austria - - - - Australia - - - - Aruba - - - - Aland Islands - - - - Azerbaijan - - - - Bosnia and Herzegovina - - - - Barbados - - - - Bangladesh - - - - Belgium - - - - Burkina Faso - - - - Bulgaria - - - - Bahrain - - - - Burundi - - - - Benin - - - - Bermuda - - - - Brunei - - - - Bolivia - - - - Brazil - - - - Bahamas - - - - Bhutan - - - - Burma - NOT_IN_USE - - not in use since re-naming of country to Myanmar, see .mm - - - - - Bouvet Island - NOT_IN_USE - not in use; no registrations - - - - Botswana - - - - Belarus - - - - Belize - - - - Canada - - - - Cocos Keeling Islands - - - - Democratic Republic of the Congo - formerly .zr - Zaire - - - - Central African Republic - - - - Republic of the Congo - - - - Switzerland - - - - Côte d'Ivoire - Ivory Coast - - - - Cook Islands - - - - Chile - - - - Cameroon - - - - People s Republic of China - - - - Colombia - - - - Costa Rica - - - - Serbia and Montenegro - DELETED - - formerly .yu - Yugoslavia; description: on June 3, 2006, - Montenegro declared independence, thus dissolving the state - union) (.cs code not assigned; no 
DNS) (.cs code previously - used for Czechoslovakia - - - - - Cuba - - - - Cape Verde - - - - Christmas Island - - - - Cyprus - - - - Czech Republic - - - - German Democratic Republic(East Germany) - DELETED - deleted in 1990 - - - - Germany - - - - Djibouti - - - - Denmark - - - - Dominica - - - - Dominican Republic - - - - Algeria - - - - Ecuador - - - - Estonia - - - - Egypt - - - - Western Sahara - NOT_IN_USE - not assigned; no DNS - - - - Eritrea - - - - Spain - - - - Ethiopia - - - - European Union - - code "exceptionally reserved" by ISO 3166-1 - - - - - Finland - - - - Fiji - - - - Falkland Islands - - - - Federated States of Micronesia - - - - Faroe Islands - - - - France - - - - Gabon - - - - United Kingdom - - Reserved domain by IANA; deprecated – see .uk - - - - - Grenada - - - - Georgia - - - - French Guiana - - - - Guernsey - - - - Ghana - - - - Gibraltar - - - - Greenland - - - - Gambia - - - - Guinea - - - - Guadeloupe - - - - Equatorial Guinea - - - - Greece - - - - South Georgia and the South Sandwich Islands - - - - Guatemala - - - - Guam - - - - Guinea Bissau - - - - Guyana - - - - Hong Kong - - - - Heard Island and McDonald Islands - - - - Honduras - - - - Croatia - - - - Haiti - - - - Hungary - - - - Indonesia - - - - Ireland - - - - Israel - - - - Isle of Man - - - - India - - - - British Indian Ocean Territory - - - - Iraq - - - - Iran - - - - Iceland - - - - Italy - - - - Jersey - - - - Jamaica - - - - Jordan - - - - Japan - - - - Kenya - - - - Kyrgyzstan - - - - Cambodia - - - - Kiribati - - - - Comoros - - - - Saint Kitts and Nevis - - - - North Korea - NOT_IN_USE - - not assigned; no DNS - - - - South Korea - - - - Kuwait - - - - Cayman Islands - - - - Kazakhstan - - - - Laos - - - - Lebanon - - - - Saint Lucia - - - - Liechtenstein - - - - Sri Lanka - - - - Liberia - - - - Lesotho - - - - Lithuania - - - - Luxembourg - - - - Latvia - - - - Libya - - - - Morocco - - - - Monaco - - - - Moldova - - - - Montenegro - - - - Madagascar - - - - 
Marshall Islands - - - - Republic of Macedonia - - - - Mali - - - - Myanmar - formerly .bu - Burma - - - - Mongolia - - - - Macau - - - - Northern Mariana Islands - - - - Martinique - - - - Mauritania - - - - Montserrat - - - - Malta - - - - Mauritius - - - - Maldives - - - - Malawi - - - - Mexico - - - - Malaysia - - - - Mozambique - - - - Namibia - - - - New Caledonia - - - - Niger - - - - Norfolk Island - - - - Nigeria - - - - Nicaragua - - - - Netherlands - - - - Norway - - - - Nepal - - - - Nauru - - - - Niue - - - - New Zealand - - - - Oman - - - - Panama - - - - Peru - - - - French Polynesia - - - - Papua New Guinea - - - - Philippines - - - - Pakistan - - - - Poland - - - - Saint Pierre and Miquelon - - - - Pitcairn Islands - - - - Puerto Rico - - - - Palestinian territories - - - - Portugal - - - - Palau - - - - Paraguay - - - - Qatar - - - - Réunion - - - - Romania - - - - Serbia - - - - Russia - - - - Rwanda - - - - Saudi Arabia - - - - Solomon Islands - - - - Seychelles - - - - Sudan - - - - Sweden - - - - Singapore - - - - Saint Helena - - - - Slovenia - - - - Svalbard and Jan Mayen Islands - NOT_IN_USE - not in use; no registrations - - - - Slovakia - - - - Sierra Leone - - - - San Marino - - - - Senegal - - - - Somalia - - - - Suriname - - - - São Tomé and Príncipe - - - - Soviet Union - DELETED - - deprecated; being phased out; code "transitionally reserved" - by ISO 3166-1 - - - - - El Salvador - - - - Syria - - - - Swaziland - - - - Turks and Caicos Islands - - - - Chad - - - - French Southern Territories - - - - Togo - - - - Thailand - - - - Tajikistan - - - - Tokelau - - - - East Timor - formerly .tp - - - - Turkmenistan - - - - Tunisia - - - - Tonga - - - - East Timor - DELETED - - deprecated - use .tl; code "transitionally reserved" by ISO - 3166-1 - - - - - Turkey - - - - Trinidad and Tobago - - - - Tuvalu - - - - Republic of China - Taiwan - - - - Tanzania - - - - Ukraine - - - - Uganda - - - - United Kingdom - - code "exceptionally 
reserved" by ISO 3166-1 (see also .gb) - - - - - United States Minor Outlying Islands - DELETED - see http://en.wikipedia.org/wiki/.um - - - - United States - - - - Uruguay - - - - Uzbekistan - - - - Vatican City - - - - Saint Vincent and the Grenadines - - - - Venezuela - - - - British Virgin Islands - - - - United States Virgin Islands - - - - Vietnam - - - - Vanuatu - - - - Wallis and Futuna - - - - Samoa - formerly Western Samoa - - - - Yemen - - - - Mayotte - - - - Yugoslavia - - subsequently renamed Serbia and Montenegro (code officially - replaced by .cs (see above) but still used; code - "transitionally reserved" by ISO 3166-1) - - - - - South Africa - - - - Zambia - - - - Zaire - DELETED - replaced by .cd - - - - Zimbabwe - - - - - - - - - - - - - - - - - - - - - - - - - - - - DELETED - - - DELETED - - - DELETED - - - DELETED - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/conf/domain-suffixes.xsd b/conf/domain-suffixes.xsd deleted file mode 100644 index 67c9bd0e7e..0000000000 --- a/conf/domain-suffixes.xsd +++ /dev/null @@ -1,130 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/default.properties b/default.properties index 385e53e577..3f4b135010 100644 --- a/default.properties +++ b/default.properties @@ -130,7 +130,6 @@ plugins.scoring=\ org.apache.nutch.scoring.opic*:\ org.apache.nutch.scoring.orphan*:\ org.apache.nutch.scoring.similarity*:\ - org.apache.nutch.scoring.tld*:\ org.apache.nutch.scoring.urlmeta*\ org.apache.nutch.scoring.metadata* diff --git a/src/bin/nutch b/src/bin/nutch index 561c79e778..f2527f5687 100755 --- a/src/bin/nutch +++ b/src/bin/nutch @@ -269,7 +269,7 @@ elif [ "$COMMAND" = "filterchecker" ] ; then elif [ "$COMMAND" = "normalizerchecker" ] ; then CLASS=org.apache.nutch.net.URLNormalizerChecker elif [ "$COMMAND" = "domainstats" ] ; then - CLASS=org.apache.nutch.util.domain.DomainStatistics + CLASS=org.apache.nutch.util.DomainStatistics elif [ 
"$COMMAND" = "protocolstats" ] ; then CLASS=org.apache.nutch.util.ProtocolStatusStatistics elif [ "$COMMAND" = "crawlcomplete" ] ; then diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/DomainStatistics.java similarity index 97% rename from src/java/org/apache/nutch/util/domain/DomainStatistics.java rename to src/java/org/apache/nutch/util/DomainStatistics.java index 1843c424d1..0a74f02310 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/DomainStatistics.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.nutch.util.domain; +package org.apache.nutch.util; import java.io.IOException; import java.lang.invoke.MethodHandles; @@ -38,9 +38,6 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.URLUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -192,7 +189,7 @@ public void map(Text urlText, CrawlDatum datum, Context context) out = URLUtil.getDomainName(url); break; case MODE_SUFFIX: - out = URLUtil.getDomainSuffix(url).getDomain(); + out = URLUtil.getDomainSuffix(url); break; case MODE_TLD: out = URLUtil.getTopLevelDomainName(url); diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java deleted file mode 100644 index 05162aaf7a..0000000000 --- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.util.domain; - -/** - * This class represents the last part of the host name, which is operated by - * authoritives, not individuals. This information is needed to find the domain - * name of a host. The domain name of a host is defined to be the last part - * before the domain suffix, w/o subdomain names. As an example the domain name - * of
- * http://lucene.apache.org/ - *
- * is apache.org
- * This class holds three fields, domain field represents the - * suffix (such as "co.uk") boost is a float for boosting score - * of url's with this suffix status field represents domain's - * status - * - * @author Enis Soztutar <enis.soz.nutch@gmail.com> - * @see TopLevelDomain for info please see conf/domain-suffixes.xml - */ -public class DomainSuffix { - - /** - * Enumeration of the status of the tld. Please see domain-suffixes.xml. - */ - public enum Status { - INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED - }; - - private String domain; - private Status status; - private float boost; - - public static final float DEFAULT_BOOST = 1.0f; - public static final Status DEFAULT_STATUS = Status.IN_USE; - - public DomainSuffix(String domain, Status status, float boost) { - this.domain = domain; - this.status = status; - this.boost = boost; - } - - public DomainSuffix(String domain) { - this(domain, DEFAULT_STATUS, DEFAULT_BOOST); - } - - public String getDomain() { - return domain; - } - - public Status getStatus() { - return status; - } - - public float getBoost() { - return boost; - } - - @Override - public String toString() { - return domain; - } -} diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java deleted file mode 100644 index 455f367126..0000000000 --- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.nutch.util.domain; - -import java.io.InputStream; -import java.lang.invoke.MethodHandles; -import java.util.HashMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.util.StringUtils; - -/** - * Storage class for DomainSuffix objects Note: this class is - * singleton - * - * @author Enis Soztutar <enis.soz.nutch@gmail.com> - */ -public class DomainSuffixes { - private static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); - - private HashMap domains = new HashMap<>(); - - private static DomainSuffixes instance; - - /** private ctor */ - private DomainSuffixes() { - String file = "domain-suffixes.xml"; - - try (InputStream input = this.getClass().getClassLoader() - .getResourceAsStream(file)) { - new DomainSuffixesReader().read(this, input); - } catch (Exception ex) { - LOG.warn(StringUtils.stringifyException(ex)); - } - } - - /** - * Singleton instance, lazy instantination - * - * @return returns the domain suffix instance - */ - public static DomainSuffixes getInstance() { - if (instance == null) { - instance = new DomainSuffixes(); - } - return instance; - } - - void addDomainSuffix(DomainSuffix tld) { - domains.put(tld.getDomain(), tld); - } - - /** - * Return whether the extension is a registered domain entry - * @param extension a String extension - * @return true if input is a registered domain entry, false otherwise - */ - public boolean isDomainSuffix(String extension) { - return domains.containsKey(extension); - } - - /** - * Return the {@link 
DomainSuffix} object for the extension, if extension is a - * top level domain returned object will be an instance of - * {@link TopLevelDomain} - * - * @param extension - * of the domain - * @return {@link DomainSuffix} - */ - public DomainSuffix get(String extension) { - return domains.get(extension); - } - -} diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java deleted file mode 100644 index 69e212dccf..0000000000 --- a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.util.domain; - -import java.io.IOException; -import java.io.InputStream; -import java.lang.invoke.MethodHandles; - -import javax.xml.parsers.DocumentBuilder; -import javax.xml.parsers.DocumentBuilderFactory; -import javax.xml.parsers.ParserConfigurationException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.util.StringUtils; -import org.apache.nutch.util.domain.DomainSuffix.Status; -import org.apache.nutch.util.domain.TopLevelDomain.Type; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; - -/** - * For parsing xml files containing domain suffix definitions. Parsed xml files - * should validate against domain-suffixes.xsd - * - * @author Enis Soztutar <enis.soz.nutch@gmail.com> - */ -class DomainSuffixesReader { - - private static final Logger LOG = LoggerFactory - .getLogger(MethodHandles.lookup().lookupClass()); - - void read(DomainSuffixes tldEntries, InputStream input) throws IOException { - try { - - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setIgnoringComments(true); - DocumentBuilder builder = factory.newDocumentBuilder(); - Document document = builder.parse(new InputSource(input)); - - Element root = document.getDocumentElement(); - - if (root != null && root.getTagName().equals("domains")) { - - Element tlds = (Element) root.getElementsByTagName("tlds").item(0); - Element suffixes = (Element) root.getElementsByTagName("suffixes") - .item(0); - - // read tlds - readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds") - .item(0)); - readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds") - .item(0)); - readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds") - .item(0)); - - readSuffixes(tldEntries, suffixes); - } else { - throw new IOException("xml file is not valid"); - } - } catch (ParserConfigurationException 
ex) { - LOG.warn(StringUtils.stringifyException(ex)); - throw new IOException(ex.getMessage()); - } catch (SAXException ex) { - LOG.warn(StringUtils.stringifyException(ex)); - throw new IOException(ex.getMessage()); - } - } - - void readITLDs(DomainSuffixes tldEntries, Element el) { - NodeList children = el.getElementsByTagName("tld"); - for (int i = 0; i < children.getLength(); i++) { - tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), - Type.INFRASTRUCTURE)); - } - } - - void readGTLDs(DomainSuffixes tldEntries, Element el) { - NodeList children = el.getElementsByTagName("tld"); - for (int i = 0; i < children.getLength(); i++) { - tldEntries.addDomainSuffix(readGTLD((Element) children.item(i), - Type.GENERIC)); - } - } - - void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException { - NodeList children = el.getElementsByTagName("tld"); - for (int i = 0; i < children.getLength(); i++) { - tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i))); - } - } - - TopLevelDomain readGTLD(Element el, Type type) { - String domain = el.getAttribute("domain"); - Status status = readStatus(el); - float boost = readBoost(el); - return new TopLevelDomain(domain, type, status, boost); - } - - TopLevelDomain readCCTLD(Element el) throws IOException { - String domain = el.getAttribute("domain"); - Status status = readStatus(el); - float boost = readBoost(el); - String countryName = readCountryName(el); - return new TopLevelDomain(domain, status, boost, countryName); - } - - /** read optional field status */ - Status readStatus(Element el) { - NodeList list = el.getElementsByTagName("status"); - if (list == null || list.getLength() == 0) - return DomainSuffix.DEFAULT_STATUS; - return Status.valueOf(list.item(0).getFirstChild().getNodeValue()); - } - - /** read optional field boost */ - float readBoost(Element el) { - NodeList list = el.getElementsByTagName("boost"); - if (list == null || list.getLength() == 0) - return 
DomainSuffix.DEFAULT_BOOST; - return Float.parseFloat(list.item(0).getFirstChild().getNodeValue()); - } - - /** - * read field countryname - */ - String readCountryName(Element el) throws IOException { - NodeList list = el.getElementsByTagName("country"); - if (list == null || list.getLength() == 0) - throw new IOException("Country name should be given"); - return list.item(0).getNodeValue(); - } - - void readSuffixes(DomainSuffixes tldEntries, Element el) { - NodeList children = el.getElementsByTagName("suffix"); - for (int i = 0; i < children.getLength(); i++) { - tldEntries.addDomainSuffix(readSuffix((Element) children.item(i))); - } - } - - DomainSuffix readSuffix(Element el) { - String domain = el.getAttribute("domain"); - Status status = readStatus(el); - float boost = readBoost(el); - return new DomainSuffix(domain, status, boost); - } - -} diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java deleted file mode 100644 index 2e9cddb5b3..0000000000 --- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.util.domain; - -/** - * (From wikipedia) A top-level domain (TLD) is the last part of an Internet - * domain name; that is, the letters which follow the final dot of any domain - * name. For example, in the domain name www.website.com, the - * top-level domain is com. - * - * @author Enis Soztutar <enis.soz.nutch@gmail.com> - * - * @see iana.org - * - * @see - * Top-level_domain - */ -public class TopLevelDomain extends DomainSuffix { - - public enum Type { - INFRASTRUCTURE, GENERIC, COUNTRY - }; - - private Type type; - private String countryName = null; - - public TopLevelDomain(String domain, Type type, Status status, float boost) { - super(domain, status, boost); - this.type = type; - } - - public TopLevelDomain(String domain, Status status, float boost, - String countryName) { - super(domain, status, boost); - this.type = Type.COUNTRY; - this.countryName = countryName; - } - - public Type getType() { - return type; - } - - /** - * Returns the country name if TLD is Country Code TLD - * - * @return country name or null - */ - public String getCountryName() { - return countryName; - } - -} diff --git a/src/java/org/apache/nutch/util/domain/package-info.java b/src/java/org/apache/nutch/util/domain/package-info.java deleted file mode 100644 index 6a799a9f1d..0000000000 --- a/src/java/org/apache/nutch/util/domain/package-info.java +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * Classes for domain name analysis. for information refer to - * following urls : - * - */ -package org.apache.nutch.util.domain; diff --git a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java index 296124d56f..0637e9136f 100644 --- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java +++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java @@ -30,10 +30,15 @@ import org.apache.nutch.indexer.NutchDocument; import org.apache.nutch.parse.Parse; import org.apache.nutch.util.URLUtil; -import org.apache.nutch.util.domain.DomainSuffix; /** - * Adds the top-level domain extensions to the index + * Adds the public suffix (aka. effective top-level domain) to the index using + * the field name "tld". + * + *

+ * For the URL https://www.example.co.uk/ the public suffix is + * co.uk. See also {@link URLUtil#getDomainSuffix(URL)}. + *

*/ public class TLDIndexingFilter implements IndexingFilter { private static final Logger LOG = LoggerFactory @@ -47,9 +52,9 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, try { URL url = new URL(urlText.toString()); - DomainSuffix d = URLUtil.getDomainSuffix(url); + String domain = URLUtil.getDomainSuffix(url); - doc.add("tld", d.getDomain()); + doc.add("tld", domain); } catch (Exception ex) { LOG.warn(ex.toString()); diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java deleted file mode 100644 index 5f3080912c..0000000000 --- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.nutch.scoring.tld; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.Text; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.Inlinks; -import org.apache.nutch.indexer.NutchDocument; -import org.apache.nutch.indexer.NutchField; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.scoring.AbstractScoringFilter; -import org.apache.nutch.scoring.ScoringFilterException; -import org.apache.nutch.util.domain.DomainSuffix; -import org.apache.nutch.util.domain.DomainSuffixes; - -/** - * Scoring filter to boost top-level domains (TLDs). - */ -public class TLDScoringFilter extends AbstractScoringFilter { - - private DomainSuffixes tldEntries; - - public TLDScoringFilter() { - tldEntries = DomainSuffixes.getInstance(); - } - - @Override - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, - CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) - throws ScoringFilterException { - - NutchField tlds = doc.getField("tld"); - float boost = 1.0f; - - if (tlds != null) { - for (Object tld : tlds.getValues()) { - DomainSuffix entry = tldEntries.get(tld.toString()); - if (entry != null) - boost *= entry.getBoost(); - } - } - return initScore * boost; - } - -} diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java deleted file mode 100644 index 6ab837301c..0000000000 --- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/package-info.java +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** Top Level Domain Scoring plugin. */ -package org.apache.nutch.scoring.tld; diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index c68750c0c4..9b0e9776de 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -33,7 +33,6 @@ import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.util.URLUtil; -import org.apache.nutch.util.domain.DomainSuffix; /** *

@@ -163,13 +162,9 @@ public String filter(String url) { try { // match for suffix, domain, and host in that order. more general will // override more specific - String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String domain = URLUtil.getDomainName(url); String host = URLUtil.getHost(url); - String suffix = null; - DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url); - if (domainSuffix != null) { - suffix = domainSuffix.getDomain(); - } + String suffix = URLUtil.getDomainSuffix(url); if (domainSet.contains(suffix) || domainSet.contains(domain) || domainSet.contains(host)) { diff --git a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java index 7b38bfca00..1e86426c76 100644 --- a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java +++ b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java @@ -33,7 +33,6 @@ import org.apache.nutch.plugin.Extension; import org.apache.nutch.plugin.PluginRepository; import org.apache.nutch.util.URLUtil; -import org.apache.nutch.util.domain.DomainSuffix; /** *

@@ -161,13 +160,9 @@ public String filter(String url) { try { // match for suffix, domain, and host in that order. more general will // override more specific - String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String domain = URLUtil.getDomainName(url); String host = URLUtil.getHost(url); - String suffix = null; - DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url); - if (domainSuffix != null) { - suffix = domainSuffix.getDomain(); - } + String suffix = URLUtil.getDomainSuffix(url); if (domainSet.contains(suffix) || domainSet.contains(domain) || domainSet.contains(host)) { From 40881e8b755e24d78a60689bd818058daba1a6fc Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 11 Sep 2024 14:07:29 +0200 Subject: [PATCH 5/5] NUTCH-1806 Delegate processing of URL domains to crawler commons - restore previous behavior of URLUtil.getDomainSuffix(...) and getTopLevelDomainName(...) to return null if there is no valid public suffix or TLD, respectively - unify spelling of top-level domain --- src/java/org/apache/nutch/util/URLUtil.java | 21 +++++++++++-------- .../org/apache/nutch/util/TestURLUtil.java | 12 +++++------ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index b439e5f402..0cfce1c650 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -149,7 +149,7 @@ public static String getDomainName(String url) throws MalformedURLException { } /** - * Returns the top-level domain name of the URL. The top level domain name of + * Returns the top-level domain name of the URL. The top-level domain name of * a URL is the substring of the URL's hostname, w/o subdomain names. As an * example
* @@ -162,11 +162,14 @@ public static String getDomainName(String url) throws MalformedURLException { * returned. * * @param url - * input {@link URL} to extract the top level domain name from - * @return the top level domain name or the empty string if there is none + * input {@link URL} to extract the top-level domain name from + * @return the top-level domain name or null if there is none */ public static String getTopLevelDomainName(URL url) { String suffix = getDomainSuffix(url); + if (suffix == null) { + return null; + } int idx = suffix.lastIndexOf("."); if (idx != -1) { return suffix.substring(idx + 1); @@ -176,7 +179,7 @@ public static String getTopLevelDomainName(URL url) { } /** - * Returns the top-level domain name of the URL. The top level domain name of + * Returns the top-level domain name of the URL. The top-level domain name of * a URL is the substring of the URL's hostname, w/o subdomain names. As an * example
* @@ -189,8 +192,8 @@ public static String getTopLevelDomainName(URL url) { * returned. * * @param url - * input URL string to extract the top level domain name from - * @return the top level domain name or the empty string if there is none + * input URL string to extract the top-level domain name from + * @return the top-level domain name or null if there is none * @throws MalformedURLException * if the input URL is malformed */ @@ -247,7 +250,7 @@ public static boolean isSameDomainName(String url1, String url2) * * @param url * a {@link URL} to extract the domain suffix from - * @return the domain suffix or the empty string if there is none + * @return the domain suffix or null if there is none */ public static String getDomainSuffix(URL url) { String host = url.getHost(); @@ -262,7 +265,7 @@ public static String getDomainSuffix(URL url) { return suffix.getDomain(); } - return ""; + return null; } /** @@ -275,7 +278,7 @@ public static String getDomainSuffix(URL url) { * * @param url * a {@link URL} to extract the domain suffix from - * @return the domain suffix or the empty string if there is none + * @return the domain suffix or null if there is none * @throws MalformedURLException * if the input URL string is malformed */ diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java index 36724c32f6..f8a0a88766 100644 --- a/src/test/org/apache/nutch/util/TestURLUtil.java +++ b/src/test/org/apache/nutch/util/TestURLUtil.java @@ -98,7 +98,7 @@ public void testGetDomainSuffix() throws Exception { Assert.assertEquals("org", URLUtil.getDomainSuffix(url)); url = new URL("http://140.211.11.130/foundation/contributing.html"); - Assert.assertEquals("", URLUtil.getDomainSuffix(url)); + Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://www.example.co.uk:8080/index.html"); Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url)); @@ -111,10 +111,10 @@ public void testGetDomainSuffix() throws Exception 
{ // "nn" is not a public suffix url = new URL("http://example.com.nn"); - Assert.assertEquals("", URLUtil.getDomainSuffix(url)); + Assert.assertNull(URLUtil.getDomainSuffix(url)); url = new URL("http://"); - Assert.assertEquals("", URLUtil.getDomainSuffix(url)); + Assert.assertNull(URLUtil.getDomainSuffix(url)); /* * "xyz" is an ICANN suffix since 2014, see @@ -157,17 +157,17 @@ public void testGetTopLevelDomain() throws Exception { Assert.assertEquals("org", URLUtil.getTopLevelDomainName(url)); url = new URL("http://140.211.11.130/foundation/contributing.html"); - Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + Assert.assertNull(URLUtil.getTopLevelDomainName(url)); url = new URL("http://www.example.co.uk:8080/index.html"); Assert.assertEquals("uk", URLUtil.getTopLevelDomainName(url)); // "nn" is not a public suffix url = new URL("http://example.com.nn"); - Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + Assert.assertNull(URLUtil.getTopLevelDomainName(url)); url = new URL("http://"); - Assert.assertEquals("", URLUtil.getTopLevelDomainName(url)); + Assert.assertNull(URLUtil.getTopLevelDomainName(url)); url = new URL("http://nic.삼성/"); Assert.assertEquals("xn--cg4bki", URLUtil.getTopLevelDomainName(url));