Skip to content

Commit

Permalink
Merge pull request #816 from sebastian-nagel/NUTCH-1942-domain-utils-to-use-crawler-commons
Browse files Browse the repository at this point in the history

NUTCH-1806 Delegate processing of URL domains to crawler-commons
NUTCH-1942 Remove TopLevelDomain
  • Loading branch information
sebastian-nagel authored Sep 17, 2024
2 parents 582cdd4 + 40881e8 commit 8b11962
Show file tree
Hide file tree
Showing 17 changed files with 208 additions and 5,192 deletions.
4,428 changes: 0 additions & 4,428 deletions conf/domain-suffixes.xml.template

This file was deleted.

130 changes: 0 additions & 130 deletions conf/domain-suffixes.xsd

This file was deleted.

1 change: 0 additions & 1 deletion default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ plugins.scoring=\
org.apache.nutch.scoring.opic*:\
org.apache.nutch.scoring.orphan*:\
org.apache.nutch.scoring.similarity*:\
org.apache.nutch.scoring.tld*:\
org.apache.nutch.scoring.urlmeta*\
org.apache.nutch.scoring.metadata*

Expand Down
2 changes: 1 addition & 1 deletion src/bin/nutch
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ elif [ "$COMMAND" = "filterchecker" ] ; then
elif [ "$COMMAND" = "normalizerchecker" ] ; then
CLASS=org.apache.nutch.net.URLNormalizerChecker
elif [ "$COMMAND" = "domainstats" ] ; then
CLASS=org.apache.nutch.util.domain.DomainStatistics
CLASS=org.apache.nutch.util.DomainStatistics
elif [ "$COMMAND" = "protocolstats" ] ; then
CLASS=org.apache.nutch.util.ProtocolStatusStatistics
elif [ "$COMMAND" = "crawlcomplete" ] ; then
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util.domain;
package org.apache.nutch.util;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
Expand All @@ -38,9 +38,6 @@
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand Down Expand Up @@ -192,7 +189,7 @@ public void map(Text urlText, CrawlDatum datum, Context context)
out = URLUtil.getDomainName(url);
break;
case MODE_SUFFIX:
out = URLUtil.getDomainSuffix(url).getDomain();
out = URLUtil.getDomainSuffix(url);
break;
case MODE_TLD:
out = URLUtil.getTopLevelDomainName(url);
Expand Down
Loading

0 comments on commit 8b11962

Please sign in to comment.