From 9202bb5593f4805ac82ba7e839909930c7261ee0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Dec 2025 09:32:14 +0100
Subject: [PATCH 1/2] NUTCH-3136 Upgrade crawler-commons dependency

Robots.txt parser: use URL objects in the newly introduced methods to
avoid unnecessary re-parsing of URLs.
---
 ivy/ivy.xml                                   |  2 +-
 .../apache/nutch/fetcher/FetcherThread.java   |  4 ++--
 .../org/apache/nutch/protocol/Protocol.java   | 21 +++++++++++++++++++
 src/java/org/apache/nutch/util/URLUtil.java   |  2 +-
 .../nutch/protocol/http/api/HttpBase.java     |  6 ++++++
 .../org/apache/nutch/protocol/file/File.java  | 10 +++++++++
 .../org/apache/nutch/protocol/ftp/Ftp.java    |  9 ++++++++
 7 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f149ce13d5..d2cbfc8509 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -83,7 +83,7 @@
-    <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.5" conf="*->default" />
+    <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" conf="*->default" />

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 8b4e5c95c3..edf8fcfb41 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -358,7 +358,7 @@ public void run() {
         LOG.debug("redirectCount={}", redirectCount);
         redirecting = false;
         Protocol protocol = this.protocolFactory.getProtocol(fit.u);
-        BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
+        BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
             robotsTxtContent);
         if (robotsTxtContent != null) {
           outputRobotsTxt(robotsTxtContent);
@@ -381,7 +381,7 @@ public void run() {
           }
           continue;
         }
-        if (!rules.isAllowed(fit.url.toString())) {
+        if (!rules.isAllowed(fit.u)) {
           // unblock
           fetchQueues.finishFetchItem(fit, true);
           LOG.info("Denied by robots.txt: {}", fit.url);

diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index ab4162c87f..2514eae33e 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.protocol;
 
+import java.net.URL;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configurable;
@@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
   BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
       List<Content> robotsTxtContent);
 
+  /**
+   * Retrieve robot rules applicable for this URL.
+   *
+   * @param url
+   *          URL to check
+   * @param datum
+   *          page datum
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file
+   *          for debugging or archival purposes. Instead of a robots.txt
+   *          file, it may include redirects or an error page (404, etc.).
+   *          Response {@link Content} is appended to the passed list. If
+   *          null is passed nothing is stored.
+   * @return robot rules (specific for this URL or default), never null
+   */
+  default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return getRobotRules(new Text(url.toString()), datum, robotsTxtContent);
+  }
+
 }

diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 0cfce1c650..afd6f13857 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
    * https://publicsuffix.org/list/public_suffix_list.dat and are compared
    * using <a href=
-   * "https://crawler-commons.github.io/crawler-commons/1.5/crawlercommons/domains/EffectiveTldFinder.html">
+   * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
    * crawler-commons' EffectiveTldFinder</a>. <br>
   * Only ICANN domain suffixes are
   * used. Because EffectiveTldFinder loads the public suffix list as file
   * "effective_tld_names.dat" from the Java classpath, it's possible to use the

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 79b45882eb..caa3f861ea 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
+  }
+
   /**
    * Transforming a String[] into a HashMap for faster searching
    *

diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e4d2010696..877873b64b 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return RobotRulesParser.EMPTY_RULES;
   }
 
+  /**
+   * No robots parsing is done for the file protocol, so this returns a set
+   * of empty rules which allows every URL.
+   */
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return RobotRulesParser.EMPTY_RULES;
+  }
+
 }

diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 2a47b63d61..8cf58f75e7 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
+  /**
+   * Get the robots rules for a given URL.
+   */
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
+  }
+
   public int getBufferSize() {
     return BUFFER_SIZE;
   }

From 080c2c1529a51674e4b6400d69aa9ea02480d10c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Dec 2025 15:14:04 +0100
Subject: [PATCH 2/2] NUTCH-3136 Upgrade crawler-commons dependency

Update the URLUtil unit test to adapt to a change in the public suffix list.
---
 .../org/apache/nutch/util/TestURLUtil.java    | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index 59e486d696..32dda0929d 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -78,8 +78,13 @@ public void testGetDomainName() throws Exception {
     assertEquals("example.2000.hu", URLUtil.getDomainName(url));
 
     // test non-ascii
-    url = new URL("http://www.example.商業.tw");
-    assertEquals("example.商業.tw", URLUtil.getDomainName(url));
+    url = new URL("http://www.example.flå.no");
+    assertEquals("example.flå.no", URLUtil.getDomainName(url));
+    url = new URL("http://www.example.栃木.jp");
assertEquals("example.栃木.jp", URLUtil.getDomainName(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url)); // test URL without host/authority url = new URL("file:/path/index.html"); @@ -141,9 +146,14 @@ public void testGetDomainSuffix() throws Exception { url = new URL("http://www.example.2000.hu"); assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); - // test non-ascii - url = new URL("http://www.example.商業.tw"); - assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); + // test non-ASCII + url = new URL("http://www.example.flå.no"); + assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url)); + url = new URL("http://www.example.栃木.jp"); + assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); } @Test