Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@

<dependency org="com.google.guava" name="guava" rev="33.4.8-jre" />

<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.4" />
<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" />

<dependency org="com.google.code.gson" name="gson" rev="2.13.1"/>
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ public void run() {
LOG.debug("redirectCount={}", redirectCount);
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.u);
BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
Expand All @@ -381,7 +381,7 @@ public void run() {
}
continue;
}
if (!rules.isAllowed(fit.url.toString())) {
if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Denied by robots.txt: {}", fit.url);
Expand Down
21 changes: 21 additions & 0 deletions src/java/org/apache/nutch/protocol/Protocol.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.nutch.protocol;

import java.net.URL;
import java.util.List;

import org.apache.hadoop.conf.Configurable;
Expand Down Expand Up @@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
List<Content> robotsTxtContent);

/**
 * Retrieve the robot rules that apply to the given URL.
 *
 * <p>Default implementation: wraps the {@link URL} as a {@code Text} and
 * delegates to {@link #getRobotRules(Text, CrawlDatum, List)}, so existing
 * protocol plugins need not override this method.</p>
 *
 * @param url
 *          URL to check
 * @param datum
 *          page datum
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 * @return robot rules (specific for this URL or default), never null
 */
default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  Text urlAsText = new Text(url.toString());
  return getRobotRules(urlAsText, datum, robotsTxtContent);
}

}
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/util/URLUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
* <a href= "https://publicsuffix.org/list/public_suffix_list.dat"
* >https://publicsuffix.org/list/public_suffix_list.dat</a> and are compared
* using <a href=
* "https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/domains/EffectiveTldFinder.html">
* "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
* crawler-commons' EffectiveTldFinder</a>. Only ICANN domain suffixes are
* used. Because EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath, it's possible to use the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * Retrieve the robot rules applicable for this URL, delegating to the
 * shared robots.txt handler of this protocol implementation.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  return robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
* Transforming a String[] into a HashMap for faster searching
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return RobotRulesParser.EMPTY_RULES;
}

/**
 * No robots.txt processing is performed for the file protocol, so this
 * always answers with an empty rule set, which permits every URL.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  // file: URLs are never disallowed — return the shared empty rule set
  return RobotRulesParser.EMPTY_RULES;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * Retrieve the robots rules for a given URL via the shared robots.txt
 * handler of this protocol implementation.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * @return the value of the {@code BUFFER_SIZE} constant used by this
 *         protocol implementation
 */
public int getBufferSize() {
return BUFFER_SIZE;
}
Expand Down
20 changes: 15 additions & 5 deletions src/test/org/apache/nutch/util/TestURLUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,13 @@ public void testGetDomainName() throws Exception {
assertEquals("example.2000.hu", URLUtil.getDomainName(url));

// test non-ascii
url = new URL("http://www.example.商業.tw");
assertEquals("example.商業.tw", URLUtil.getDomainName(url));
url = new URL("http://www.example.flå.no");
assertEquals("example.flå.no", URLUtil.getDomainName(url));
url = new URL("http://www.example.栃木.jp");
assertEquals("example.栃木.jp", URLUtil.getDomainName(url));
// broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
// url = new URL("http://www.example.商業.tw");
// Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url));

// test URL without host/authority
url = new URL("file:/path/index.html");
Expand Down Expand Up @@ -141,9 +146,14 @@ public void testGetDomainSuffix() throws Exception {
url = new URL("http://www.example.2000.hu");
assertEquals("2000.hu", URLUtil.getDomainSuffix(url));

// test non-ascii
url = new URL("http://www.example.商業.tw");
assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
// test non-ASCII
url = new URL("http://www.example.flå.no");
assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.栃木.jp");
assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url));
// broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
// url = new URL("http://www.example.商業.tw");
// assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
}

@Test
Expand Down