From 9202bb5593f4805ac82ba7e839909930c7261ee0 Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Dec 2025 09:32:14 +0100
Subject: [PATCH 1/2] NUTCH-3136 Upgrade crawler-commons dependency

Robots.txt parser: use URL objects in the newly introduced methods to
avoid unnecessary re-parsing of URLs.
---
 ivy/ivy.xml                                   |  2 +-
 .../apache/nutch/fetcher/FetcherThread.java   |  4 ++--
 .../org/apache/nutch/protocol/Protocol.java   | 21 +++++++++++++++++++
 src/java/org/apache/nutch/util/URLUtil.java   |  2 +-
 .../nutch/protocol/http/api/HttpBase.java     |  6 ++++++
 .../org/apache/nutch/protocol/file/File.java  | 10 +++++++++
 .../org/apache/nutch/protocol/ftp/Ftp.java    |  9 ++++++++
 7 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f149ce13d5..d2cbfc8509 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -83,7 +83,7 @@
-    <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.5" conf="*->default" />
+    <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" conf="*->default" />

diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 8b4e5c95c3..edf8fcfb41 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -358,7 +358,7 @@ public void run() {
         LOG.debug("redirectCount={}", redirectCount);
         redirecting = false;
         Protocol protocol = this.protocolFactory.getProtocol(fit.u);
-        BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
+        BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
             robotsTxtContent);
         if (robotsTxtContent != null) {
           outputRobotsTxt(robotsTxtContent);
@@ -381,7 +381,7 @@ public void run() {
           }
           continue;
         }
-        if (!rules.isAllowed(fit.url.toString())) {
+        if (!rules.isAllowed(fit.u)) {
           // unblock
           fetchQueues.finishFetchItem(fit, true);
           LOG.info("Denied by robots.txt: {}", fit.url);

diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index ab4162c87f..2514eae33e 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.protocol;
 
+import java.net.URL;
 import java.util.List;
 
 import org.apache.hadoop.conf.Configurable;
@@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
   BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
       List<Content> robotsTxtContent);
 
+  /**
+   * Retrieve robot rules applicable for this URL.
+   *
+   * @param url
+   *          URL to check
+   * @param datum
+   *          page datum
+   * @param robotsTxtContent
+   *          container to store responses when fetching the robots.txt file
+   *          for debugging or archival purposes. Instead of a robots.txt
+   *          file, it may include redirects or an error page (404, etc.).
+   *          Response {@link Content} is appended to the passed list. If
+   *          null is passed nothing is stored.
+   * @return robot rules (specific for this URL or default), never null
+   */
+  default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return getRobotRules(new Text(url.toString()), datum, robotsTxtContent);
+  }
+
 }

diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 0cfce1c650..afd6f13857 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
    * https://publicsuffix.org/list/public_suffix_list.dat and are compared
    * using <a href=
-   * "https://crawler-commons.github.io/crawler-commons/1.5/crawlercommons/domains/EffectiveTldFinder.html">
+   * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
    * crawler-commons' EffectiveTldFinder</a>. <br>
   * Only ICANN domain suffixes are
   * used. Because EffectiveTldFinder loads the public suffix list as file
   * "effective_tld_names.dat" from the Java classpath, it's possible to use the

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 79b45882eb..caa3f861ea 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
+  }
+
   /**
    * Transforming a String[] into a HashMap for faster searching
    *

diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e4d2010696..877873b64b 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return RobotRulesParser.EMPTY_RULES;
   }
 
+  /**
+   * No robots parsing is done for the file protocol, so this returns a set
+   * of empty rules which allows every URL.
+   */
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return RobotRulesParser.EMPTY_RULES;
+  }
+
 }

diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 2a47b63d61..8cf58f75e7 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
     return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
 
+  /**
+   * Get the robots rules for a given URL.
+   */
+  @Override
+  public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+      List<Content> robotsTxtContent) {
+    return robots.getRobotRulesSet(this, url, robotsTxtContent);
+  }
+
   public int getBufferSize() {
     return BUFFER_SIZE;
   }

From 080c2c1529a51674e4b6400d69aa9ea02480d10c Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 12 Dec 2025 15:14:04 +0100
Subject: [PATCH 2/2] NUTCH-3136 Upgrade crawler-commons dependency

Update the URLUtil unit test to adapt to a change in the public suffix list.
---
 .../org/apache/nutch/util/TestURLUtil.java    | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index 59e486d696..32dda0929d 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -78,8 +78,13 @@ public void testGetDomainName() throws Exception {
     assertEquals("example.2000.hu", URLUtil.getDomainName(url));
 
     // test non-ascii
-    url = new URL("http://www.example.商業.tw");
-    assertEquals("example.商業.tw", URLUtil.getDomainName(url));
+    url = new URL("http://www.example.flå.no");
+    assertEquals("example.flå.no", URLUtil.getDomainName(url));
+    url = new URL("http://www.example.栃木.jp");
assertEquals("example.栃木.jp", URLUtil.getDomainName(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url)); // test URL without host/authority url = new URL("file:/path/index.html"); @@ -141,9 +146,14 @@ public void testGetDomainSuffix() throws Exception { url = new URL("http://www.example.2000.hu"); assertEquals("2000.hu", URLUtil.getDomainSuffix(url)); - // test non-ascii - url = new URL("http://www.example.商業.tw"); - assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); + // test non-ASCII + url = new URL("http://www.example.flå.no"); + assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url)); + url = new URL("http://www.example.栃木.jp"); + assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url)); + // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885 + // url = new URL("http://www.example.商業.tw"); + // assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url)); } @Test