diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f149ce13d5..d2cbfc8509 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -83,7 +83,7 @@
- <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.5" conf="*->default" />
+ <dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" conf="*->default" />
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 8b4e5c95c3..edf8fcfb41 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -358,7 +358,7 @@ public void run() {
LOG.debug("redirectCount={}", redirectCount);
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.u);
- BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
+ BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
@@ -381,7 +381,7 @@ public void run() {
}
continue;
}
- if (!rules.isAllowed(fit.url.toString())) {
+ if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Denied by robots.txt: {}", fit.url);
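The two hunks above move the robots handling in FetcherThread from the Text field `fit.url` to the already-parsed `java.net.URL` in `fit.u`, so the URL is no longer stringified and re-parsed on the hot path. A minimal standalone sketch of the URL-based check, assuming crawler-commons 1.6 on the classpath (where `BaseRobotRules.isAllowed` also accepts a `java.net.URL`, as the second hunk relies on); the robots.txt payload and agent name below are made up for illustration:

```java
import java.net.URL;
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsCheck {
  public static void main(String[] args) throws Exception {
    // Hypothetical robots.txt content, just for this sketch.
    byte[] robotsTxt = "User-agent: *\nDisallow: /private/\n"
        .getBytes(StandardCharsets.UTF_8);
    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "https://example.org/robots.txt", robotsTxt, "text/plain", "mybot");
    // URL-based overload, matching the rules.isAllowed(fit.u) call above.
    System.out.println(rules.isAllowed(new URL("https://example.org/private/x"))); // false
    System.out.println(rules.isAllowed(new URL("https://example.org/public/x")));  // true
  }
}
```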
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index ab4162c87f..2514eae33e 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.protocol;
+import java.net.URL;
import java.util.List;
import org.apache.hadoop.conf.Configurable;
@@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
List<Content> robotsTxtContent);
+ /**
+ * Retrieve the robot rules applicable to this URL.
+ *
+ * @param url
+ * URL to check
+ * @param datum
+ * page datum
+ * @param robotsTxtContent
+ * container to store responses when fetching the robots.txt file for
+ * debugging or archival purposes. Instead of a robots.txt file, it
+ * may include redirects or an error page (404, etc.). Response
+ * {@link Content} is appended to the passed list. If null is passed,
+ * nothing is stored.
+ * @return robot rules (specific for this URL or default), never null
+ */
+ default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List<Content> robotsTxtContent) {
+ return getRobotRules(new Text(url.toString()), datum, robotsTxtContent);
+ }
+
}
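Because the new overload is declared as a default method delegating to the existing Text-based one, existing Protocol plugins compile and behave unchanged, while implementations that can do better (HttpBase, File, and Ftp below) simply override it. A self-contained sketch of this pattern with simplified stand-in types (`Rules` and `Proto` are illustrative, not Nutch classes):

```java
import java.net.MalformedURLException;
import java.net.URL;

interface Rules {
  boolean isAllowed(String url);
}

interface Proto {
  // Existing method that every implementation already provides.
  Rules getRobotRules(String url);

  // New overload: a default that delegates, so old implementations need
  // no change; new ones may override it to skip the string round trip.
  default Rules getRobotRules(URL url) {
    return getRobotRules(url.toString());
  }
}

public class DefaultOverloadDemo implements Proto {
  @Override
  public Rules getRobotRules(String url) {
    return u -> true; // allow everything, analogous to RobotRulesParser.EMPTY_RULES
  }

  public static void main(String[] args) throws MalformedURLException {
    Proto p = new DefaultOverloadDemo();
    // Callers holding a java.net.URL use the overload; the default
    // implementation routes through the legacy method.
    System.out.println(p.getRobotRules(new URL("https://example.org/"))
        .isAllowed("https://example.org/page")); // true
  }
}
```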
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 0cfce1c650..afd6f13857 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
* https://publicsuffix.org/list/public_suffix_list.dat and are compared
* using <a href=
- * "https://crawler-commons.github.io/crawler-commons/1.5/crawlercommons/domains/EffectiveTldFinder.html">
+ * "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
* crawler-commons' EffectiveTldFinder. Only ICANN domain suffixes are
* used. Because EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath, it's possible to use the
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 79b45882eb..caa3f861ea 100755
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List<Content> robotsTxtContent) {
+ return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
+ }
+
/**
* Transforming a String[] into a HashMap for faster searching
*
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index e4d2010696..877873b64b 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return RobotRulesParser.EMPTY_RULES;
}
+ /**
+ * No robots parsing is done for the file protocol, so this returns a set of
+ * empty rules which allows every URL.
+ */
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List<Content> robotsTxtContent) {
+ return RobotRulesParser.EMPTY_RULES;
+ }
+
}
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 2a47b63d61..8cf58f75e7 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}
+ /**
+ * Get the robot rules for a given URL.
+ */
+ @Override
+ public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
+ List<Content> robotsTxtContent) {
+ return robots.getRobotRulesSet(this, url, robotsTxtContent);
+ }
+
public int getBufferSize() {
return BUFFER_SIZE;
}
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index 59e486d696..32dda0929d 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -78,8 +78,13 @@ public void testGetDomainName() throws Exception {
assertEquals("example.2000.hu", URLUtil.getDomainName(url));
// test non-ascii
- url = new URL("http://www.example.商業.tw");
- assertEquals("example.商業.tw", URLUtil.getDomainName(url));
+ url = new URL("http://www.example.flå.no");
+ assertEquals("example.flå.no", URLUtil.getDomainName(url));
+ url = new URL("http://www.example.栃木.jp");
+ assertEquals("example.栃木.jp", URLUtil.getDomainName(url));
+ // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
+ // url = new URL("http://www.example.商業.tw");
+ // assertEquals("example.商業.tw", URLUtil.getDomainName(url));
// test URL without host/authority
url = new URL("file:/path/index.html");
@@ -141,9 +146,14 @@ public void testGetDomainSuffix() throws Exception {
url = new URL("http://www.example.2000.hu");
assertEquals("2000.hu", URLUtil.getDomainSuffix(url));
- // test non-ascii
- url = new URL("http://www.example.商業.tw");
- assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
+ // test non-ASCII
+ url = new URL("http://www.example.flå.no");
+ assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url));
+ url = new URL("http://www.example.栃木.jp");
+ assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url));
+ // broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
+ // url = new URL("http://www.example.商業.tw");
+ // assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
}
@Test
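
The expected suffixes in the updated assertions are the Punycode (ACE) encodings of the Unicode labels. They can be double-checked independently of EffectiveTldFinder with the JDK's java.net.IDN; a small standalone check, not part of the patch:

```java
import java.net.IDN;

public class IdnCheck {
  public static void main(String[] args) {
    // ACE forms of the IDN suffixes asserted above.
    System.out.println(IDN.toASCII("flå.no"));   // xn--fl-zia.no
    System.out.println(IDN.toASCII("栃木.jp"));  // xn--4pvxs.jp
    // Round trip back to Unicode for the suffix dropped from the test.
    System.out.println(IDN.toUnicode("xn--czrw28b.tw")); // 商業.tw
  }
}
```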