Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ivy/ivy.xml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@

<dependency org="com.google.guava" name="guava" rev="33.4.8-jre" />

<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.4" />
<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.6" />

<dependency org="com.google.code.gson" name="gson" rev="2.13.1"/>
<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ public void run() {
LOG.debug("redirectCount={}", redirectCount);
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(fit.u);
BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
BaseRobotRules rules = protocol.getRobotRules(fit.u, fit.datum,
robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
Expand All @@ -381,7 +381,7 @@ public void run() {
}
continue;
}
if (!rules.isAllowed(fit.url.toString())) {
if (!rules.isAllowed(fit.u)) {
// unblock
fetchQueues.finishFetchItem(fit, true);
LOG.info("Denied by robots.txt: {}", fit.url);
Expand Down
21 changes: 21 additions & 0 deletions src/java/org/apache/nutch/protocol/Protocol.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/
package org.apache.nutch.protocol;

import java.net.URL;
import java.util.List;

import org.apache.hadoop.conf.Configurable;
Expand Down Expand Up @@ -57,4 +58,24 @@ public interface Protocol extends Pluggable, Configurable {
BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
List<Content> robotsTxtContent);

/**
 * Retrieve the robot rules that apply to the given URL.
 *
 * <p>Default implementation: wraps the {@link URL} as a {@code Text} and
 * delegates to {@link #getRobotRules(Text, CrawlDatum, List)}, so existing
 * protocol plugins need not override this method.</p>
 *
 * @param url
 *          URL to check
 * @param datum
 *          page datum
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 * @return robot rules (specific for this URL or default), never null
 */
default BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  Text urlAsText = new Text(url.toString());
  return getRobotRules(urlAsText, datum, robotsTxtContent);
}

}
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/util/URLUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ static URL fixPureQueryTargets(URL base, String target)
* <a href= "https://publicsuffix.org/list/public_suffix_list.dat"
* >https://publicsuffix.org/list/public_suffix_list.dat</a> and are compared
* using <a href=
* "https://crawler-commons.github.io/crawler-commons/1.4/crawlercommons/domains/EffectiveTldFinder.html">
* "https://crawler-commons.github.io/crawler-commons/1.6/crawlercommons/domains/EffectiveTldFinder.html">
* crawler-commons' EffectiveTldFinder</a>. Only ICANN domain suffixes are
* used. Because EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath, it's possible to use the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,12 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * Retrieve the robot rules applicable for this URL, delegating to the
 * shared robots.txt handler of this protocol implementation.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  return robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
* Transforming a String[] into a HashMap for faster searching
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -232,4 +232,14 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return RobotRulesParser.EMPTY_RULES;
}

/**
 * No robots.txt processing is performed for the file protocol, so this
 * always answers with an empty rule set, which permits every URL.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  // file: URLs are never disallowed — return the shared empty rule set
  return RobotRulesParser.EMPTY_RULES;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,15 @@ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * Retrieve the robots rules for a given URL via the shared robots.txt
 * handler of this protocol implementation.
 */
@Override
public BaseRobotRules getRobotRules(URL url, CrawlDatum datum,
    List<Content> robotsTxtContent) {
  return this.robots.getRobotRulesSet(this, url, robotsTxtContent);
}

/**
 * @return the value of the {@code BUFFER_SIZE} constant used by this
 *         protocol implementation
 */
public int getBufferSize() {
return BUFFER_SIZE;
}
Expand Down
20 changes: 15 additions & 5 deletions src/test/org/apache/nutch/util/TestURLUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,13 @@ public void testGetDomainName() throws Exception {
assertEquals("example.2000.hu", URLUtil.getDomainName(url));

// test non-ascii
url = new URL("http://www.example.商業.tw");
assertEquals("example.商業.tw", URLUtil.getDomainName(url));
url = new URL("http://www.example.flå.no");
assertEquals("example.flå.no", URLUtil.getDomainName(url));
url = new URL("http://www.example.栃木.jp");
assertEquals("example.栃木.jp", URLUtil.getDomainName(url));
// broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
// url = new URL("http://www.example.商業.tw");
// Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url));

// test URL without host/authority
url = new URL("file:/path/index.html");
Expand Down Expand Up @@ -141,9 +146,14 @@ public void testGetDomainSuffix() throws Exception {
url = new URL("http://www.example.2000.hu");
assertEquals("2000.hu", URLUtil.getDomainSuffix(url));

// test non-ascii
url = new URL("http://www.example.商業.tw");
assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
// test non-ASCII
url = new URL("http://www.example.flå.no");
assertEquals("xn--fl-zia.no", URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.栃木.jp");
assertEquals("xn--4pvxs.jp", URLUtil.getDomainSuffix(url));
// broken by https://github.com/publicsuffix/list/commit/408a7b0bdec993884865baaa2f0d14cc9a060885
// url = new URL("http://www.example.商業.tw");
// assertEquals("xn--czrw28b.tw", URLUtil.getDomainSuffix(url));
}

@Test
Expand Down