Merge pull request #506 from sebastian-nagel/NUTCH-2775-robots-min-delay
NUTCH-2775 Fetcher to guarantee minimum delay even if robots.txt defines shorter Crawl-delay
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 85d9933..6dfbe64 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -959,6 +959,18 @@
</property>
<property>
+ <name>fetcher.min.crawl.delay</name>
+ <value>${fetcher.server.delay}</value>
+ <description>
+ Minimum Crawl-Delay (in seconds) enforced for a host, even if its
+ robots.txt specifies a shorter delay. By default, the minimum Crawl-Delay
+ is set to the value of `fetcher.server.delay`, which guarantees that
+ a Crawl-Delay set in the robots.txt cannot make the crawler more
+ aggressive than the default configuration.
+ </description>
+</property>
+
+<property>
<name>fetcher.threads.fetch</name>
<value>10</value>
<description>The number of FetcherThreads the fetcher should use.
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 5d5a20b..549cd36 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -85,6 +85,7 @@
private URLNormalizers normalizers;
private ProtocolFactory protocolFactory;
private long maxCrawlDelay;
+ private long minCrawlDelay;
private String queueMode;
private int maxRedirect;
private boolean maxRedirectExceededSkip = false;
@@ -165,6 +166,9 @@
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
this.maxCrawlDelay = conf.getInt("fetcher.max.crawl.delay", 30) * 1000;
+ float crawlDelay = conf.getFloat("fetcher.server.delay", 1.0f);
+ this.minCrawlDelay = (long) (conf.getFloat("fetcher.min.crawl.delay",
+ crawlDelay) * 1000);
this.activeThreads = activeThreads;
this.fetchQueues = fetchQueues;
this.feeder = feeder;
@@ -324,8 +328,8 @@
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
fetchQueues.finishFetchItem(fit, true);
- LOG.info("Crawl-Delay for {} too long ({}), skipping", fit.url,
- rules.getCrawlDelay());
+ LOG.info("Crawl-Delay for {} too long ({} ms), skipping",
+ fit.url, rules.getCrawlDelay());
output(fit.url, fit.datum, null,
ProtocolStatus.STATUS_ROBOTS_DENIED,
CrawlDatum.STATUS_FETCH_GONE);
@@ -334,7 +338,14 @@
continue;
} else {
FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
- fiq.crawlDelay = rules.getCrawlDelay();
+ long crawlDelay = rules.getCrawlDelay();
+ if (crawlDelay < minCrawlDelay) {
+ LOG.info(
+ "Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
+ fit.url, rules.getCrawlDelay(), minCrawlDelay);
+ crawlDelay = minCrawlDelay;
+ }
+ fiq.crawlDelay = crawlDelay;
if (LOG.isDebugEnabled()) {
LOG.debug("Crawl delay for queue: " + fit.queueID
+ " is set to " + fiq.crawlDelay