Merge pull request #485 from sebastian-nagel/NUTCH-2748-redir-exceeded
NUTCH-2748 Fetch status gone (redirect exceeded) not to overwrite existing items in CrawlDb
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 01f4578..58db620 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -331,15 +331,6 @@
</property>
<property>
- <name>http.redirect.max</name>
- <value>0</value>
- <description>The maximum number of redirects the fetcher will follow when
- trying to fetch a page. If set to negative or 0, fetcher won't immediately
- follow redirected URLs, instead it will record them for later fetching.
- </description>
-</property>
-
-<property>
<name>http.useHttp11</name>
<value>true</value>
<description>
@@ -1197,6 +1188,27 @@
<description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description>
</property>
+<property>
+ <name>http.redirect.max</name>
+ <value>0</value>
+ <description>The maximum number of redirects the fetcher will follow when
+ trying to fetch a page. If set to negative or 0, fetcher won't immediately
+ follow redirected URLs, instead it will record them for later fetching.
+ </description>
+</property>
+
+<property>
+ <name>http.redirect.max.exceeded.skip</name>
+ <value>false</value>
+ <description>
+  Whether to skip the last URL in a redirect chain when redirects
+ are followed (http.redirect.max > 0) and the maximum number of redirects
+ in a chain is exceeded (redirect_count > http.redirect.max).
+  If not skipped, the redirect target URLs are stored as `linked`
+ and fetched in one of the following cycles. See also NUTCH-2748.
+ </description>
+</property>
+
<!-- any23 plugin properties -->
<property>
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index e52b9ea..e3cf411 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -87,6 +87,7 @@
private long maxCrawlDelay;
private String queueMode;
private int maxRedirect;
+ private boolean maxRedirectExceededSkip = false;
private String reprUrl;
private boolean redirecting;
private int redirectCount;
@@ -197,7 +198,10 @@
queueMode = FetchItemQueues.checkQueueMode(queueMode);
LOG.info("{} {} Using queue mode : {}", getName(),
Thread.currentThread().getId(), queueMode);
+
this.maxRedirect = conf.getInt("http.redirect.max", 3);
+ this.maxRedirectExceededSkip = conf
+ .getBoolean("http.redirect.max.exceeded.skip", false);
int maxOutlinksPerPage = conf.getInt("db.max.outlinks.per.page", 100);
maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
@@ -449,12 +453,18 @@
if (redirecting && redirectCount > maxRedirect) {
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
- LOG.info("{} {} - redirect count exceeded {}", getName(),
- Thread.currentThread().getId(), fit.url);
+ LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
+ Thread.currentThread().getId(), fit.url,
+ maxRedirectExceededSkip ? "skipped" : "linked");
}
- output(fit.url, fit.datum, null,
- ProtocolStatus.STATUS_REDIR_EXCEEDED,
- CrawlDatum.STATUS_FETCH_GONE);
+ if (maxRedirectExceededSkip) {
+ // skip redirect target when redirect count is exceeded
+ } else {
+ Text newUrl = new Text(status.getMessage());
+ CrawlDatum newDatum = createRedirDatum(newUrl, fit,
+ CrawlDatum.STATUS_LINKED);
+ output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
+ }
}
} while (redirecting && (redirectCount <= maxRedirect));
@@ -550,36 +560,33 @@
LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
return url;
} else {
- CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED,
- fit.datum.getFetchInterval(), fit.datum.getScore());
- // transfer existing metadata
- newDatum.getMetaData().putAll(fit.datum.getMetaData());
- try {
- scfilters.initialScore(url, newDatum);
- } catch (ScoringFilterException e) {
- e.printStackTrace();
- }
- if (reprUrl != null) {
- newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
- new Text(reprUrl));
- }
+ CrawlDatum newDatum = createRedirDatum(url, fit, CrawlDatum.STATUS_LINKED);
output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
return null;
}
}
- private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
- throws ScoringFilterException {
- CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED,
- fit.datum.getFetchInterval(), fit.datum.getScore());
- // transfer all existing metadata to the redirect
+ private CrawlDatum createRedirDatum(Text redirUrl, FetchItem fit, byte status) {
+ CrawlDatum newDatum = new CrawlDatum(status, fit.datum.getFetchInterval(),
+ fit.datum.getScore());
+ // transfer existing metadata
newDatum.getMetaData().putAll(fit.datum.getMetaData());
- scfilters.initialScore(redirUrl, newDatum);
+ try {
+ scfilters.initialScore(redirUrl, newDatum);
+ } catch (ScoringFilterException e) {
+ LOG.error("Scoring filtering failed for {}: ", redirUrl, e);
+ }
if (reprUrl != null) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
new Text(reprUrl));
}
+ return newDatum;
+ }
+
+ private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
+ throws ScoringFilterException {
+ CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {
FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);