Merge pull request #479 from YossiTamari/patch-6
NUTCH-2511 Support large sitemaps
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 2ba04d2..97fcbe4 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2671,4 +2671,12 @@
Maximum number of redirects to follow.
</description>
</property>
+
+<property>
+ <name>sitemap.size.max</name>
+ <value>52428800</value>
+ <description>
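+ Maximum sitemap size in bytes. SitemapProcessor also overrides http.content.limit and file.content.limit with this value in its mapper setup.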
+ </description>
+</property>
</configuration>
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 18e3871..f558c46 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -92,7 +92,8 @@
public static final String SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT = "sitemap.url.default.sitemap.xml";
public static final String SITEMAP_OVERWRITE_EXISTING = "sitemap.url.overwrite.existing";
public static final String SITEMAP_REDIR_MAX = "sitemap.redir.max";
-
+ public static final String SITEMAP_SIZE_MAX = "sitemap.size.max";
+
private static class SitemapMapper extends Mapper<Text, Writable, Text, CrawlDatum> {
private ProtocolFactory protocolFactory = null;
private boolean strict = true;
@@ -107,6 +108,9 @@
public void setup(Context context) {
Configuration conf = context.getConfiguration();
+ int maxSize = conf.getInt(SITEMAP_SIZE_MAX, SiteMapParser.MAX_BYTES_ALLOWED);
+ conf.setInt("http.content.limit", maxSize);
+ conf.setInt("file.content.limit", maxSize);
this.protocolFactory = new ProtocolFactory(conf);
this.filter = conf.getBoolean(SITEMAP_URL_FILTERING, true);
this.normalize = conf.getBoolean(SITEMAP_URL_NORMALIZING, true);