Merge pull request #479 from YossiTamari/patch-6

NUTCH-2511 Support large sitemaps
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 2ba04d2..97fcbe4 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2671,4 +2671,12 @@
     Maximum number of redirects to follow.
    </description>
 </property>
+
+<property>
+  <name>sitemap.size.max</name>
+  <value>52428800</value>
+  <description>
+    Maximum sitemap size in bytes. While processing sitemaps this value is also applied as http.content.limit and file.content.limit.
+   </description>
+</property>
 </configuration>
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 18e3871..f558c46 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -92,7 +92,8 @@
   public static final String SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT = "sitemap.url.default.sitemap.xml";
   public static final String SITEMAP_OVERWRITE_EXISTING = "sitemap.url.overwrite.existing";
   public static final String SITEMAP_REDIR_MAX = "sitemap.redir.max";
-  
+  public static final String SITEMAP_SIZE_MAX = "sitemap.size.max";
+
   private static class SitemapMapper extends Mapper<Text, Writable, Text, CrawlDatum> {
     private ProtocolFactory protocolFactory = null;
     private boolean strict = true;
@@ -107,6 +108,9 @@
 
     public void setup(Context context) {
       Configuration conf = context.getConfiguration();
+      int maxSize = conf.getInt(SITEMAP_SIZE_MAX, SiteMapParser.MAX_BYTES_ALLOWED);
+      conf.setInt("http.content.limit", maxSize);
+      conf.setInt("file.content.limit", maxSize);
       this.protocolFactory = new ProtocolFactory(conf);
       this.filter = conf.getBoolean(SITEMAP_URL_FILTERING, true);
       this.normalize = conf.getBoolean(SITEMAP_URL_NORMALIZING, true);