Added filtering on whole string + documented config in nutch-default + fixed tests

Signed-off-by: Julien Nioche <julien@digitalpebble.com>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index d8bf764..8d97dca 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1877,6 +1877,30 @@
 </property>
 
 <property>
+  <name>urlfilter.fast.url.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed length of a URL; longer URLs are rejected by urlfilter-fast.
+  The default value of -1 disables this check.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.fast.url.path.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed length of the path element of a URL; URLs with a longer path are rejected by urlfilter-fast.
+  The default value of -1 disables this check.
+  </description>
+</property>
+
+<property>
+  <name>urlfilter.fast.url.query.max.length</name>
+  <value>-1</value>
+  <description>Maximum allowed length of the query element of a URL; URLs with a longer query are rejected by urlfilter-fast.
+  The default value of -1 disables this check.
+  </description>
+</property>
+
+<property>
   <name>urlfilter.order</name>
   <value></value>
   <description>The order by which URL filters are applied.
diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md
index e6205fc..b4b0dfc 100644
--- a/src/plugin/urlfilter-fast/README.md
+++ b/src/plugin/urlfilter-fast/README.md
@@ -76,5 +76,6 @@
 
 In addition to this, the filter checks that the length of the path element of the URL and its query
 done not exceed the values set in the properties `urlfilter.fast.url.path.max.length` and 
-`urlfilter.fast.url.query.max.length` if set. 
+`urlfilter.fast.url.query.max.length` if set. The overall length of the URL can also be limited
+with the property `urlfilter.fast.url.max.length`.
 
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index 6761d5e..ab905c1 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -97,6 +97,7 @@
 
   private Configuration conf;
   public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+  public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length";
   public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length";
   public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length";
   
@@ -107,21 +108,34 @@
   private int maxLengthPath = -1;
   /** Max allowed size of the query of a URL **/
   private int maxLengthQuery = -1;
+  /** Max allowed size for the whole URL **/
+  private int maxLength = -1;
 
   private static final Pattern CATCH_ALL_RULE = Pattern
       .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
 
   public FastURLFilter() {}
 
+  /** Used by the tests so that the rules file doesn't have to be in the jar **/
   FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
     reloadRules(rules);
   }
+  
+  /** Test-only constructor: loads the rules from the given reader and reads the path, query and
+   * overall URL length limits from conf (-1 when unset). Note: the conf field is not assigned here. **/
+  FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException {
+    maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+    maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+    maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
+    reloadRules(rules);
+  }
 
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
     maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
     maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+    maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
     try {
       reloadRules();
     } catch (Exception e) {
@@ -138,6 +152,12 @@
   @Override
   public String filter(String url) {
 
+    if (maxLength != -1 && url.length() > maxLength) {
+      LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url,
+          url.length(), maxLength);
+      return null;
+    }
+    
     URL u;
 
     try {
@@ -209,6 +229,10 @@
     String fileRules = conf.get(URLFILTER_FAST_FILE);
     try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
       reloadRules(reader);
+    } catch (Exception e) {
+      String message = "Couldn't load the rules from "+fileRules;
+      LOG.error(message, e); // log with the stack trace of the underlying failure
+      throw new IOException(message, e); // keep the root cause attached for callers
     }
   }
 
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 0b31a5a..75b3725 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -16,10 +16,10 @@
  */
 package org.apache.nutch.urlfilter.fast;
 
-import java.io.FileNotFoundException;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.Reader;
+import java.io.StringReader;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.net.URLFilter;
@@ -27,7 +27,6 @@
 import org.junit.Assert;
 import org.junit.Test;
 
-
 public class TestFastURLFilter extends RegexURLFilterBaseTest {
 
   @Override
@@ -55,12 +54,13 @@
     bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
   }
 
-  public void lengthQueryAndPath() throws FileNotFoundException {
-    URLFilter filter = getURLFilter(new FileReader(SAMPLES + SEPARATOR + "fast-urlfilter-test.txt"));
+  @Test
+  public void lengthQueryAndPath() throws Exception {
     Configuration conf = new Configuration();
     conf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50);
     conf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50);
-    filter.setConf(conf);
+    // not interested in testing rules
+    URLFilter filter = new FastURLFilter(new StringReader(""), conf);
 
     StringBuilder url = new StringBuilder("http://nutch.apache.org/");
     for (int i = 0; i < 50; i++) {
@@ -75,4 +75,18 @@
 
     Assert.assertEquals(null, filter.filter(url.toString()));
   }
+
+  @Test
+  public void overalLengthTest() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100);
+    // not interested in testing rules
+    URLFilter filter = new FastURLFilter(new StringReader(""), conf);
+
+    StringBuilder url = new StringBuilder("http://nutch.apache.org/");
+    for (int i = 0; i < 500; i++) {
+      url.append(i);
+    }
+    Assert.assertEquals(null, filter.filter(url.toString()));
+  }
 }