Added filtering on whole string + documented config in nutch-default + fixed tests
Signed-off-by: Julien Nioche <julien@digitalpebble.com>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index d8bf764..8d97dca 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1877,6 +1877,30 @@
</property>
<property>
+ <name>urlfilter.fast.url.max.length</name>
+ <value>-1</value>
+ <description>Maximum length (in characters) allowed for a whole URL; longer URLs are filtered out.
+ The default value of -1 deactivates this check.
+ </description>
+</property>
+
+<property>
+ <name>urlfilter.fast.url.path.max.length</name>
+ <value>-1</value>
+ <description>Maximum length allowed for the path element of a URL; URLs with a longer path are filtered out.
+ The default value of -1 deactivates this check.
+ </description>
+</property>
+
+<property>
+ <name>urlfilter.fast.url.query.max.length</name>
+ <value>-1</value>
+ <description>Maximum length allowed for the query element of a URL; URLs with a longer query are filtered out.
+ The default value of -1 deactivates this check.
+ </description>
+</property>
+
+<property>
<name>urlfilter.order</name>
<value></value>
<description>The order by which URL filters are applied.
diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md
index e6205fc..b4b0dfc 100644
--- a/src/plugin/urlfilter-fast/README.md
+++ b/src/plugin/urlfilter-fast/README.md
@@ -76,5 +76,6 @@
In addition to this, the filter checks that the length of the path element of the URL and its query
done not exceed the values set in the properties `urlfilter.fast.url.path.max.length` and
-`urlfilter.fast.url.query.max.length` if set.
+`urlfilter.fast.url.query.max.length` if set. The overall length of the URL can also be used for
+filtering through the config `urlfilter.fast.url.max.length`.
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
index 6761d5e..ab905c1 100644
--- a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -97,6 +97,7 @@
private Configuration conf;
public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+ public static final String URLFILTER_FAST_MAX_LENGTH = "urlfilter.fast.url.max.length";
public static final String URLFILTER_FAST_PATH_MAX_LENGTH = "urlfilter.fast.url.path.max.length";
public static final String URLFILTER_FAST_QUERY_MAX_LENGTH = "urlfilter.fast.url.query.max.length";
@@ -107,21 +108,34 @@
private int maxLengthPath = -1;
/** Max allowed size of the query of a URL **/
private int maxLengthQuery = -1;
+ /** Max allowed size for the whole URL **/
+ private int maxLength = -1;
private static final Pattern CATCH_ALL_RULE = Pattern
.compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
public FastURLFilter() {}
+ /** Test-only constructor: loads the rules from {@code rules} so that the rules file doesn't have to be in the jar **/
FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
reloadRules(rules);
}
+
+ /** Test-only constructor: loads the rules from {@code rules} (no rules file needed in the jar) AND
+ * reads the path, query and whole-URL length limits from {@code conf} (each defaults to -1) **/
+ FastURLFilter(Reader rules, Configuration conf) throws IOException, PatternSyntaxException {
+ maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
+ maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+ maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
+ reloadRules(rules);
+ }
@Override
public void setConf(Configuration conf) {
this.conf = conf;
maxLengthPath = conf.getInt(URLFILTER_FAST_PATH_MAX_LENGTH, -1);
maxLengthQuery = conf.getInt(URLFILTER_FAST_QUERY_MAX_LENGTH, -1);
+ maxLength = conf.getInt(URLFILTER_FAST_MAX_LENGTH, -1);
try {
reloadRules();
} catch (Exception e) {
@@ -138,6 +152,12 @@
@Override
public String filter(String url) {
+ if (maxLength != -1 && url.length() > maxLength) {
+ LOG.debug("Rejected {} because URL length ({}) greater than limit {}", url,
+ url.length(), maxLength);
+ return null;
+ }
+
URL u;
try {
@@ -209,6 +229,10 @@
String fileRules = conf.get(URLFILTER_FAST_FILE);
try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
reloadRules(reader);
+ } catch (Exception e) {
+ String message = "Couldn't load the rules from "+fileRules;
+ LOG.error(message);
+ throw new IOException(message);
}
}
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
index 0b31a5a..75b3725 100644
--- a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -16,10 +16,10 @@
*/
package org.apache.nutch.urlfilter.fast;
-import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
+import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
@@ -27,7 +27,6 @@
import org.junit.Assert;
import org.junit.Test;
-
public class TestFastURLFilter extends RegexURLFilterBaseTest {
@Override
@@ -55,12 +54,13 @@
bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
}
- public void lengthQueryAndPath() throws FileNotFoundException {
- URLFilter filter = getURLFilter(new FileReader(SAMPLES + SEPARATOR + "fast-urlfilter-test.txt"));
+ @Test
+ public void lengthQueryAndPath() throws Exception {
Configuration conf = new Configuration();
conf.setInt(FastURLFilter.URLFILTER_FAST_PATH_MAX_LENGTH, 50);
conf.setInt(FastURLFilter.URLFILTER_FAST_QUERY_MAX_LENGTH, 50);
- filter.setConf(conf);
+ // not interested in testing rules
+ URLFilter filter = new FastURLFilter(new StringReader(""), conf);
StringBuilder url = new StringBuilder("http://nutch.apache.org/");
for (int i = 0; i < 50; i++) {
@@ -75,4 +75,18 @@
Assert.assertEquals(null, filter.filter(url.toString()));
}
+
+ @Test
+ public void overalLengthTest() throws Exception {
+ // Empty rule set: only the whole-URL length limit (100) is exercised here.
+ Configuration lengthConf = new Configuration();
+ lengthConf.setInt(FastURLFilter.URLFILTER_FAST_MAX_LENGTH, 100);
+ URLFilter lengthFilter = new FastURLFilter(new StringReader(""), lengthConf);
+
+ StringBuilder longUrl = new StringBuilder("http://nutch.apache.org/");
+ for (int n = 0; n < 500; n++) {
+ longUrl.append(n); // appending 0..499 pushes the URL far beyond the limit
+ }
+ Assert.assertNull(lengthFilter.filter(longUrl.toString()));
+ }
}