Merge pull request #451 from sebastian-nagel/NUTCH-2709-remove-unused-http-properties
NUTCH-2709 Remove unused properties and code related to HTTP protocol
diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
index 36e2d9e..3be0fd7 100644
--- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -16,8 +16,11 @@
*/
package org.apache.nutch.util;
+import java.util.Arrays;
import java.util.Collection;
+import java.util.Collections;
import java.util.Iterator;
+import java.util.List;
/**
* A class for efficiently matching <code>String</code>s against a set of
@@ -102,8 +105,9 @@
}
public static final void main(String[] argv) {
- PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
- "abcd", "abc", "aac", "baz", "foo", "foobar" });
+ String[] prefixes = new String[] { "abcd", "abc", "aac", "baz", "foo",
+ "foobar" };
+ PrefixStringMatcher matcher = new PrefixStringMatcher(prefixes);
String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
"aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
@@ -114,5 +118,23 @@
System.out.println(" shortest: " + matcher.shortestMatch(tests[i]));
System.out.println(" longest: " + matcher.longestMatch(tests[i]));
}
+
+ int iterations = 1000;
+ System.out.println("Testing thread-safety (NUTCH-2585) with " + iterations
+ + " iterations:");
+ List<String> testsList = Arrays.asList(tests);
+ for (int i = 0; i < iterations; i++) {
+ matcher = new PrefixStringMatcher(prefixes);
+ Collections.shuffle(testsList);
+ try {
+ long count = testsList.parallelStream().filter(matcher::matches).count();
+ System.out.print(String.format("Cycle %4d : %d matches\r", i, count));
+ } catch (Exception e) {
+ // flush output
+ System.out.println("");
+ throw e;
+ }
+ }
+ System.out.println("");
}
}
diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java
index fddecf3..d974ecb 100644
--- a/src/java/org/apache/nutch/util/TrieStringMatcher.java
+++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -23,6 +23,8 @@
/**
* TrieStringMatcher is a base class for simple tree-based string matching.
*
+ * This class is thread-safe during string matching but not when adding strings
+ * to the trie.
*/
public abstract class TrieStringMatcher {
protected TrieNode root;
@@ -103,9 +105,7 @@
*/
TrieNode getChild(char nextChar) {
if (children == null) {
- children = childrenList.toArray(new TrieNode[childrenList.size()]);
- childrenList = null;
- Arrays.sort(children);
+ compile();
}
int min = 0;
@@ -137,6 +137,18 @@
// if (this.nodeChar > other.nodeChar)
return 1;
}
+
+ /**
+ * Prepare node for matching. Synchronized: may be called concurrently by matchers.
+ */
+ synchronized void compile() {
+ if (childrenList != null) {
+ TrieNode[] sorted = childrenList.toArray(new TrieNode[childrenList.size()]);
+ Arrays.sort(sorted); // sort before publishing: getChild() reads children unlocked
+ children = sorted;
+ childrenList = null;
+ }
+ }
}
/**
diff --git a/src/plugin/urlfilter-automaton/ivy.xml b/src/plugin/urlfilter-automaton/ivy.xml
index 7c1968f..5ddf1db 100644
--- a/src/plugin/urlfilter-automaton/ivy.xml
+++ b/src/plugin/urlfilter-automaton/ivy.xml
@@ -36,7 +36,7 @@
</publications>
<dependencies>
- <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" />
+ <dependency org="dk.brics" name="automaton" rev="1.12-1" conf="*->default" />
</dependencies>
</ivy-module>