Merge pull request #649 from sebastian-nagel/NUTCH-2868-urlnormalizer-protocol-exception-reading-config-file
NUTCH-2868 urlnormalizer-protocol fails with StringIndexOutOfBoundsException
diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt
index 7e49703..fc7d86c 100644
--- a/src/plugin/urlnormalizer-protocol/data/protocols.txt
+++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt
@@ -19,3 +19,13 @@
example.nl https
*.example.com https
+
+# invalid input to verify whether this is handled nicely by the configuration file reader
+# no host/domain
+ https
+# no protocol
+invalid-rule1.example.top
+# more than two fields (skip rule)
+invalid-rule2.example.top https http
+# invalid protocol, not following RFC 1630 (skip rule)
+invalid-rule3.example.top @mail
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index d747858..f2b475a 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -27,6 +27,7 @@
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
+import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
@@ -69,9 +70,14 @@
// Map of domain suffixes and protocol to be used for all hosts below this domain
private final Map<String,String> domainProtocolsMap = new HashMap<>();
+
// Matcher for domain suffixes
private SuffixStringMatcher domainMatcher = null;
+ // accepts only protocols/schemes with RFC 1630 syntax, rules with other values are skipped
+ private static final Pattern PROTOCOL_VALIDATOR = Pattern.compile(
+ "^[a-z](?:[a-z0-9$\\-_@.&!*\"'(),]|%[0-9a-f]{2})*$",
+ Pattern.CASE_INSENSITIVE);
private synchronized void readConfiguration(Reader configReader) throws IOException {
if (protocolsMap.size() > 0) {
@@ -82,19 +88,31 @@
String line, host;
String protocol;
int delimiterIndex;
+ int lineNumber = 0;
while ((line = reader.readLine()) != null) {
+ lineNumber++;
+ line = line.trim();
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
- line = line.trim();
delimiterIndex = line.indexOf(" ");
// try tabulator
if (delimiterIndex == -1) {
delimiterIndex = line.indexOf("\t");
}
+ if (delimiterIndex == -1) {
+ LOG.warn("Invalid line {}, no delimiter between <host/domain> and <protocol> found: {}", lineNumber, line);
+ continue;
+ }
host = line.substring(0, delimiterIndex);
protocol = line.substring(delimiterIndex + 1).trim();
+ if (!PROTOCOL_VALIDATOR.matcher(protocol).matches()) {
+ LOG.warn("Skipping rule with protocol not following RFC 1630 in line {}: {}",
+ lineNumber, line);
+ continue;
+ }
+
/*
* dedup protocol values to reduce memory footprint of map: equal
* strings are represented by the same string object
@@ -172,13 +190,14 @@
if (stringRules != null && !stringRules.isEmpty()) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
- LOG.info("Reading {} rules file {}", pluginName, file);
+ LOG.info("Reading {} rules file {} from Java class path", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
Path path = new Path(file);
FileSystem fs = path.getFileSystem(conf);
+ LOG.info("Reading {} rules file {}", pluginName, path.toUri());
reader = new InputStreamReader(fs.open(path));
}
readConfiguration(reader);
diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index de9f77a..9775250 100644
--- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -83,5 +83,14 @@
normalizer.normalize(
"http://www.subdomain.example.com:8080/path?q=uery",
URLNormalizers.SCOPE_DEFAULT));
+
+ // No change because of invalid rules in protocols.txt
+ // (verify that these rules are skipped)
+ assertEquals("http://invalid-rule1.example.top/", normalizer
+ .normalize("http://invalid-rule1.example.top/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://invalid-rule2.example.top/", normalizer
+ .normalize("http://invalid-rule2.example.top/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://invalid-rule3.example.top/", normalizer
+ .normalize("http://invalid-rule3.example.top/", URLNormalizers.SCOPE_DEFAULT));
}
}