NUTCH-2868 urlnormalizer-protocol fails with StringIndexOutOfBoundsException
when reading invalid line in configuration file
- log invalid line and skip over it
- log more verbosely which configuration file is read
- add unit test to prove that invalid configuration lines are skipped
diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt
index 7e49703..fc7d86c 100644
--- a/src/plugin/urlnormalizer-protocol/data/protocols.txt
+++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt
@@ -19,3 +19,13 @@
 example.nl	https
 
 *.example.com	https
+
+# invalid input to verify whether this is handled nicely by the configuration file reader
+# no host/domain
+	https
+# no protocol
+invalid-rule1.example.top
+# more than two fields (skip rule)
+invalid-rule2.example.top	https	http
+# invalid protocol, not following RFC 1630 (skip rule)
+invalid-rule3.example.top	@mail
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index d747858..f2b475a 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -27,6 +27,7 @@
 import java.util.HashMap;
 import java.util.Map;
 import java.util.TreeMap;
+import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
 import org.slf4j.Logger;
@@ -69,9 +70,14 @@
 
   // Map of domain suffixes and protocol to be used for all hosts below this domain
   private final Map<String,String> domainProtocolsMap = new HashMap<>();
+
   // Matcher for domain suffixes
   private SuffixStringMatcher domainMatcher = null;
 
+  // validator for protocols/schemes following RFC 1630
+  private final static Pattern PROTOCOL_VALIDATOR = Pattern.compile(
+      "^[a-z](?:[a-z0-9$\\-_@.&!*\"'(),]|%[0-9a-f]{2})*$",
+      Pattern.CASE_INSENSITIVE);
 
   private synchronized void readConfiguration(Reader configReader) throws IOException {
     if (protocolsMap.size() > 0) {
@@ -82,19 +88,31 @@
     String line, host;
     String protocol;
     int delimiterIndex;
+    int lineNumber = 0;
 
     while ((line = reader.readLine()) != null) {
+      lineNumber++;
+      line = line.trim();
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
-        line = line.trim();
         delimiterIndex = line.indexOf(" ");
         // try tabulator
         if (delimiterIndex == -1) {
           delimiterIndex = line.indexOf("\t");
         }
+        if (delimiterIndex == -1) {
+          LOG.warn("Invalid line {}, no delimiter between <host/domain> and <protocol> found: {}", lineNumber, line);
+          continue;
+        }
 
         host = line.substring(0, delimiterIndex);
         protocol = line.substring(delimiterIndex + 1).trim();
 
+        if (!PROTOCOL_VALIDATOR.matcher(protocol).matches()) {
+          LOG.warn("Skipping rule with protocol not following RFC 1630 in line {}: {}",
+              lineNumber, line);
+          continue;
+        }
+
         /*
          * dedup protocol values to reduce memory footprint of map: equal
          * strings are represented by the same string object
@@ -172,13 +190,14 @@
     if (stringRules != null && !stringRules.isEmpty()) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
-      LOG.info("Reading {} rules file {}", pluginName, file);
+      LOG.info("Reading {} rules file {} from Java class path", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }
     try {
       if (reader == null) {
         Path path = new Path(file);
         FileSystem fs = path.getFileSystem(conf);
+        LOG.info("Reading {} rules file {}", pluginName, path.toUri());
         reader = new InputStreamReader(fs.open(path));
       }
       readConfiguration(reader);
diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index de9f77a..9775250 100644
--- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -83,5 +83,14 @@
         normalizer.normalize(
             "http://www.subdomain.example.com:8080/path?q=uery",
             URLNormalizers.SCOPE_DEFAULT));
+
+    // No change because of invalid rules in protocols.txt
+    // (verify that these rules are skipped)
+    assertEquals("http://invalid-rule3.example.top/", normalizer
+        .normalize("http://invalid-rule3.example.top/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://invalid-rule2.example.top/", normalizer
+        .normalize("http://invalid-rule2.example.top/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://invalid-rule3.example.top/", normalizer
+        .normalize("http://invalid-rule3.example.top/", URLNormalizers.SCOPE_DEFAULT));
   }
 }