Merge pull request #428 from r0ann3l/NUTCH-2686

NUTCH-2686 New property: "moreIndexingFilter.mapMimeTypes.field"
diff --git a/conf/host-protocol-mapping.txt.template b/conf/host-protocol-mapping.txt.template
new file mode 100644
index 0000000..a09bca6
--- /dev/null
+++ b/conf/host-protocol-mapping.txt.template
@@ -0,0 +1,16 @@
+# This file defines a hostname to protocol plugin mapping. Each line takes a
+# host name followed by a tab, followed by the ID of the protocol plugin. You
+# can find the ID in the protocol plugin's plugin.xml file.
+#
+# <hostname>\t<plugin_id>\n
+# nutch.apache.org	org.apache.nutch.protocol.httpclient.Http
+# tika.apache.org	org.apache.nutch.protocol.http.Http
+#
+# If the requested host is not mapped, Nutch can choose any of the enabled
+# plugins so you can force defaults using:
+#
+# protocol:<protocol>\t<plugin_id>\n
+#
+# This example forces httpclient for http and https when the host is not mapped:
+# protocol:http	org.apache.nutch.protocol.httpclient.Http
+# protocol:https	org.apache.nutch.protocol.httpclient.Http
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 3e5cede..a42e6a9 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1369,11 +1369,11 @@
   <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
-  In any case you need at least include the nutch-extensionpoints plugin. By
-  default Nutch includes crawling just HTML and plain text via HTTP,
-  and basic indexing and search plugins. In order to use HTTPS please enable 
-  protocol-httpclient, but be aware of possible intermittent problems with the 
-  underlying commons-httpclient library. Set parsefilter-naivebayes for classification based focused crawler.
+  By default Nutch includes plugins to crawl HTML and various other
+  document formats via HTTP/HTTPS and to index the crawled content
+  into Solr.  More plugins are available to support more indexing
+  backends, to fetch ftp:// and file:// URLs, for focused crawling,
+  and many other use cases.
   </description>
 </property>
 
diff --git a/src/java/org/apache/nutch/plugin/Extension.java b/src/java/org/apache/nutch/plugin/Extension.java
index e73b850..be737cb 100644
--- a/src/java/org/apache/nutch/plugin/Extension.java
+++ b/src/java/org/apache/nutch/plugin/Extension.java
@@ -197,4 +197,15 @@
   public void setDescriptor(PluginDescriptor pDescriptor) {
     fDescriptor = pDescriptor;
   }
+
+  /**
+   * Returns a string of the form "id, class, target point", built from
+   * {@link #getId()}, {@link #getClazz()} and {@link #getTargetPoint()}.
+   *
+   * @return comma-separated summary of this extension
+   */
+  @Override
+  public String toString() {
+    return getId() + ", " + getClazz() + ", " + getTargetPoint();
+  }
 }
diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
index 2d20ecd..7f900b2 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -17,8 +17,13 @@
 
 package org.apache.nutch.protocol;
 
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.ExtensionPoint;
@@ -26,8 +31,13 @@
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.util.ObjectCache;
 
+import org.apache.commons.lang.StringUtils;
+
 import org.apache.hadoop.conf.Configuration;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Creates and caches {@link Protocol} plugins. Protocol plugins should define
  * the attribute "protocolName" with the name of the protocol that they
@@ -37,10 +47,16 @@
  */
 public class ProtocolFactory {
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   private ExtensionPoint extensionPoint;
 
   private Configuration conf;
 
+  protected Map<String, String> defaultProtocolImplMapping = new HashMap<>();
+  protected Map<String, String> hostProtocolMapping = new HashMap<>();
+
   public ProtocolFactory(Configuration conf) {
     this.conf = conf;
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
@@ -49,8 +65,55 @@
       throw new RuntimeException("x-point " + Protocol.X_POINT_ID
           + " not found.");
     }
-  }
 
+    // Load the optional host/protocol to plugin mappings. A missing resource
+    // is not an error: getProtocol() then falls back to the protocolName lookup.
+    java.io.Reader mappingReader = conf
+        .getConfResourceAsReader("host-protocol-mapping.txt");
+    if (mappingReader == null) {
+      LOG.info("host-protocol-mapping.txt not found, no host or default protocol mappings loaded");
+      return;
+    }
+
+    // try-with-resources closes the reader even if parsing fails
+    try (BufferedReader reader = new BufferedReader(mappingReader)) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        // Trim before the comment check so indented "#" lines are skipped too
+        line = line.trim();
+        if (StringUtils.isBlank(line) || line.startsWith("#")) {
+          continue;
+        }
+
+        String[] parts = line.split("\t");
+
+        // Must be exactly two parts: <key> <tab> <plugin_id>
+        if (parts.length != 2) {
+          LOG.warn("Wrong format of line: {}", line);
+          LOG.warn("Expected format: <hostname> <tab> <plugin_id> or protocol:<protocol> <tab> <plugin_id>");
+          continue;
+        }
+
+        // Is this a host to plugin mapping, or a default?
+        if (parts[0].indexOf(":") == -1) {
+          hostProtocolMapping.put(parts[0].trim(), parts[1].trim());
+          continue;
+        }
+
+        // Default mapping: the protocol name follows the first colon. Guard
+        // against keys such as "protocol:" where nothing follows the colon.
+        String[] moreParts = parts[0].split(":");
+        if (moreParts.length < 2) {
+          LOG.warn("Wrong format of line: {}", line);
+          LOG.warn("Expected format: <hostname> <tab> <plugin_id> or protocol:<protocol> <tab> <plugin_id>");
+          continue;
+        }
+        defaultProtocolImplMapping.put(moreParts[1].trim(), parts[1].trim());
+      }
+    } catch (IOException e) {
+      LOG.error("Unable to read host-protocol-mapping.txt", e);
+    }
+  }
   /**
    * Returns the appropriate {@link Protocol} implementation for a url.
    * 
@@ -83,52 +126,92 @@
    */
   public Protocol getProtocol(URL url)
       throws ProtocolNotFound {
-    ObjectCache objectCache = ObjectCache.get(conf);
     try {
-      String protocolName = url.getProtocol();
-      if (protocolName == null) {
-        throw new ProtocolNotFound(url.toString());
+      Protocol protocol = null;
+
+      // First attempt to resolve a protocol implementation by hostname
+      String host = url.getHost();
+      if (hostProtocolMapping.containsKey(host)) {
+        Extension extension = getExtensionById(hostProtocolMapping.get(host));
+        if (extension != null) {
+          protocol = getProtocolInstanceByExtension(extension);
+        }
       }
 
-      String cacheId = Protocol.X_POINT_ID + protocolName;
-      synchronized (objectCache) {
-        Protocol protocol = (Protocol) objectCache.getObject(cacheId);
-        if (protocol != null) {
-          return protocol;
+      // Nothing, see if we have defaults configured
+      if (protocol == null) {
+        // Protocol listed in default map?
+        if (defaultProtocolImplMapping.containsKey(url.getProtocol())) {
+          Extension extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol()));
+          if (extension != null) {
+            protocol = getProtocolInstanceByExtension(extension);
+          }
         }
+      }
 
-        Extension extension = findExtension(protocolName);
-        if (extension == null) {
-          throw new ProtocolNotFound(protocolName);
+      // Still couldn't find a protocol? Attempt by protocol
+      if (protocol == null) {
+        Extension extension = findExtension(url.getProtocol(), "protocolName");
+        if (extension != null) {
+          protocol = getProtocolInstanceByExtension(extension);
         }
+      }
 
-        protocol = (Protocol) extension.getExtensionInstance();
-        objectCache.setObject(cacheId, protocol);
+      // Got anything?
+      if (protocol != null) {
         return protocol;
       }
+
+      // Nothing!
+      throw new ProtocolNotFound(url.toString());
     } catch (PluginRuntimeException e) {
       throw new ProtocolNotFound(url.toString(), e.toString());
     }
   }
 
-  private Extension findExtension(String name) throws PluginRuntimeException {
+  private Protocol getProtocolInstanceByExtension(Extension extension) throws PluginRuntimeException {
+    Protocol protocol = null;
+    String cacheId = extension.getId();
+    ObjectCache objectCache = ObjectCache.get(conf);
+    synchronized (objectCache) {
+      if (!objectCache.hasObject(cacheId)) {
+        protocol = (Protocol) extension.getExtensionInstance();
+        objectCache.setObject(cacheId, protocol);
+      }
+      protocol = (Protocol) objectCache.getObject(cacheId);
+    }
 
+    return protocol;
+  }
+
+  private Extension getExtensionById(String id) {
     Extension[] extensions = this.extensionPoint.getExtensions();
-
     for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
+      if (id.equals(extensions[i].getId())) {
+        return extensions[i];
+      }
+    }
 
-      if (contains(name, extension.getAttribute("protocolName")))
+    return null;
+  }
+
+  private Extension findExtension(String name, String attribute) throws PluginRuntimeException {
+    for (int i = 0; i < this.extensionPoint.getExtensions().length; i++) {
+      Extension extension = this.extensionPoint.getExtensions()[i];
+
+      if (contains(name, extension.getAttribute(attribute)))
         return extension;
     }
     return null;
   }
 
   boolean contains(String what, String where) {
-    String parts[] = where.split("[, ]");
-    for (int i = 0; i < parts.length; i++) {
-      if (parts[i].equals(what))
-        return true;
+    if (where != null) {
+      String parts[] = where.split("[, ]");
+      for (int i = 0; i < parts.length; i++) {
+        if (parts[i].equals(what))
+          return true;
+      }
     }
     return false;
   }
diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index f5e67cc..b480033 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -29,45 +29,47 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * A collection of Jexl utilit(y|ies).
+ * Utility methods for handling JEXL expressions
  */
 public class JexlUtil {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  /**
-   * 
-   */
-  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+  /** Supported format for date parsing yyyy-MM-ddTHH:mm:ssZ */
+  private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
 
   /**
-   * Parses the given experssion to a Jexl expression. This supports
+   * Parses the given expression to a JEXL expression. This supports
    * date parsing.
    *
-   * @param expr the Jexl expression
-   * @return parsed Jexl expression or null in case of parse error
+   * @param expr string JEXL expression
+   * @return parsed JEXL expression or null in case of parse error
    */
   public static Expression parseExpression(String expr) {
     if (expr == null) return null;
     
     try {
-      // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z
-      Matcher matcher = datePattern.matcher(expr);
+      // Translate any date object into a long. Dates must be in the DATE_PATTERN
+      // format. For example: 2016-03-20T00:00:00Z
+      Matcher matcher = DATE_PATTERN.matcher(expr);
+
       if (matcher.find()) {
         String date = matcher.group();
         
-        // Parse the thing and get epoch!
+        // parse the matched substring and get the epoch
         Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
         long time = parsedDate.getTime();
         
-        // Replace in the original expression
+        // replace the original string date with the numeric value
         expr = expr.replace(date, Long.toString(time));
       }
-      
+
       JexlEngine jexl = new JexlEngine();
+
       jexl.setSilent(true);
       jexl.setStrict(true);
+
       return jexl.createExpression(expr);
     } catch (Exception e) {
       LOG.error(e.getMessage());
diff --git a/src/java/org/apache/nutch/util/ObjectCache.java b/src/java/org/apache/nutch/util/ObjectCache.java
index 4ed3fd0..f1b14c8 100644
--- a/src/java/org/apache/nutch/util/ObjectCache.java
+++ b/src/java/org/apache/nutch/util/ObjectCache.java
@@ -52,6 +52,17 @@
     return objectMap.get(key);
   }
 
+  /**
+   * Checks whether an entry is stored under the given key.
+   *
+   * @param key
+   *          cache key to look up
+   * @return true if the underlying map contains the key
+   */
+  public synchronized boolean hasObject(String key) {
+    return objectMap.containsKey(key);
+  }
+
   public synchronized void setObject(String key, Object value) {
     objectMap.put(key, value);
   }
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index 3fa2294..890020a 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -18,6 +18,7 @@
 package org.apache.nutch.indexer.jexl;
 
 import java.lang.invoke.MethodHandles;
+import java.util.List;
 import java.util.Map.Entry;
 
 import org.apache.commons.jexl2.Expression;
@@ -41,7 +42,6 @@
 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering of
  * documents based on a JEXL expression.
- *
  */
 public class JexlIndexingFilter implements IndexingFilter {
 
@@ -84,9 +84,12 @@
         metadataToContext(parse.getData().getParseMeta()));
 
     JexlContext context = new MapContext();
+
     for (Entry<String, NutchField> entry : doc) {
-      context.set(entry.getKey(), entry.getValue().getValues());
+      List<Object> values = entry.getValue().getValues();
+      context.set(entry.getKey(), values.size() > 1 ? values : values.get(0));
     }
+
     jcontext.set("doc", context);
 
     try {
@@ -102,16 +105,21 @@
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    String str = conf.get("index.jexl.filter");
-    if (str == null) {
-      LOG.warn(
+    String strExpr = conf.get("index.jexl.filter");
+
+    if (strExpr == null) {
+      LOG.error(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
+
       throw new RuntimeException(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
     }
-    expr = JexlUtil.parseExpression(str);
+
+    expr = JexlUtil.parseExpression(strExpr);
+
     if (expr == null) {
-      LOG.warn("Failed parsing JEXL from index.jexl.filter: {}", str);
+      LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
+
       throw new RuntimeException("Failed parsing JEXL from index.jexl.filter");
     }
   }
@@ -123,9 +131,12 @@
 
   private JexlContext metadataToContext(Metadata metadata) {
     JexlContext context = new MapContext();
+
     for (String name : metadata.names()) {
-      context.set(name, metadata.getValues(name));
+      String[] values = metadata.getValues(name);
+      context.set(name, values.length > 1 ? values : values[0]);
     }
+
     return context;
   }
 }
diff --git a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
index 0427ad4..f3cc655 100644
--- a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
@@ -39,7 +39,7 @@
   @Test
   public void testAllowMatchingDocument() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -73,7 +73,7 @@
   @Test
   public void testBlockNotMatchingDocuments() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -115,7 +115,7 @@
   @Test
   public void testInvalidExpression() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=<>:='en'");
+    conf.set("index.jexl.filter", "doc.lang=<>:='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     thrown.expect(RuntimeException.class);
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 81fbb22..05c215f 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -277,7 +277,7 @@
   static {
     try {
       // order here is important
-      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[0] = Pattern.compile("\\bfilename=['\"]([^\"]+)");
       patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
     } catch (PatternSyntaxException e) {
       // just ignore
diff --git a/src/test/org/apache/nutch/protocol/TestProtocolFactory.java b/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
index 394c303..7cab623 100644
--- a/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
+++ b/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
@@ -59,12 +59,6 @@
       Assert.fail("Must not throw any other exception");
     }
 
-    // cache key
-    Object protocol = ObjectCache.get(conf).getObject(
-        Protocol.X_POINT_ID + "http");
-    Assert.assertNotNull(protocol);
-    Assert.assertEquals(httpProtocol, protocol);
-
     // test same object instance
     try {
       Assert.assertTrue(httpProtocol == factory.getProtocol("http://somehost"));