NUTCH-3032 Code for an ArbitraryIndexingFilter to index values resolved by user POJO code at index time (#810)

diff --git a/build.xml b/build.xml
index 0a18682..49187d3 100644
--- a/build.xml
+++ b/build.xml
@@ -203,6 +203,7 @@
       <packageset dir="${plugins.dir}/headings/src/java"/>
       <packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
+      <packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
@@ -646,6 +647,7 @@
       <packageset dir="${plugins.dir}/headings/src/java"/>
       <packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
+      <packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
@@ -1173,6 +1175,8 @@
         <source path="${plugins.dir}/exchange-jexl/src/java/" />
         <source path="${plugins.dir}/index-anchor/src/java/" />
         <source path="${plugins.dir}/index-anchor/src/test/" />
+        <source path="${plugins.dir}/index-arbitrary/src/java/" />
+        <source path="${plugins.dir}/index-arbitrary/src/test/" />
         <source path="${plugins.dir}/index-basic/src/java/" />
         <source path="${plugins.dir}/index-basic/src/test/" />
         <source path="${plugins.dir}/index-geoip/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 8b24f09..edcaeb5 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2252,6 +2252,72 @@
   </description>
 </property>
 
+<!-- index-arbitrary plugin properties -->
+<property>
+  <name>index.arbitrary.function.count</name>
+  <value></value>
+  <description>The count of arbitrary additions/edits to the document.
+    Specify the remaining properties (fieldName, className, constructorArgs,
+    methodName, and methodArgs) independently in this file by appending a
+    dot (.) followed by integer numerals (beginning with '0') to the property
+    names, e.g.:
+
+    index.arbitrary.fieldName.0
+    for the field to add/set with the first arbitrary addition or:
+
+    index.arbitrary.className.3
+    for the POJO class name to use in setting the fourth arbitrary addition.
+  </description>
+</property>
+
+<property>
+  <name>index.arbitrary.fieldName.0</name>
+  <value></value>
+  <description>The name of the field to add to the document with the value
+    returned from the custom POJO.</description>
+</property>
+
+<property>
+  <name>index.arbitrary.className.0</name>
+  <value></value>
+  <description>The fully qualified name of the POJO class that will supply
+    values for the new field.</description>
+</property>
+
+<property>
+  <name>index.arbitrary.constructorArgs.0</name>
+  <value></value>
+  <description>The values (as strings) to pass into the POJO constructor.
+    The POJO must accept a String representation of the NutchDocument's URL
+    as the first parameter in the constructor. The values you specify here 
+    will populate the constructor arguments 1,..,n-1 where n=the count of
+    arguments to the constructor. Argument #0 will be the NutchDocument's URL.
+  </description>
+</property>
+
+<property>
+  <name>index.arbitrary.methodName.0</name>
+  <value></value>
+  <description>The name of the method to invoke on the instance of your custom
+    class in order to determine the value to add to the document.</description>
+  </property>
+
+<property>
+  <name>index.arbitrary.methodArgs.0</name>
+  <value></value>
+  <description>The values (as strings) to pass into the named method on the POJO
+    instance. Unlike the constructor args, there is no required argument that this
+    method in the POJO must accept, i.e., the Arbitrary Indexer doesn't supply any
+    arguments taken from the NutchDocument values by default.</description>
+</property>
+
+<property>
+  <name>index.arbitrary.overwrite.0</name>
+  <description>Whether to overwrite any existing value in the doc for
+    fieldName. Default is false if not specified in config.</description>
+  <value></value>
+</property>
+
 <!-- parse-metatags plugin properties -->
 <property>
   <name>metatags.names</name>
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 34688ed..498259a 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -40,6 +40,7 @@
     <ant dir="headings" target="deploy"/>
     <ant dir="exchange-jexl" target="deploy"/>
     <ant dir="index-anchor" target="deploy"/>
+    <ant dir="index-arbitrary" target="deploy"/>
     <ant dir="index-basic" target="deploy"/>
     <ant dir="index-geoip" target="deploy"/>
     <ant dir="index-jexl-filter" target="deploy"/>
@@ -117,6 +118,7 @@
      <ant dir="feed" target="test"/>
      <ant dir="headings" target="test"/>
      <ant dir="index-anchor" target="test"/>
+     <ant dir="index-arbitrary" target="test"/>
      <ant dir="index-basic" target="test"/>
      <!--ant dir="index-geoip" target="test"/-->
      <ant dir="index-jexl-filter" target="test"/>
@@ -179,6 +181,7 @@
     <ant dir="headings" target="clean"/>
     <ant dir="exchange-jexl" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
+    <ant dir="index-arbitrary" target="clean"/>
     <ant dir="index-basic" target="clean"/>
     <ant dir="index-geoip" target="clean"/>
     <ant dir="index-jexl-filter" target="clean"/>
diff --git a/src/plugin/index-arbitrary/build.xml b/src/plugin/index-arbitrary/build.xml
new file mode 100644
index 0000000..818020c
--- /dev/null
+++ b/src/plugin/index-arbitrary/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project name="index-arbitrary" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/index-arbitrary/ivy.xml b/src/plugin/index-arbitrary/ivy.xml
new file mode 100644
index 0000000..9feb1e1
--- /dev/null
+++ b/src/plugin/index-arbitrary/ivy.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" ?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/index-arbitrary/plugin.xml b/src/plugin/index-arbitrary/plugin.xml
new file mode 100644
index 0000000..f79188a
--- /dev/null
+++ b/src/plugin/index-arbitrary/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+    id="index-arbitrary"
+    name="Index Arbitrary"
+    version="1.0.0"
+    provider-name="nutch.org">
+
+  <runtime>
+    <library name="index-arbitrary.jar">
+      <export name="*"/>
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints"/>
+  </requires>
+
+
+  <extension id="org.apache.nutch.indexer.arbitrary"
+    name="Nutch arbitrary data indexer"
+    point="org.apache.nutch.indexer.IndexingFilter">
+    <implementation id="ArbitraryIndexingFilter"
+     class="org.apache.nutch.indexer.arbitrary.ArbitraryIndexingFilter"/>
+  </extension>
+
+</plugin>
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
new file mode 100644
index 0000000..7677ef7
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.lang.Class;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds arbitrary searchable fields to a document from the class and method
+ * the user identifies in the config. The user supplies the name of the field
+ * to add with the class and method names that supply the value.
+ * 
+ * Example:<br><br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.function.count&lt;/name&gt;<br>
+ *   &lt;value&gt;1&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.fieldName.0&lt;/name&gt;<br>
+ *   &lt;value&gt;advisors&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.className.0&lt;/name&gt;<br>
+ *   &lt;value&gt;com.example.arbitrary.AdvisorCalculator&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.constructorArgs.0&lt;/name&gt;<br>
+ *   &lt;value&gt;Kirk&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.methodName.0&lt;/name&gt;<br>
+ *   &lt;value&gt;countAdvisors&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ *   &lt;name&gt;index.arbitrary.methodArgs.0&lt;/name&gt;<br>
+ *   &lt;value&gt;Spock,McCoy&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * To set more than one arbitrary field value,
+ * increment {@code index.arbitrary.function.count} and
+ * repeat the rest of these blocks with successive int values
+ * appended to the property names, e.g. fieldName.1, methodName.1, etc.
+ */
+public class ArbitraryIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+    .getLogger(MethodHandles.lookup().lookupClass());
+
+  /** How many arbitrary field definitions to set. */
+  private int arbitraryAddsCount = 0;
+  
+  /** The name of the field to insert/overwrite in the NutchDocument */
+  private String fieldName;
+  
+  /** The fully-qualified class name of the custom class to use for the
+   *  new field. This class must be in the Nutch runtime classpath,
+   *  e.g., nutch/lib/ directory. */
+  private String className;
+  
+  /** The String values to pass to the custom class constructor. The plugin
+   *  will add the document url as the first argument in className's
+   *  String[] args. */
+  private String[] userConstrArgs;
+  
+  /** The array where the plugin copies the url &amp; the userConstrArgs
+   *  to create the instance of className. */
+  private String[] constrArgs;
+  
+  /** The name of the method in the custom class to call. Its return value
+   *  will become the value of fieldName in the NutchDocument. */
+  private String methodName;
+  
+  /** The String values of the arguments to methodName. It's up to the
+   *  developer of className to do any casts/conversions from String to
+   *  another class in the code of className. */
+  private String[] methodArgs;
+  
+  /** The result that returns from methodName. The plugin will set the value
+   *  of fieldName to this. */
+  private Object result;
+  
+  /** Optional flag to determine whether to overwrite the existing value in the
+   *  NutchDocument fieldName if this is set to true. Default behavior is to
+   *  add the value from calling methodName to existing values for fieldName. */
+  private boolean overwrite = false;
+  
+  /** Hadoop Configuration object to pass these values into the plugin. */
+  private Configuration conf;
+
+  /**
+   * The {@link ArbitraryIndexingFilter} filter object uses reflection
+   * to instantiate the configured class and invoke the configured method.
+   * It requires a few configuration settings for adding arbitrary fields
+   * and values to the NutchDocument as searchable fields.
+   * See {@code index.arbitrary.function.count}, and (possibly multiple
+   * instances when {@code index.arbitrary.function.count} &gt; 1) of the following
+   * {@code index.arbitrary.fieldName}.<em>index</em>,
+   * {@code index.arbitrary.className}.<em>index</em>,
+   * {@code index.arbitrary.constructorArgs}.<em>index</em>,
+   * {@code index.arbitrary.methodName}.<em>index</em>, and
+   * {@code index.arbitrary.methodArgs}.<em>index</em>
+   * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0
+   * to {@code index.arbitrary.function.count} - 1.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered by the user-specified class
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+                              CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    // Nothing can be added without both a document and its URL.
+    if (doc == null) {
+      LOG.debug("In filter() where doc is null for url == {}",
+               String.valueOf(url));
+      return doc;
+    } else if (url == null) {
+      LOG.debug("In filter() where url is null. Nothing to do.");
+      return doc;
+    }
+
+    for (int cfgCounter = 0; cfgCounter < arbitraryAddsCount; cfgCounter++) {
+      // Load this definition's settings into the instance fields.
+      setIndexedConf(conf, cfgCounter);
+      // Handles are per-iteration and result is reset so a failing definition
+      // cannot reuse the method or value of an earlier definition or document.
+      Method theMethod;
+      Constructor<?> theConstructor;
+      result = null;
+      try {
+        Class<?> theClass = Class.forName(className);
+        if (methodArgs.length > 0) {
+          theMethod = theClass.getDeclaredMethod(methodName, String[].class);
+        } else {
+          theMethod = theClass.getMethod(methodName);
+        }
+        theConstructor = theClass.getDeclaredConstructor(String[].class);
+      } catch (Exception e) {
+        LOG.error("Exception preparing reflection tasks. className was {}",
+            String.valueOf(className), e);
+        continue;
+      }
+      try {
+        // Argument #0 of the POJO constructor is always the document URL.
+        constrArgs = new String[userConstrArgs.length + 1];
+        constrArgs[0] = url.toString();
+        System.arraycopy(userConstrArgs, 0, constrArgs, 1, userConstrArgs.length);
+        // Wrap each String[] so reflection passes it as one argument.
+        Object instance = theConstructor.newInstance(new Object[]{constrArgs});
+        if (methodArgs.length > 0) {
+          result = theMethod.invoke(instance, new Object[]{methodArgs});
+        } else {
+          result = theMethod.invoke(instance);
+        }
+      } catch (Exception e) {
+        LOG.error("Exception in reflection trying to instantiate/invoke. "
+            + "url was {} & className was {}",
+            String.valueOf(url), String.valueOf(className), e);
+        // constrArgs[0] is the URL; [1] exists only with user-supplied args.
+        if (constrArgs != null && constrArgs.length > 1) {
+          LOG.error("constrArgs[1] was {}", String.valueOf(constrArgs[1]));
+        }
+        LOG.error("methodName was {}", String.valueOf(methodName));
+        if (methodArgs.length > 0) {
+          LOG.error("methodArgs[0] was {}", String.valueOf(methodArgs[0]));
+        }
+        continue;
+      }
+
+      LOG.debug("{}.{}() returned {} for field {}.", className,
+          methodName, String.valueOf(result), String.valueOf(fieldName));
+      // If user chose to overwrite, remove existing value
+      if (overwrite) {
+        LOG.debug("overwrite == true for fieldName == {} ", fieldName);
+        if (doc.getFieldNames().contains(fieldName)) {
+          LOG.debug("Removing field '{}' from doc for overwrite", fieldName);
+          doc.removeField(fieldName);
+        }
+      }
+      if (result == null) {
+        LOG.debug("Call to {}.{} returned null", className, methodName);
+        if (overwrite) {
+          LOG.debug("{} has been cleared.", fieldName);
+        }
+      }
+      LOG.debug("Adding value '{}' for field '{}' to doc", result, fieldName);
+      doc.add(fieldName, result);
+    }
+    return doc;
+  }
+
+  /**
+   * Set the {@link Configuration} object
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf; // retained: filter() reads the per-index properties from it
+    this.arbitraryAddsCount = conf.getInt("index.arbitrary.function.count", 1);
+    LOG.info("Will process the first {} fieldName defs in config.", this.arbitraryAddsCount);
+  }
+
+  /**
+   * Set the {@link Configuration} object for a specific set of values in the config
+   *
+   * @param conf
+   *        The Configuration object holding values for the current arbitrary field.
+   * @param ndx
+   *          The ordinal counter value for the current arbitrary field appended to the
+   *          base property names in the xml configuration file.
+   */
+  public void setIndexedConf(Configuration conf, int ndx) {
+    String suffix = String.valueOf(ndx); // appended to every per-definition property name
+    LOG.debug("In setIndexedConf() where ndx was passed in as {}", suffix);
+    fieldName = conf.get("index.arbitrary.fieldName." + suffix);
+    LOG.debug("Looking now for index.arbitrary.fieldName.{} which was: {}", suffix, fieldName);
+    // isEmpty() checks content; the former == "" only compared object references.
+    if (fieldName == null || fieldName.isEmpty()) {
+      throw new RuntimeException ("Problem in configuration where the index.arbitrary.fieldName."
+                                  + suffix + " is missing.");
+    }
+
+    className = conf.get("index.arbitrary.className." + suffix);
+    if (className == null || className.isEmpty()) {
+      throw new RuntimeException ("Problem in configuration where the index.arbitrary.className."
+                                  + suffix + " is missing.");
+    }
+
+    userConstrArgs = conf.getTrimmedStrings("index.arbitrary.constructorArgs." + suffix);
+    methodName = conf.get("index.arbitrary.methodName." + suffix, "");
+    methodArgs = conf.getTrimmedStrings("index.arbitrary.methodArgs." + suffix);
+    overwrite = conf.getBoolean("index.arbitrary.overwrite." + suffix, false);
+    if (overwrite) {
+      LOG.info("overwrite set == true for processing {}.", fieldName);
+    }
+  }
+
+  /**
+   * Get the {@link Configuration} object */
+  @Override
+  public Configuration getConf() {
+    return conf; // the Configuration last passed to setConf()
+  }
+}
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
new file mode 100644
index 0000000..6e6d475
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter that adds arbitrary data to the indexed document,
+ * taken from the output of a user-specified class.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
new file mode 100644
index 0000000..52e8939
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+
+import java.io.PrintStream;
+
+/** Test POJO whose getText() returns the second element of its constructor array. */
+public class Echo {
+  private static PrintStream out = System.out;
+  private String words;
+
+  public Echo(String[] args) {
+    this.words = String.valueOf(args[1]);
+  }
+
+  public String getText() {
+    return this.words;
+  }
+
+  /** Prints the second command-line argument to standard output. */
+  public static void main(String[] args) {
+    final String text = new Echo(args).getText();
+    out.println(text);
+  }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
new file mode 100644
index 0000000..38875d0
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import java.io.PrintStream;
+
+/** Test POJO that multiplies the numeric strings handed to getProduct(). */
+public class Multiplier {
+  private static PrintStream out = System.out;
+  private static PrintStream err = System.err;
+  // Running product; it persists across getProduct() calls on one instance.
+  private float product = 1;
+
+  public Multiplier(String[] args) {
+  }
+
+  /** Folds args into the running product, last element first, and returns it as text. */
+  public String getProduct(String[] args) {
+    int idx = args.length;
+    try {
+      while (--idx >= 0) {
+        product *= Float.parseFloat(args[idx]);
+      }
+    } catch (NumberFormatException nfe) {
+      err.println("NumberFormatException while trying to parse " + args[idx]);
+    }
+    return Float.toString(product);
+  }
+
+  public static void main(String[] args) {
+    out.println(new Multiplier(args).getProduct(args));
+  }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
new file mode 100644
index 0000000..17f31b1
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.lang.invoke.MethodHandles;
+
+/**
+ * Tests that the index-arbitrary filter can add a new field with an arbitrary
+ * value, supplement an existing field with an arbitrary value, and overwrite
+ * an existing field with an arbitrary value, where the arbitrary value comes
+ * from some POJO outside the normal Nutch codebase.
+ *
+ * @author Joe Gilvary
+ */
+public class TestArbitraryIndexingFilter {
+
+  /** Configuration handed to the filter; every test builds its own. */
+  Configuration conf;
+  /** Fresh {@link Inlinks} instance passed to every filter call. */
+  Inlinks inlinks;
+  /** Fresh {@link ParseImpl} instance passed to every filter call. */
+  ParseImpl parse;
+  /** Fresh {@link CrawlDatum} instance passed to every filter call. */
+  CrawlDatum crawlDatum;
+  /** URL passed to every filter call. */
+  Text url;
+  /** Filter under test; every test creates and configures its own. */
+  ArbitraryIndexingFilter filter;
+  /** Document handed to the filter and inspected afterwards. */
+  NutchDocument doc;
+
+  /**
+   * Creates the inputs that are passed unchanged to every filter call. The
+   * configuration, filter and document are created by the tests themselves.
+   */
+  @Before
+  public void setUp() throws Exception {
+    parse = new ParseImpl();
+    url = new Text("http://nutch.apache.org/index.html");
+    crawlDatum = new CrawlDatum();
+    inlinks = new Inlinks();
+  }
+
+  /**
+   * Test adding field with arbitrary content from POJO. A single function is
+   * configured: {@code getText} on {@code Echo}, targeting the field
+   * {@code foo}, which the test never adds to the document itself.
+   *
+   * @throws Exception
+   *           if the filter fails; the exception is left to propagate so that
+   *           the test report shows its full stack trace
+   */
+  @Test
+  public void testAddingNewField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "1");
+    conf.set("index.arbitrary.fieldName.0", "foo");
+    conf.set("index.arbitrary.className.0", "org.apache.nutch.indexer.arbitrary.Echo");
+    conf.set("index.arbitrary.constructorArgs.0", "Arbitrary text to add - bar");
+    conf.set("index.arbitrary.methodName.0", "getText");
+
+    filter = new ArbitraryIndexingFilter();
+    filter.setConf(conf);
+
+    doc = new NutchDocument();
+
+    filter.filter(doc, parse, url, crawlDatum, inlinks);
+
+    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
+
+    Assert.assertNotNull("doc has no field foo", doc.getField("foo"));
+
+    Assert.assertTrue("test if doc has new field with arbitrary value",
+        doc.getField("foo").getValues().contains("Arbitrary text to add - bar"));
+  }
+
+  /**
+   * Test supplementing a doc field with arbitrary content from POJO. Function
+   * 0 targets the new field {@code foo}; function 1 calls
+   * {@code Multiplier.getProduct} with the method arguments {@code -1,3.14}
+   * and targets {@code description}, which already holds one value. The new
+   * value is expected to be added beside the original one.
+   *
+   * @throws Exception
+   *           if the filter fails; the exception is left to propagate so that
+   *           the test report shows its full stack trace
+   */
+  @Test
+  public void testSupplementExistingField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "2");
+    conf.set("index.arbitrary.fieldName.0", "foo");
+    conf.set("index.arbitrary.className.0", "org.apache.nutch.indexer.arbitrary.Echo");
+    conf.set("index.arbitrary.constructorArgs.0", "Arbitrary text to add - bar");
+    conf.set("index.arbitrary.methodName.0", "getText");
+    conf.set("index.arbitrary.fieldName.1", "description");
+    conf.set("index.arbitrary.className.1", "org.apache.nutch.indexer.arbitrary.Multiplier");
+    conf.set("index.arbitrary.constructorArgs.1", "");
+    conf.set("index.arbitrary.methodName.1", "getProduct");
+    conf.set("index.arbitrary.methodArgs.1", "-1,3.14");
+
+    filter = new ArbitraryIndexingFilter();
+    filter.setConf(conf);
+
+    doc = new NutchDocument();
+    doc.add("description", "irrational");
+
+    // Pin the starting state so the later assertions prove an actual change.
+    Assert.assertFalse("doc is empty", doc.getFieldNames().isEmpty());
+
+    Assert.assertEquals("field description does not have exactly one value", 1,
+        doc.getField("description").getValues().size());
+
+    Assert.assertTrue("field description does not have initial value 'irrational'",
+        doc.getField("description").getValues().contains("irrational"));
+
+    filter.filter(doc, parse, url, crawlDatum, inlinks);
+
+    Assert.assertNotNull("doc has no field foo", doc.getField("foo"));
+
+    Assert.assertTrue("doc doesn't have new field with arbitrary value",
+        doc.getField("foo").getValues().contains("Arbitrary text to add - bar"));
+
+    // The original value must survive next to the newly computed product.
+    Assert.assertEquals("field description does not have 2 values", 2,
+        doc.getField("description").getValues().size());
+
+    Assert.assertTrue("field description original value gone",
+        doc.getField("description").getValues().contains("irrational"));
+
+    Assert.assertTrue("field description missing new value",
+        doc.getField("description").getValues().contains("-3.14"));
+  }
+
+  /**
+   * Test overwriting a doc field with arbitrary content from POJO. Function 2
+   * is the only one configured with {@code overwrite} set to {@code true}; it
+   * targets {@code philosopher}, whose single existing value is expected to be
+   * replaced rather than supplemented.
+   *
+   * @throws Exception
+   *           if the filter fails; the exception is left to propagate so that
+   *           the test report shows its full stack trace
+   */
+  @Test
+  public void testOverwritingExistingField() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set("index.arbitrary.function.count", "3");
+    conf.set("index.arbitrary.fieldName.0", "foo");
+    conf.set("index.arbitrary.className.0", "org.apache.nutch.indexer.arbitrary.Echo");
+    conf.set("index.arbitrary.constructorArgs.0", "Arbitrary text to add - bar");
+    conf.set("index.arbitrary.methodName.0", "getText");
+    conf.set("index.arbitrary.fieldName.1", "description");
+    conf.set("index.arbitrary.className.1", "org.apache.nutch.indexer.arbitrary.Multiplier");
+    conf.set("index.arbitrary.methodArgs.1", "-1,3.14159265");
+    conf.set("index.arbitrary.methodName.1", "getProduct");
+    conf.set("index.arbitrary.fieldName.2", "philosopher");
+    conf.set("index.arbitrary.className.2", "org.apache.nutch.indexer.arbitrary.Echo");
+    conf.set("index.arbitrary.constructorArgs.2", "Popeye");
+    conf.set("index.arbitrary.methodName.2", "getText");
+    conf.set("index.arbitrary.overwrite.2", "true");
+
+    filter = new ArbitraryIndexingFilter();
+    filter.setConf(conf);
+
+    doc = new NutchDocument();
+    doc.add("description", "irrational");
+    doc.add("philosopher", "Socrates");
+
+    // Pin the starting state so the later assertions prove an actual change.
+    Assert.assertEquals("field description does not have exactly one value", 1,
+        doc.getField("description").getValues().size());
+
+    Assert.assertEquals("field philosopher does not have exactly one value", 1,
+        doc.getField("philosopher").getValues().size());
+
+    Assert.assertTrue("field description does not have initial value 'irrational'",
+        doc.getField("description").getValues().contains("irrational"));
+
+    Assert.assertTrue("field philosopher does not have initial value 'Socrates'",
+        doc.getField("philosopher").getValues().contains("Socrates"));
+
+    filter.filter(doc, parse, url, crawlDatum, inlinks);
+
+    // Overwriting must replace the old value, leaving exactly one value behind.
+    Assert.assertEquals("field philosopher no longer has only one value", 1,
+        doc.getField("philosopher").getValues().size());
+
+    Assert.assertFalse("field philosopher's original value 'Socrates' NOT overwritten",
+        doc.getField("philosopher").getValues().contains("Socrates"));
+
+    Assert.assertTrue("field philosopher does not have new value 'Popeye'",
+        doc.getField("philosopher").getValues().contains("Popeye"));
+  }
+}