NUTCH-3032 Code for an ArbitraryIndexingFilter to index values resolved by user POJO code at index time (#810)
diff --git a/build.xml b/build.xml
index 0a18682..49187d3 100644
--- a/build.xml
+++ b/build.xml
@@ -203,6 +203,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
+ <packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
@@ -646,6 +647,7 @@
<packageset dir="${plugins.dir}/headings/src/java"/>
<packageset dir="${plugins.dir}/exchange-jexl/src/java"/>
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
+ <packageset dir="${plugins.dir}/index-arbitrary/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
<packageset dir="${plugins.dir}/index-geoip/src/java"/>
<packageset dir="${plugins.dir}/index-jexl-filter/src/java"/>
@@ -1173,6 +1175,8 @@
<source path="${plugins.dir}/exchange-jexl/src/java/" />
<source path="${plugins.dir}/index-anchor/src/java/" />
<source path="${plugins.dir}/index-anchor/src/test/" />
+ <source path="${plugins.dir}/index-arbitrary/src/java/" />
+ <source path="${plugins.dir}/index-arbitrary/src/test/" />
<source path="${plugins.dir}/index-basic/src/java/" />
<source path="${plugins.dir}/index-basic/src/test/" />
<source path="${plugins.dir}/index-geoip/src/java/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 8b24f09..edcaeb5 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2252,6 +2252,72 @@
</description>
</property>
+<!-- index-arbitrary plugin properties -->
+<property>
+ <name>index.arbitrary.function.count</name>
+ <value></value>
+ <description>The count of arbitrary additions/edits to the document.
+ Specify the remaining properties (fieldName, className, constructorArgs,
+ methodName, and methodArgs) independently in this file by appending a
+ dot (.) followed by integer numerals (beginning with '0') to the property
+ names, e.g.:
+
+ index.arbitrary.fieldName.0
+ for the field to add/set with the first arbitrary addition or:
+
+ index.arbitrary.className.3
+ for the POJO class name to use in setting the fourth arbitrary addition.
+ </description>
+</property>
+
+<property>
+ <name>index.arbitrary.fieldName.0</name>
+ <value></value>
+ <description>The name of the field to add to the document with the value
+ returned from the custom POJO.</description>
+</property>
+
+<property>
+ <name>index.arbitrary.className.0</name>
+ <value></value>
+ <description>The fully qualified name of the POJO class that will supply
+ values for the new field.</description>
+</property>
+
+<property>
+ <name>index.arbitrary.constructorArgs.0</name>
+ <value></value>
+ <description>The values (as strings) to pass into the POJO constructor.
+ The POJO must accept a String representation of the NutchDocument's URL
+ as the first parameter in the constructor. The values you specify here
+ will populate the constructor arguments 1,..,n-1 where n=the count of
+ arguments to the constructor. Argument #0 will be the NutchDocument's URL.
+ </description>
+</property>
+
+<property>
+ <name>index.arbitrary.methodName.0</name>
+ <value></value>
+ <description>The name of the method to invoke on the instance of your custom
+ class in order to determine the value to add to the document.</description>
+ </property>
+
+<property>
+ <name>index.arbitrary.methodArgs.0</name>
+ <value></value>
+ <description>The values (as strings) to pass into the named method on the POJO
+ instance. Unlike the constructor args, there is no required argument that this
+ method in the POJO must accept, i.e., the Arbitrary Indexer doesn't supply any
+ arguments taken from the NutchDocument values by default.</description>
+</property>
+
+<property>
+ <name>index.arbitrary.overwrite.0</name>
+ <description>Whether to overwrite any existing value in the doc for
+ fieldName. Default is false if not specified in config.</description>
+ <value></value>
+</property>
+
<!-- parse-metatags plugin properties -->
<property>
<name>metatags.names</name>
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 34688ed..498259a 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -40,6 +40,7 @@
<ant dir="headings" target="deploy"/>
<ant dir="exchange-jexl" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
+ <ant dir="index-arbitrary" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
<ant dir="index-geoip" target="deploy"/>
<ant dir="index-jexl-filter" target="deploy"/>
@@ -117,6 +118,7 @@
<ant dir="feed" target="test"/>
<ant dir="headings" target="test"/>
<ant dir="index-anchor" target="test"/>
+ <ant dir="index-arbitrary" target="test"/>
<ant dir="index-basic" target="test"/>
<!--ant dir="index-geoip" target="test"/-->
<ant dir="index-jexl-filter" target="test"/>
@@ -179,6 +181,7 @@
<ant dir="headings" target="clean"/>
<ant dir="exchange-jexl" target="clean"/>
<ant dir="index-anchor" target="clean"/>
+ <ant dir="index-arbitrary" target="clean"/>
<ant dir="index-basic" target="clean"/>
<ant dir="index-geoip" target="clean"/>
<ant dir="index-jexl-filter" target="clean"/>
diff --git a/src/plugin/index-arbitrary/build.xml b/src/plugin/index-arbitrary/build.xml
new file mode 100644
index 0000000..818020c
--- /dev/null
+++ b/src/plugin/index-arbitrary/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-arbitrary" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/index-arbitrary/ivy.xml b/src/plugin/index-arbitrary/ivy.xml
new file mode 100644
index 0000000..9feb1e1
--- /dev/null
+++ b/src/plugin/index-arbitrary/ivy.xml
@@ -0,0 +1,39 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/index-arbitrary/plugin.xml b/src/plugin/index-arbitrary/plugin.xml
new file mode 100644
index 0000000..f79188a
--- /dev/null
+++ b/src/plugin/index-arbitrary/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="index-arbitrary"
+ name="Index Arbitrary"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="index-arbitrary.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+
+ <extension id="org.apache.nutch.indexer.arbitrary"
+ name="Nutch arbitrary data indexer"
+ point="org.apache.nutch.indexer.IndexingFilter">
+ <implementation id="ArbitraryIndexingFilter"
+ class="org.apache.nutch.indexer.arbitrary.ArbitraryIndexingFilter"/>
+ </extension>
+
+</plugin>
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
new file mode 100644
index 0000000..7677ef7
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/ArbitraryIndexingFilter.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.lang.Class;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds arbitrary searchable fields to a document from the class and method
+ * the user identifies in the config. The user supplies the name of the field
+ * to add with the class and method names that supply the value.
+ *
+ * Example:<br><br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.function.count&lt;/name&gt;<br>
+ * &lt;value&gt;1&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.fieldName.0&lt;/name&gt;<br>
+ * &lt;value&gt;advisors&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.className.0&lt;/name&gt;<br>
+ * &lt;value&gt;com.example.arbitrary.AdvisorCalculator&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.constructorArgs.0&lt;/name&gt;<br>
+ * &lt;value&gt;Kirk&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.methodName.0&lt;/name&gt;<br>
+ * &lt;value&gt;countAdvisors&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * &lt;property&gt;<br>
+ * &lt;name&gt;index.arbitrary.methodArgs.0&lt;/name&gt;<br>
+ * &lt;value&gt;Spock,McCoy&lt;/value&gt;<br>
+ * &lt;/property&gt;<br>
+ * <br>
+ * To set more than one arbitrary field value,
+ * increment {@code index.arbitrary.function.count} and
+ * repeat the rest of these blocks with successive int values
+ * appended to the property names, e.g. fieldName.1, methodName.1, etc.
+ */
+public class ArbitraryIndexingFilter implements IndexingFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ /** How many arbitrary field definitions to set. */
+ private int arbitraryAddsCount = 0;
+
+ /** The name of the field to insert/overwrite in the NutchDocument */
+ private String fieldName;
+
+ /** The fully-qualified class name of the custom class to use for the
+ * new field. This class must be in the Nutch runtime classpath,
+ * e.g., nutch/lib/ directory. */
+ private String className;
+
+ /** The String values to pass to the custom class constructor. The plugin
+ * will add the document url as the first argument in className's
+ * String[] args. */
+ private String[] userConstrArgs;
+
+ /** The array where the plugin copies the url and the userConstrArgs
+ * to create the instance of className. */
+ private String[] constrArgs;
+
+ /** The name of the method in the custom class to call. Its return value
+ * will become the value of fieldName in the NutchDocument. */
+ private String methodName;
+
+ /** The String values of the arguments to methodName. It's up to the
+ * developer of className to do any casts/conversions from String to
+ * another class in the code of className. */
+ private String[] methodArgs;
+
+ /** The result that returns from methodName. The plugin will set the value
+ * of fieldName to this. */
+ private Object result;
+
+ /** Optional flag to determine whether to overwrite the existing value in the
+ * NutchDocument fieldName if this is set to true. Default behavior is to
+ * add the value from calling methodName to existing values for fieldName. */
+ private boolean overwrite = false;
+
+ /** Hadoop Configuration object to pass these values into the plugin. */
+ private Configuration conf;
+
+ /**
+ * The {@link ArbitraryIndexingFilter} filter object uses reflection
+ * to instantiate the configured class and invoke the configured method.
+ * It requires a few configuration settings for adding arbitrary fields
+ * and values to the NutchDocument as searchable fields.
+ * See {@code index.arbitrary.function.count}, and (possibly multiple
+ * instances when {@code index.arbitrary.function.count} &gt; 1) of the following
+ * {@code index.arbitrary.fieldName}.<em>index</em>,
+ * {@code index.arbitrary.className}.<em>index</em>,
+ * {@code index.arbitrary.constructorArgs}.<em>index</em>,
+ * {@code index.arbitrary.methodName}.<em>index</em>, and
+ * {@code index.arbitrary.methodArgs}.<em>index</em>
+ * in nutch-default.xml or nutch-site.xml where <em>index</em> ranges from 0
+ * to {@code index.arbitrary.function.count} - 1.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param parse
+ * The relevant {@link Parse} object passing through the filter
+ * @param url
+ * URL to be filtered by the user-specified class
+ * @param datum
+ * The {@link CrawlDatum} entry
+ * @param inlinks
+ * The {@link Inlinks} containing anchor text
+ * @return filtered NutchDocument
+ */
+ @Override
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+ Class<?> theClass = null;
+ Method theMethod = null;
+ Constructor<?> theConstructor = null;
+ Object instance = null;
+
+ // Nothing can be resolved without both a document and its URL
+ if (doc == null) {
+ LOG.debug("In filter() where doc is null for url == {}",
+ String.valueOf(url));
+ return doc;
+ } else if (url == null) {
+ LOG.debug("In filter() where url is null. Nothing to do.");
+ return doc;
+ }
+
+ int cfgCounter = 0;
+ while (cfgCounter < arbitraryAddsCount) {
+ setIndexedConf(conf,cfgCounter);
+ cfgCounter++; // advance first so the 'continue' statements below cannot loop forever
+ try {
+ theClass = Class.forName(className);
+ if (methodArgs.length > 0) {
+ theMethod = theClass.getDeclaredMethod(methodName,String[].class);
+ } else {
+ theMethod = theClass.getMethod(methodName);
+ }
+ theConstructor = theClass.getDeclaredConstructor(String[].class);
+ } catch (Exception e) {
+ LOG.error("Exception preparing reflection tasks. className was {}",
+ String.valueOf(className), e);
+ continue; // skip this field: never reuse the Method/Constructor left over from an earlier field
+ }
+ try {
+ constrArgs = new String[userConstrArgs.length + 1];
+ constrArgs[0] = url.toString();
+ System.arraycopy(userConstrArgs,0,constrArgs,1,userConstrArgs.length);
+ instance = theConstructor.newInstance(new Object[]{constrArgs});
+ if (methodArgs.length > 0) {
+ result = theMethod.invoke(instance, new Object[]{methodArgs});
+ } else {
+ result = theMethod.invoke(instance);
+ }
+ } catch (Exception e) {
+ LOG.error("Exception in reflection trying to instantiate/invoke. "
+ + "url was {} & className was {}",
+ String.valueOf(url), String.valueOf(className), e);
+ if (constrArgs != null && constrArgs.length > 1) { // [0] is the URL; [1] exists only when user args were configured
+ LOG.error("constrArgs[1] was {}", String.valueOf(constrArgs[1]));
+ }
+ LOG.error("methodName was {}", String.valueOf(methodName));
+ if (methodArgs.length > 0) {
+ LOG.error("methodArgs[0] was {}", String.valueOf(methodArgs[0]));
+ }
+ continue; // skip this field: 'result' still holds the value of an earlier invocation
+ }
+
+ LOG.debug("{}.{}() returned {} for field {}.", className,
+ methodName, String.valueOf(result), String.valueOf(fieldName));
+
+ // If user chose to overwrite, remove existing value
+ if (overwrite) {
+ LOG.debug("overwrite == true for fieldName == {} ", fieldName);
+ if (doc.getFieldNames().contains(fieldName)) {
+ LOG.debug("Removing field '{}' from doc for overwrite", fieldName);
+ doc.removeField(fieldName);
+ }
+ }
+ if (result == null) {
+ LOG.debug("Call to {}.{} returned null", className, methodName);
+ if (overwrite) {
+ LOG.debug("{} has been cleared.", fieldName);
+ }
+ }
+ LOG.debug("Adding value '{}' for field '{}' to doc", result, fieldName);
+ doc.add(fieldName, result);
+ }
+ return doc;
+ }
+
+ /**
+ * Stores the {@link Configuration} and reads {@code index.arbitrary.function.count} into arbitraryAddsCount, passing 1 as the fallback to getInt.
+ */
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ arbitraryAddsCount = conf.getInt("index.arbitrary.function.count",1);
+ LOG.info("Will process the first {} fieldName defs in config.", String.valueOf(arbitraryAddsCount));
+ }
+
+ /**
+ * Loads the properties for one arbitrary field into this filter's working fields; throws RuntimeException if fieldName or className is missing or empty.
+ *
+ * @param conf
+ * The Configuration object holding values for the current arbitrary field.
+ * @param ndx
+ * The ordinal counter value for the current arbitrary field appended to the
+ * base property names in the xml configuration file.
+ */
+ public void setIndexedConf(Configuration conf, int ndx) {
+ LOG.debug("In setIndexedConf() where ndx was passed in as {}", String.valueOf(ndx));
+ fieldName = conf.get("index.arbitrary.fieldName.".concat(String.valueOf(ndx)));
+ LOG.debug("Looking now for index.arbitrary.fieldName.{} which was: {}",
+ String.valueOf(ndx),String.valueOf(fieldName));
+
+ if (fieldName == null || fieldName.isEmpty()) { // compare content; '==' only tested reference identity
+ throw new RuntimeException ("Problem in configuration where the index.arbitrary.fieldName."
+ + String.valueOf(ndx) + " is missing.");
+ }
+
+ className = conf.get("index.arbitrary.className.".concat(String.valueOf(ndx)));
+ if (className == null || className.isEmpty()) { // same content check as for fieldName
+ throw new RuntimeException ("Problem in configuration where the index.arbitrary.className."
+ + String.valueOf(ndx) + " is missing.");
+ }
+
+ userConstrArgs = conf.getTrimmedStrings("index.arbitrary.constructorArgs.".concat(String.valueOf(ndx)));
+ methodName = conf.get("index.arbitrary.methodName.".concat(String.valueOf(ndx)),"");
+ methodArgs = conf.getTrimmedStrings("index.arbitrary.methodArgs.".concat(String.valueOf(ndx)));
+ overwrite = conf.getBoolean("index.arbitrary.overwrite.".concat(String.valueOf(ndx)),false);
+ if (overwrite) {
+ LOG.info("overwrite set == true for processing {}.", fieldName);
+ }
+ }
+
+ /**
+ * Returns the {@link Configuration} most recently stored by {@link #setConf(Configuration)}. */
+ @Override
+ public Configuration getConf() {
+ return this.conf;
+ }
+}
diff --git a/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
new file mode 100644
index 0000000..6e6d475
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/java/org/apache/nutch/indexer/arbitrary/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to add document arbitrary data to the index
+ * from the output of a user-specified class.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
new file mode 100644
index 0000000..52e8939
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Echo.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+
+import java.io.PrintStream;
+
+public class Echo {
+ // Test fixture: hands back the second element of its constructor array unchanged.
+ private static PrintStream out = System.out;
+ private String text;
+
+ public Echo(String[] args) {
+ // args[0] is not used; only args[1] is kept, converted with String.valueOf
+ this.text = String.valueOf(args[1]);
+ }
+
+ public String getText() {
+ return this.text;
+ }
+
+ public static void main(String[] args) {
+ String echoed = new Echo(args).getText();
+ out.println(echoed);
+ }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
new file mode 100644
index 0000000..38875d0
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/Multiplier.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import java.io.PrintStream;
+
+public class Multiplier {
+ private float product = 1;
+ private static PrintStream err = System.err;
+ private static PrintStream out = System.out;
+
+ public Multiplier(String[] args) {
+ // constructor arguments are accepted but not used
+ }
+
+ public String getProduct(String[] args) {
+ // walk from the last argument to the first, folding each into the running product
+ int idx = args.length - 1;
+ try {
+ for (; idx >= 0; idx--) {
+ product *= Float.parseFloat(args[idx]);
+ }
+ } catch (NumberFormatException nfe) {
+ err.println("NumberFormatException while trying to parse " + String.valueOf(args[idx]));
+ }
+ return String.valueOf(product);
+ }
+
+ public static void main(String[] args) {
+ String answer = new Multiplier(args).getProduct(args);
+ out.println(answer);
+ }
+}
diff --git a/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
new file mode 100644
index 0000000..17f31b1
--- /dev/null
+++ b/src/plugin/index-arbitrary/src/test/org/apache/nutch/indexer/arbitrary/TestArbitraryIndexingFilter.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.arbitrary;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import java.lang.invoke.MethodHandles;
+
+/**
+ * Tests that the index-arbitrary filter can add a new field with an arbitrary
+ * value, supplement an existing field with an arbitrary value, and overwrite
+ * an existing field with an arbitrary value where it takes the arbitrary value
+ * from some POJO outside the normal Nutch codebase.
+ *
+ * @author Joe Gilvary
+ */
+
+public class TestArbitraryIndexingFilter {
+
+ Configuration conf;
+ Inlinks inlinks;
+ ParseImpl parse;
+ CrawlDatum crawlDatum;
+ Text url;
+ ArbitraryIndexingFilter filter;
+ NutchDocument doc;
+
+ @Before
+ public void setUp() throws Exception { // builds fresh filter() inputs before each test; conf, filter and doc are created inside the tests
+ parse = new ParseImpl();
+ url = new Text("http://nutch.apache.org/index.html");
+ crawlDatum = new CrawlDatum();
+ inlinks = new Inlinks();
+ }
+
+
+ /**
+ * Configures a single arbitrary field "foo" backed by the Echo POJO and checks that filter() adds the configured constructor argument to an initially empty document.
+ *
+ * @throws Exception declared by the signature; exceptions thrown by filter() are caught and reported through Assert.fail
+ */
+ @Test
+ public void testAddingNewField() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set("index.arbitrary.function.count","1");
+ conf.set("index.arbitrary.fieldName.0","foo");
+ conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo");
+ conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar");
+ conf.set("index.arbitrary.methodName.0","getText");
+
+ filter = new ArbitraryIndexingFilter();
+ Assert.assertNotNull("No filter exists for testAddingNewField",filter);
+
+ filter.setConf(conf);
+ doc = new NutchDocument();
+
+ try {
+ filter.filter(doc, parse, url, crawlDatum, inlinks);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.fail(e.getMessage());
+ }
+
+ Assert.assertNotNull(doc);
+ Assert.assertFalse("test if doc is not empty", doc.getFieldNames()
+ .isEmpty());
+ Assert.assertTrue("test if doc has new field with arbitrary value", doc.getField("foo")
+ .getValues().contains("Arbitrary text to add - bar"));
+ }
+
+ /**
+ * Configures two arbitrary fields (Echo for "foo", Multiplier for the pre-populated "description", overwrite unset) and checks that "description" keeps "irrational" and gains "-3.14".
+ *
+ * @throws Exception declared by the signature; exceptions thrown by filter() are caught and reported through Assert.fail
+ */
+ @Test
+ public void testSupplementExistingField() throws Exception {
+
+ conf = NutchConfiguration.create();
+ conf.set("index.arbitrary.function.count","2");
+ conf.set("index.arbitrary.fieldName.0","foo");
+ conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo");
+ conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar");
+ conf.set("index.arbitrary.methodName.0","getText");
+ conf.set("index.arbitrary.fieldName.1","description");
+ conf.set("index.arbitrary.className.1","org.apache.nutch.indexer.arbitrary.Multiplier");
+ conf.set("index.arbitrary.constructorArgs.1","");
+ conf.set("index.arbitrary.methodName.1","getProduct");
+ conf.set("index.arbitrary.methodArgs.1","-1,3.14");
+
+ filter = new ArbitraryIndexingFilter();
+ Assert.assertNotNull("No filter exists for testSupplementExistingField", filter);
+
+ filter.setConf(conf);
+
+ doc = new NutchDocument();
+ Assert.assertNotNull("doc doesn't exist", doc);
+
+ doc.add("description","irrational");
+
+ Assert.assertFalse("doc is empty", doc.getFieldNames().isEmpty());
+
+ Assert.assertEquals("field description does not have exactly one value", 1,
+ doc.getField("description").getValues().size());
+
+ Assert.assertTrue("field description does not have initial value 'irrational'",
+ doc.getField("description").getValues().contains("irrational"));
+
+ try {
+ filter.filter(doc, parse, url, crawlDatum, inlinks);
+ } catch (Exception e) {
+ e.printStackTrace();
+ Assert.fail(e.getMessage());
+ }
+
+ Assert.assertTrue("doc doesn't have new field with arbitrary value",
+ doc.getField("foo").getValues()
+ .contains("Arbitrary text to add - bar"));
+
+ Assert.assertEquals("field description does not have 2 values", 2,
+ doc.getField("description").getValues().size());
+
+ Assert.assertTrue("field description original value gone", doc.getField("description")
+ .getValues().contains("irrational"));
+
+ Assert.assertTrue("field description missing new value", doc.getField("description")
+ .getValues().contains("-3.14"));
+ }
+
+
+ /**
+ * Configures three arbitrary fields, of which only the third ("philosopher", Echo) sets overwrite=true, and checks that the existing value "Socrates" is replaced by the single value "Popeye".
+ *
+ * @throws Exception declared by the signature; exceptions thrown by filter() are caught and reported through Assert.fail
+ */
+ @Test
+ public void testOverwritingExistingField() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set("index.arbitrary.function.count","3");
+ conf.set("index.arbitrary.fieldName.0","foo");
+ conf.set("index.arbitrary.className.0","org.apache.nutch.indexer.arbitrary.Echo");
+ conf.set("index.arbitrary.constructorArgs.0","Arbitrary text to add - bar");
+ conf.set("index.arbitrary.methodName.0","getText");
+ conf.set("index.arbitrary.fieldName.1","description");
+ conf.set("index.arbitrary.className.1","org.apache.nutch.indexer.arbitrary.Multiplier");
+ conf.set("index.arbitrary.methodArgs.1","-1,3.14159265");
+ conf.set("index.arbitrary.methodName.1","getProduct");
+ conf.set("index.arbitrary.fieldName.2","philosopher");
+ conf.set("index.arbitrary.className.2","org.apache.nutch.indexer.arbitrary.Echo");
+ conf.set("index.arbitrary.constructorArgs.2","Popeye");
+ conf.set("index.arbitrary.methodName.2","getText");
+ conf.set("index.arbitrary.overwrite.2","true");
+
+ filter = new ArbitraryIndexingFilter();
+ Assert.assertNotNull("No filter exists for testOverwritingExistingField",filter);
+
+ filter.setConf(conf);
+ Assert.assertNotNull("conf does not exist",conf);
+
+ doc = new NutchDocument();
+
+ Assert.assertNotNull("doc does not exist",doc);
+
+ doc.add("description","irrational");
+ doc.add("philosopher","Socrates");
+
+ Assert.assertEquals("field description does not have exactly one value", 1, doc.getField("description")
+ .getValues().size());
+
+ Assert.assertEquals("field philosopher does not have exactly one value", 1, doc.getField("philosopher")
+ .getValues().size());
+
+ Assert.assertTrue("field description does not have initial value 'irrational'", doc.getField("description")
+ .getValues().contains("irrational"));
+
+ Assert.assertTrue("field philosopher does not have initial value 'Socrates'", doc.getField("philosopher")
+ .getValues().contains("Socrates"));
+
+ try {
+ filter.filter(doc, parse, url, crawlDatum, inlinks);
+ } catch (Exception e) {
+ e.printStackTrace(System.out);
+ Assert.fail(e.getMessage());
+ }
+
+ Assert.assertNotNull(doc);
+
+ Assert.assertEquals("field philosopher no longer has only one value", 1, doc.getField("philosopher")
+ .getValues().size());
+
+ Assert.assertFalse("field philosopher's original value 'Socrates' NOT overwritten", doc.getField("philosopher")
+ .getValues().contains("Socrates"));
+
+ Assert.assertTrue("field philosopher does not have new value 'Popeye'", doc.getField("philosopher")
+ .getValues().contains("Popeye"));
+ }
+}