Merge pull request #444 from r0ann3l/NUTCH-2688

NUTCH-2688: Unify the licence headers
diff --git a/build.xml b/build.xml
index 18f659a..04a36a6 100644
--- a/build.xml
+++ b/build.xml
@@ -192,6 +192,7 @@
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+      <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
       <packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -688,6 +689,7 @@
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+      <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
       <packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -1088,6 +1090,7 @@
         <source path="${plugins.dir}/indexer-elastic-rest/src/java/"/>
         <source path="${plugins.dir}/indexer-elastic/src/java/" />
         <source path="${plugins.dir}/indexer-elastic/src/test/" />
+        <source path="${plugins.dir}/indexer-kafka/src/java/" />
         <source path="${plugins.dir}/indexer-rabbit/src/java/" />
         <source path="${plugins.dir}/indexer-solr/src/java/" />
         <source path="${plugins.dir}/language-identifier/src/java/" />
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index eaa5870..268554a 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -161,4 +161,21 @@
       <remove />
     </mapping>
   </writer>
+  <writer id="indexer_kafka_1" class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter">
+    <parameters>
+      <param name="host" value=""/>
+      <param name="port" value="9092"/>
+      <param name="topic" value=""/>
+      <param name="key.serializer" value="org.apache.kafka.common.serialization.ByteArraySerializer"/>
+      <param name="value.serializer" value="org.apache.kafka.connect.json.JsonSerializer"/>
+      <param name="max.doc.count" value="100"/>
+    </parameters>
+    <mapping>
+      <copy>
+        <field source="title" dest="search"/>
+      </copy>
+      <rename />
+      <remove />
+    </mapping>
+  </writer>
 </writers>
diff --git a/default.properties b/default.properties
index bb987d9..a3bc0cf 100644
--- a/default.properties
+++ b/default.properties
@@ -198,6 +198,7 @@
    org.apache.nutch.indexwriter.elastic*:\
    org.apache.nutch.indexwriter.elasticrest*:\
    org.apache.nutch.indexwriter.rabbit*:\
+   org.apache.nutch.indexwriter.kafka*:\
    org.apache.nutch.indexwriter.solr*
 
 #
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 52826bb..12e0483 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -139,11 +139,9 @@
 			<exclude org="org.json"/>
 		</dependency>
 
-
 		<!-- RabbitMQ dependencies -->
 		<dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />
 
-
 		<!--Added Because of Elasticsearch JEST client-->
 		<!--TODO refactor these to indexer-elastic-rest plugin somehow, currently doesn't resolve correctly-->
 		<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9"/>
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d8826e8..2592357 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -54,6 +54,7 @@
     <ant dir="indexer-dummy" target="deploy"/>
     <ant dir="indexer-elastic" target="deploy"/>
     <ant dir="indexer-elastic-rest" target="deploy"/>
+    <ant dir="indexer-kafka" target="deploy"/>
     <ant dir="indexer-rabbit" target="deploy"/>
     <ant dir="indexer-solr" target="deploy"/>
     <ant dir="language-identifier" target="deploy"/>
@@ -191,6 +192,7 @@
     <ant dir="indexer-dummy" target="clean"/>
     <ant dir="indexer-elastic" target="clean"/>
     <ant dir="indexer-elastic-rest" target="clean"/>
+    <ant dir="indexer-kafka" target="clean"/>
     <ant dir="indexer-rabbit" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>
     <ant dir="language-identifier" target="clean"/>
diff --git a/src/plugin/indexer-kafka/build-ivy.xml b/src/plugin/indexer-kafka/build-ivy.xml
new file mode 100644
index 0000000..0932dfc
--- /dev/null
+++ b/src/plugin/indexer-kafka/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+    <!-- Bootstraps Ivy (download skipped when "offline" is set) and retrieves dependencies into lib/ via the default target deps-jar. -->
+    <property name="ivy.install.version" value="2.1.0"/>
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+        <isset property="env.IVY_HOME"/>
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant"/>
+    <property name="ivy.checksums" value=""/>
+    <property name="ivy.jar.dir" value="${ivy.home}/lib"/>
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar"/>
+
+    <target name="download-ivy" unless="offline">
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- fetched over HTTPS from repo1.maven.org: Maven Central no longer serves plain-HTTP requests -->
+        <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+        <!-- try to load ivy here from ivy home, in case the user has not already dropped
+                it into ant's lib dir (note that the latter copy will always take precedence).
+                We will not fail as long as local lib dir exists (it may be empty) and
+                ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+    <target name="deps-jar" depends="init-ivy">
+        <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+    </target>
+
+</project>
diff --git a/src/plugin/indexer-kafka/build.xml b/src/plugin/indexer-kafka/build.xml
new file mode 100644
index 0000000..c2f8078
--- /dev/null
+++ b/src/plugin/indexer-kafka/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="jar-core">
+    <!-- Defines no targets of its own: everything, including the default "jar-core", comes from the imported shared plugin build file. -->
+    <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
new file mode 100644
index 0000000..26f143e
--- /dev/null
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+      <license name="Apache 2.0"/>
+      <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+      <description>
+          Apache Nutch
+      </description>
+  </info>
+  <!-- Configurations are not declared here; they are taken from the included ivy-configurations.xml. -->
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+  <!-- NOTE(review): kafka_2.12 appears to pull in broker-side jars (Scala, ZooKeeper; see plugin.xml) although the writer only imports org.apache.kafka.clients.producer classes. Confirm whether a smaller client-only dependency would suffice. -->
+  <dependencies>
+      <dependency org="org.apache.kafka" name="kafka_2.12" rev="1.1.0"/>
+      <dependency org="org.apache.kafka" name="connect-json" rev="1.1.0"/>
+  </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/indexer-kafka/plugin.xml b/src/plugin/indexer-kafka/plugin.xml
new file mode 100644
index 0000000..c5cc21c
--- /dev/null
+++ b/src/plugin/indexer-kafka/plugin.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-kafka" name="KafkaIndexWriter" version="1.0.0"
+        provider-name="nutch.apache.org">
+    <!-- Runtime libraries: only indexer-kafka.jar is exported. The remaining jar names embed versions (1.1.0 for the Kafka artifacts, as in ivy.xml). NOTE(review): presumably they must be kept in sync with the resolved dependencies; confirm. -->
+    <runtime>
+        <library name="indexer-kafka.jar">
+            <export name="*"/>
+        </library>
+        <library name="kafka_2.12-1.1.0.jar"/>
+        <library name="connect-json-1.1.0.jar"/>
+        <library name="connect-api-1.1.0.jar"/>
+        <library name="jackson-annotations-2.9.0.jar"/>
+        <library name="jackson-core-2.9.4.jar"/>
+        <library name="jackson-databind-2.9.4.jar"/>
+        <library name="jopt-simple-5.0.4.jar"/>
+        <library name="kafka-clients-1.1.0.jar"/>
+        <library name="lz4-java-1.4.jar"/>
+        <library name="metrics-core-2.2.0.jar"/>
+        <library name="scala-library-2.12.4.jar"/>
+        <library name="scala-logging_2.12-3.7.2.jar"/>
+        <library name="scala-reflect-2.12.4.jar"/>
+        <library name="snappy-java-1.1.7.1.jar"/>
+        <library name="zkclient-0.10.jar"/>
+        <library name="zookeeper-3.4.10.jar"/>
+    </runtime>
+
+    <requires>
+        <import plugin="nutch-extensionpoints"/>
+    </requires>
+    <!-- Registers KafkaIndexWriter as an implementation of the org.apache.nutch.indexer.IndexWriter extension point. -->
+    <extension id="org.apache.nutch.indexer.kafka"
+               name="Kafka Index Writer"
+               point="org.apache.nutch.indexer.IndexWriter">
+        <implementation id="KafkaIndexWriter"
+                        class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter"/>
+    </extension>
+
+</plugin>
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
new file mode 100644
index 0000000..f722382
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+/** Parameter names the Kafka index writer reads from index-writers.xml. */
+public interface KafkaConstants {
+
+  // Interface fields are implicitly public static final.
+  String HOST = "host";
+  String PORT = "port";
+  String TOPIC = "topic";
+  String KEY_SERIALIZER = "key.serializer";
+  String VALUE_SERIALIZER = "value.serializer";
+  String MAX_DOC_COUNT = "max.doc.count";
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
new file mode 100644
index 0000000..1702004
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.kafka.clients.producer.Callback;
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.Producer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import org.apache.kafka.clients.producer.RecordMetadata;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Sends Nutch documents as JSON messages to a configured Kafka cluster.
+ * Documents are buffered and handed to the producer in batches of
+ * max.doc.count records.
+ */
+public class KafkaIndexWriter implements IndexWriter {
+  public static final Logger LOG = LoggerFactory.getLogger(KafkaIndexWriter.class);
+
+  /** JSON mapper shared by all write() calls instead of one per document. */
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+
+  /** Logs asynchronous delivery failures, which send() does not throw. */
+  private static final Callback SEND_CALLBACK = new Callback() {
+    @Override
+    public void onCompletion(RecordMetadata metadata, Exception exception) {
+      if (exception != null) {
+        LOG.error("Failed to send document to Kafka", exception);
+      }
+    }
+  };
+
+  private Producer<String, JsonNode> producer;
+  private Configuration config;
+
+  private int port = -1;
+  private String host = null;
+  private String valueSerializer = null;
+  private String keySerializer = null;
+  private String topic = null;
+  private int maxDocCount = -1;
+
+  /** Records buffered by write() until they are handed to the producer. */
+  private List<ProducerRecord<String, JsonNode>> inputDocs = null;
+
+  @Override
+  public void open(Configuration job, String name) throws IOException {
+    //Implementation not required
+  }
+
+  /**
+   * Reads the writer parameters and creates the Kafka producer.
+   *
+   * @param params parameters configured for this writer in index-writers.xml
+   * @throws RuntimeException if host or topic is missing or blank
+   */
+  @Override
+  public void open(IndexWriterParams params) throws IOException {
+    host = params.get(KafkaConstants.HOST);
+    port = params.getInt(KafkaConstants.PORT, 9092);
+    keySerializer = params.get(KafkaConstants.KEY_SERIALIZER,
+        "org.apache.kafka.common.serialization.ByteArraySerializer");
+    valueSerializer = params.get(KafkaConstants.VALUE_SERIALIZER,
+        "org.apache.kafka.connect.json.JsonSerializer");
+    topic = params.get(KafkaConstants.TOPIC);
+    // A batch size below 1 would break the buffer allocation and flush check.
+    maxDocCount = Math.max(1, params.getInt(KafkaConstants.MAX_DOC_COUNT, 100));
+    inputDocs = new ArrayList<ProducerRecord<String, JsonNode>>(maxDocCount);
+
+    requireNotBlank(host, KafkaConstants.HOST);
+    requireNotBlank(topic, KafkaConstants.TOPIC);
+
+    Properties configProperties = new Properties();
+    configProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
+        host + ":" + port);
+    configProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
+        keySerializer);
+    configProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
+        valueSerializer);
+
+    // NOTE(review): presumably lets Kafka load the serializer classes through
+    // the plugin class loader - confirm; the previous loader is not restored.
+    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+    producer = new KafkaProducer<String, JsonNode>(configProperties);
+  }
+
+  /** Logs and throws a RuntimeException if a required parameter is blank. */
+  private void requireNotBlank(String value, String name) {
+    if (StringUtils.isBlank(value)) {
+      String message = "Missing " + name
+          + ". It should be set in index-writers.xml\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /**
+   * Converts the document to JSON and buffers it; once max.doc.count records
+   * are buffered they are handed to the producer.
+   *
+   * @throws IOException if the writer is not open
+   */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    if (producer == null) {
+      // Fail loudly instead of silently dropping the document.
+      throw new IOException("KafkaIndexWriter is not open");
+    }
+    Map<String, Object> source = new HashMap<String, Object>();
+    // Loop through all fields of this doc
+    for (String fieldName : doc.getFieldNames()) {
+      Set<String> allFieldValues = new HashSet<String>();
+      for (Object value : doc.getField(fieldName).getValues()) {
+        allFieldValues.add(value.toString());
+      }
+      source.put(fieldName,
+          allFieldValues.toArray(new String[allFieldValues.size()]));
+    }
+    JsonNode json = MAPPER.valueToTree(source);
+    inputDocs.add(new ProducerRecord<String, JsonNode>(topic, json));
+    // ">=" so a buffer left over-full by a failed send is still flushed.
+    if (inputDocs.size() >= maxDocCount) {
+      sendBuffered();
+    }
+  }
+
+  @Override
+  public void delete(String key) throws IOException {
+    // Not applicable in Kafka
+  }
+
+  /** Publishes the document exactly like write(). */
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    try {
+      write(doc);
+    } catch (IOException e) {
+      LOG.error(ExceptionUtils.getStackTrace(e));
+      throw e;
+    }
+  }
+
+  /**
+   * Sends all buffered records and waits until the producer has transmitted
+   * them. Does nothing if the writer is not open.
+   */
+  @Override
+  public void commit() throws IOException {
+    if (producer == null) {
+      return;
+    }
+    sendBuffered();
+    producer.flush();
+  }
+
+  /** Hands every buffered record to the producer and empties the buffer. */
+  private void sendBuffered() {
+    for (ProducerRecord<String, JsonNode> datum : inputDocs) {
+      producer.send(datum, SEND_CALLBACK);
+    }
+    inputDocs.clear();
+  }
+
+  /** Commits outstanding records and closes the producer, if one was created. */
+  @Override
+  public void close() throws IOException {
+    if (producer != null) {
+      try {
+        commit();
+      } finally {
+        producer.close();
+        producer = null;
+      }
+    }
+  }
+
+  @Override
+  public Map<String, Map.Entry<String, Object>> describe() {
+    Map<String, Map.Entry<String, Object>> properties = new LinkedHashMap<>();
+
+    properties.put(KafkaConstants.HOST,
+        new AbstractMap.SimpleEntry<>(
+            "Location of the host Kafka cluster to connect to using producerConfig",
+            this.host));
+
+    properties.put(KafkaConstants.PORT,
+        new AbstractMap.SimpleEntry<>(
+            "The port to connect to using the producerConfig",
+            this.port));
+
+    properties.put(KafkaConstants.TOPIC,
+        new AbstractMap.SimpleEntry<>(
+            "Kafka topic the documents are sent to",
+            this.topic));
+
+    properties.put(KafkaConstants.KEY_SERIALIZER,
+        new AbstractMap.SimpleEntry<>(
+            "instruct how to turn the key object the user provides with their ProducerRecord into bytes",
+            this.keySerializer));
+
+    properties.put(KafkaConstants.VALUE_SERIALIZER,
+        new AbstractMap.SimpleEntry<>(
+            "instruct how to turn the value object the user provides with their ProducerRecord into bytes",
+            this.valueSerializer));
+
+    properties.put(KafkaConstants.MAX_DOC_COUNT,
+        new AbstractMap.SimpleEntry<>(
+            "Maximum number of documents before a commit is forced",
+            this.maxDocCount));
+    return properties;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
new file mode 100644
index 0000000..b720872
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin to produce JSON messages to <a href="https://kafka.apache.org/">Kafka</a>.
+ */
+package org.apache.nutch.indexwriter.kafka;