Merge pull request #444 from r0ann3l/NUTCH-2688
NUTCH-2688: Unify the licence headers
diff --git a/build.xml b/build.xml
index 18f659a..04a36a6 100644
--- a/build.xml
+++ b/build.xml
@@ -192,6 +192,7 @@
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+ <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
<packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -688,6 +689,7 @@
<packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+ <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
<packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
<packageset dir="${plugins.dir}/indexer-solr/src/java"/>
<packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -1088,6 +1090,7 @@
<source path="${plugins.dir}/indexer-elastic-rest/src/java/"/>
<source path="${plugins.dir}/indexer-elastic/src/java/" />
<source path="${plugins.dir}/indexer-elastic/src/test/" />
+ <source path="${plugins.dir}/indexer-kafka/src/java/" />
<source path="${plugins.dir}/indexer-rabbit/src/java/" />
<source path="${plugins.dir}/indexer-solr/src/java/" />
<source path="${plugins.dir}/language-identifier/src/java/" />
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index eaa5870..268554a 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -161,4 +161,21 @@
<remove />
</mapping>
</writer>
+ <writer id="indexer_kafka_1" class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter">
+ <parameters>
+ <param name="host" value=""/> <!-- required: KafkaIndexWriter.open() throws while this is blank -->
+ <param name="port" value="9092"/> <!-- joined with host as host:port for the producer's bootstrap servers; 9092 is also the code default -->
+ <param name="topic" value=""/> <!-- topic every producer record is created for -->
+ <param name="key.serializer" value="org.apache.kafka.common.serialization.ByteArraySerializer"/> <!-- same value as the code default -->
+ <param name="value.serializer" value="org.apache.kafka.connect.json.JsonSerializer"/> <!-- same value as the code default -->
+ <param name="max.doc.count" value="100"/> <!-- number of buffered documents at which the writer commits -->
+ </parameters>
+ <mapping>
+ <copy>
+ <field source="title" dest="search"/>
+ </copy>
+ <rename />
+ <remove />
+ </mapping>
+ </writer>
</writers>
diff --git a/default.properties b/default.properties
index bb987d9..a3bc0cf 100644
--- a/default.properties
+++ b/default.properties
@@ -198,6 +198,7 @@
org.apache.nutch.indexwriter.elastic*:\
org.apache.nutch.indexwriter.elasticrest*:\
org.apache.nutch.indexwriter.rabbit*:\
+ org.apache.nutch.indexwriter.kafka*:\
org.apache.nutch.indexwriter.solr*
#
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 52826bb..12e0483 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -139,11 +139,9 @@
<exclude org="org.json"/>
</dependency>
-
<!-- RabbitMQ dependencies -->
<dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />
-
<!--Added Because of Elasticsearch JEST client-->
<!--TODO refactor these to indexer-elastic-rest plugin somehow, currently doesn't resolve correctly-->
<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9"/>
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d8826e8..2592357 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -54,6 +54,7 @@
<ant dir="indexer-dummy" target="deploy"/>
<ant dir="indexer-elastic" target="deploy"/>
<ant dir="indexer-elastic-rest" target="deploy"/>
+ <ant dir="indexer-kafka" target="deploy"/>
<ant dir="indexer-rabbit" target="deploy"/>
<ant dir="indexer-solr" target="deploy"/>
<ant dir="language-identifier" target="deploy"/>
@@ -191,6 +192,7 @@
<ant dir="indexer-dummy" target="clean"/>
<ant dir="indexer-elastic" target="clean"/>
<ant dir="indexer-elastic-rest" target="clean"/>
+ <ant dir="indexer-kafka" target="clean"/>
<ant dir="indexer-rabbit" target="clean"/>
<ant dir="indexer-solr" target="clean"/>
<ant dir="language-identifier" target="clean"/>
diff --git a/src/plugin/indexer-kafka/build-ivy.xml b/src/plugin/indexer-kafka/build-ivy.xml
new file mode 100644
index 0000000..0932dfc
--- /dev/null
+++ b/src/plugin/indexer-kafka/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+  <!-- Stand-alone Ivy bootstrap: fetches Ivy if necessary, then retrieves this plugin's dependencies into lib/. -->
+  <property name="ivy.install.version" value="2.1.0"/>
+  <!-- Load the environment as env.*; without this env.IVY_HOME is never set and the condition below cannot fire. -->
+  <property environment="env"/>
+  <condition property="ivy.home" value="${env.IVY_HOME}">
+    <isset property="env.IVY_HOME"/>
+  </condition>
+  <property name="ivy.home" value="${user.home}/.ant"/>
+  <property name="ivy.checksums" value=""/>
+  <property name="ivy.jar.dir" value="${ivy.home}/lib"/>
+  <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar"/>
+
+  <target name="download-ivy" unless="offline">
+    <mkdir dir="${ivy.jar.dir}"/>
+    <!-- download Ivy so that no special installation is needed; Maven Central is served over HTTPS only -->
+    <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+         dest="${ivy.jar.file}" usetimestamp="true"/>
+  </target>
+
+  <target name="init-ivy" depends="download-ivy">
+    <!-- try to load ivy here from ivy home, in case the user has not already dropped
+         it into ant's lib dir (note that the latter copy will always take precedence).
+         We will not fail as long as local lib dir exists (it may be empty) and
+         ivy is in at least one of ant's lib dir or the local lib dir. -->
+    <path id="ivy.lib.path">
+      <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+    </path>
+    <taskdef resource="org/apache/ivy/ant/antlib.xml"
+             uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+  </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>
diff --git a/src/plugin/indexer-kafka/build.xml b/src/plugin/indexer-kafka/build.xml
new file mode 100644
index 0000000..c2f8078
--- /dev/null
+++ b/src/plugin/indexer-kafka/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="jar-core">
+
+ <import file="../build-plugin.xml"/> <!-- the default target jar-core is not defined in this file, so it has to come from this import -->
+
+</project>
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
new file mode 100644
index 0000000..26f143e
--- /dev/null
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../../ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+
+ <dependencies> <!-- the jar names listed in this plugin's plugin.xml embed these revisions (kafka_2.12-1.1.0.jar, connect-json-1.1.0.jar) -->
+ <dependency org="org.apache.kafka" name="kafka_2.12" rev="1.1.0"/>
+ <dependency org="org.apache.kafka" name="connect-json" rev="1.1.0"/>
+ </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/indexer-kafka/plugin.xml b/src/plugin/indexer-kafka/plugin.xml
new file mode 100644
index 0000000..c5cc21c
--- /dev/null
+++ b/src/plugin/indexer-kafka/plugin.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin id="indexer-kafka" name="KafkaIndexWriter" version="1.0.0"
+ provider-name="nutch.apache.org">
+
+ <runtime> <!-- libraries declared for this plugin; file names carry the dependency revision -->
+ <library name="indexer-kafka.jar"> <!-- presumably the jar built from this plugin's own sources; the only library exporting classes -->
+ <export name="*"/>
+ </library>
+ <library name="kafka_2.12-1.1.0.jar"/> <!-- declared directly in ivy.xml (rev 1.1.0) -->
+ <library name="connect-json-1.1.0.jar"/> <!-- declared directly in ivy.xml (rev 1.1.0) -->
+ <library name="connect-api-1.1.0.jar"/> <!-- this and the following jars are not declared in ivy.xml; presumably transitive dependencies - TODO re-check names when a revision changes -->
+ <library name="jackson-annotations-2.9.0.jar"/>
+ <library name="jackson-core-2.9.4.jar"/>
+ <library name="jackson-databind-2.9.4.jar"/>
+ <library name="jopt-simple-5.0.4.jar"/>
+ <library name="kafka-clients-1.1.0.jar"/>
+ <library name="lz4-java-1.4.jar"/>
+ <library name="metrics-core-2.2.0.jar"/>
+ <library name="scala-library-2.12.4.jar"/>
+ <library name="scala-logging_2.12-3.7.2.jar"/>
+ <library name="scala-reflect-2.12.4.jar"/>
+ <library name="snappy-java-1.1.7.1.jar"/>
+ <library name="zkclient-0.10.jar"/>
+ <library name="zookeeper-3.4.10.jar"/>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.indexer.kafka"
+ name="Kafka Index Writer"
+ point="org.apache.nutch.indexer.IndexWriter">
+ <implementation id="KafkaIndexWriter"
+ class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter"/>
+ </extension>
+
+</plugin>
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
new file mode 100644
index 0000000..f722382
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+/** Names of the index-writers.xml parameters read by the Kafka index writer. */
+public interface KafkaConstants {
+  // Fields of an interface are implicitly public, static and final.
+  String HOST = "host";
+  String PORT = "port";
+
+  String KEY_SERIALIZER = "key.serializer";
+  String VALUE_SERIALIZER = "value.serializer";
+  String TOPIC = "topic";
+  String MAX_DOC_COUNT = "max.doc.count";
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
new file mode 100644
index 0000000..1702004
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import com.fasterxml.jackson.databind.JsonNode;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Sends Nutch documents to a configured Kafka Cluster
+ */
+public class KafkaIndexWriter implements IndexWriter {
+ public static Logger LOG = LoggerFactory.getLogger(KafkaIndexWriter.class);
+
+ private org.apache.kafka.clients.producer.Producer<String, JsonNode> producer;
+ private ProducerRecord<String, JsonNode> data;
+
+ private Configuration config;
+
+ private int port = -1;
+ private String host = null;
+ private String valueSerializer = null;
+ private String keySerializer = null;
+ private String topic = null;
+ private int maxDocCount = -1;
+
+ private String jsonString = null;
+ private JsonNode json = null;
+
+ private List<ProducerRecord<String, JsonNode>> inputDocs = null;
+
+ @Override
+ public void open(Configuration job, String name) throws IOException {
+ //Implementation not required
+ }
+
+ @Override
+ public void open(IndexWriterParams params) throws IOException {
+ host = params.get(KafkaConstants.HOST);
+ port = params.getInt(KafkaConstants.PORT, 9092);
+
+ keySerializer = params.get(KafkaConstants.KEY_SERIALIZER,
+ "org.apache.kafka.common.serialization.ByteArraySerializer");
+ valueSerializer = params.get(KafkaConstants.VALUE_SERIALIZER,
+ "org.apache.kafka.connect.json.JsonSerializer");
+ topic = params.get(KafkaConstants.TOPIC);
+ maxDocCount = params.getInt(KafkaConstants.MAX_DOC_COUNT, 100);
+
+ inputDocs = new ArrayList<ProducerRecord<String, JsonNode>>(maxDocCount);
+
+ if (StringUtils.isBlank(host)) {
+ String message = "Missing host. It should be set in index-writers.xml";
+ message += "\n" + describe();
+ LOG.error(message);
+ throw new RuntimeException(message);
+ }
+
+ Properties configProperties = new Properties();
+ configProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
+ host + ":" + port);
+ configProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
+ keySerializer);
+ configProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
+ valueSerializer);
+
+ Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+ producer = new KafkaProducer<String, JsonNode>(configProperties);
+ }
+
+ @Override
+ public void write(NutchDocument doc) throws IOException {
+
+ Map<String, Object> source = new HashMap<String, Object>();
+
+ // Loop through all fields of this doc
+ for (String fieldName : doc.getFieldNames()) {
+ Set<String> allFieldValues = new HashSet<String>();
+ for (Object value : doc.getField(fieldName).getValues()) {
+ allFieldValues.add(value.toString());
+ }
+ String[] fieldValues = allFieldValues
+ .toArray(new String[allFieldValues.size()]);
+ source.put(fieldName, fieldValues);
+ }
+ try {
+ jsonString = new ObjectMapper().writeValueAsString(source);
+ json = new ObjectMapper().readTree(jsonString);
+ data = new ProducerRecord<String, JsonNode>(topic, json);
+
+ inputDocs.add(data);
+ if (inputDocs.size() == maxDocCount) {
+ commit();
+ }
+ } catch (NullPointerException e) {
+ LOG.info("Data is empty, all messages have been sent");
+ }
+ }
+
+ @Override
+ public void delete(String key) throws IOException {
+ // Not applicable in Kafka
+ }
+
+ @Override
+ public void update(NutchDocument doc) throws IOException {
+ try {
+ write(doc);
+ } catch (IOException e) {
+ LOG.error(ExceptionUtils.getStackTrace(e));
+ throw e;
+ }
+ }
+
+ @Override
+ public void commit() throws IOException {
+ try {
+ for (ProducerRecord<String, JsonNode> datum : inputDocs) {
+ producer.send(datum);
+ }
+ inputDocs.clear();
+ } catch (NullPointerException e) {
+ LOG.info("All records have been sent to Kakfa on topic {}", topic);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ commit();
+ producer.close();
+ }
+
+ @Override
+ public Map<String, Map.Entry<String, Object>> describe() {
+ Map<String, Map.Entry<String, Object>> properties = new LinkedHashMap<>();
+
+ properties.put(KafkaConstants.HOST,
+ new AbstractMap.SimpleEntry<>(
+ "Location of the host Kafka cluster to connect to using producerConfig",
+ this.host));
+
+ properties.put(KafkaConstants.PORT,
+ new AbstractMap.SimpleEntry<>(
+ "The port to connect to using the producerConfig",
+ this.port));
+
+ properties.put(KafkaConstants.TOPIC,
+ new AbstractMap.SimpleEntry<>(
+ "Default index to attach to documents",
+ this.topic));
+
+ properties.put(KafkaConstants.KEY_SERIALIZER,
+ new AbstractMap.SimpleEntry<>(
+ "instruct how to turn the key object the user provides with their ProducerRecord into bytes",
+ this.keySerializer));
+
+ properties.put(KafkaConstants.VALUE_SERIALIZER,
+ new AbstractMap.SimpleEntry<>(
+ "instruct how to turn the value object the user provides with their ProducerRecord into bytes",
+ this.valueSerializer));
+
+ properties.put(KafkaConstants.MAX_DOC_COUNT,
+ new AbstractMap.SimpleEntry<>(
+ "Maximum number of documents before a commit is forced",
+ this.maxDocCount));
+ return properties;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ config = conf;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return config;
+ }
+
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
new file mode 100644
index 0000000..b720872
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin to produce JSON messages to <a href="https://kafka.apache.org/">Kafka</a>.
+ */
+package org.apache.nutch.indexwriter.kafka;