Merge branch 'master' into NUTCH-2654
diff --git a/conf/elasticsearch.conf b/conf/elasticsearch.conf
deleted file mode 100644
index c4c73b9..0000000
--- a/conf/elasticsearch.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Settings for Elasticsearch indexer plugin
-# Format: key=value\n
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index 268554a..808e31f 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -115,6 +115,7 @@
<param name="exponential.backoff.millis" value="100"/>
<param name="exponential.backoff.retries" value="10"/>
<param name="bulk.close.timeout" value="600"/>
+ <!--<param name="options" value="key1=value1,key2=value2"/>-->
</parameters>
<mapping>
<copy>
diff --git a/conf/nutch-default.xml.template b/conf/nutch-default.xml.template
index fd201c7..c5359bc 100644
--- a/conf/nutch-default.xml.template
+++ b/conf/nutch-default.xml.template
@@ -2073,376 +2073,6 @@
</description>
</property>
-<!-- solr index properties -->
-
-<property>
- <name>solr.server.type</name>
- <value>http</value>
- <description>
- Specifies the SolrServer implementation to use. This is a string value
- of one of the following 'cloud', 'concurrent', 'http' or 'lb'.
- The values represent CloudSolrServer, ConcurrentUpdateSolrServer,
- HttpSolrServer or LBHttpSolrServer respectively.
- </description>
-</property>
-
-<property>
- <name>solr.server.url</name>
- <value>http://127.0.0.1:8983/solr/</value>
- <description>
- Defines the Solr URL into which data should be indexed using the
- indexer-solr plugin.
- </description>
-</property>
-
-<property>
- <name>solr.zookeeper.url</name>
- <value></value>
- <description>
- Defines the Zookeeper URL which is an essential setting to be used
- when using SolrCloud. This should be a fully qualified URL similar to
- the property provided within 'solr.server.url' above.
- </description>
-</property>
-
-<property>
- <name>solr.loadbalance.urls</name>
- <value></value>
- <description>
- A comma-separated value representing the Solr servers to be used when
- initiating LBHttpSolrServer as the SolrServer implementation.
- </description>
-</property>
-
-<property>
- <name>solr.mapping.file</name>
- <value>solrindex-mapping.xml</value>
- <description>
- Defines the name of the file that will be used in the mapping of internal
- Nutch field names to solr index fields as specified in the target Solr schema.
- </description>
-</property>
-
-<property>
- <name>solr.commit.size</name>
- <value>250</value>
- <description>
- Defines the number of documents to send to Solr in a single update batch.
- Decrease when handling very large documents to prevent Nutch from running
- out of memory. NOTE: It does not explicitly trigger a server side commit.
- </description>
-</property>
-
-<property>
- <name>solr.commit.index</name>
- <value>true</value>
- <description>
- When closing the indexer, trigger a commit to the Solr server.
- </description>
-</property>
-
-<property>
- <name>solr.auth</name>
- <value>false</value>
- <description>
- Whether to enable HTTP basic authentication for communicating with Solr.
- Use the solr.auth.username and solr.auth.password properties to configure
- your credentials.
- </description>
-</property>
-
-<!-- Elasticsearch properties -->
-
-<property>
- <name>elastic.host</name>
- <value></value>
- <description>Comma-separated list of hostnames to send documents to using
- TransportClient. Either host and port must be defined or cluster.</description>
-</property>
-
-<property>
- <name>elastic.port</name>
- <value>9300</value>
- <description>The port to connect to using TransportClient.</description>
-</property>
-
-<property>
- <name>elastic.cluster</name>
- <value></value>
- <description>The cluster name to discover. Either host and port must be defined
- or cluster.</description>
-</property>
-
-<property>
- <name>elastic.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.size</name>
- <value>2500500</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.millis</name>
- <value>100</value>
- <description>Initial delay for the BulkProcessor's exponential backoff policy.
- </description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.retries</name>
- <value>10</value>
- <description>Number of times the BulkProcessor's exponential backoff policy
- should retry bulk operations.</description>
-</property>
-
-<property>
- <name>elastic.bulk.close.timeout</name>
- <value>600</value>
- <description>Number of seconds allowed for the BulkProcessor to complete its
- last operation.</description>
-</property>
-
-<!-- RabbitMQ indexer properties -->
-
-<property>
- <name>rabbitmq.indexer.server.uri</name>
- <value>amqp://guest:guest@localhost:5672/</value>
- <description>
- URI with connection parameters in the form
- amqp://username:password@hostname:port/virtualHost
- Where:
- username is the username for RabbitMQ server.
- password is the password for RabbitMQ server.
- hostname is where the RabbitMQ server is running.
- port is where the RabbitMQ server is listening.
- virtualHost is where where the exchange is and the user has access.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding</name>
- <value>false</value>
- <description>
- Whether the relationship between an exchange and a queue is created
- automatically. Default "false".
-
- NOTE: Binding between exchanges is not supported.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding.arguments</name>
- <value></value>
- <description>
- Arguments used in binding. It must have the form key1=value1,key2=value2.
- This value is only used when the exchange's type is headers and
- the value of 'rabbitmq.indexer.binding' property is true. In other cases
- is ignored.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.name</name>
- <value></value>
- <description>
- Name for the exchange where the messages will be sent. Default "".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.options</name>
- <value>type=direct,durable=true</value>
- <description>
- Options used when the exchange is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "type=direct,durable=true".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.name</name>
- <value>nutch.queue</value>
- <description>
- Name of the queue used to create the binding. Default "nutch.queue".
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.options</name>
- <value>durable=true,exclusive=false,auto-delete=false</value>
- <description>
- Options used when the queue is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "durable=true,exclusive=false,auto-delete=false".
-
- It must have the form
- durable={durable},exclusive={exclusive},auto-delete={auto-delete},arguments={arguments}
- where:
- durable is true or false
- exclusive is true or false
- auto-delete is true or false
- arguments must be the for {key1:value1;key2:value2}
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.routingkey</name>
- <value></value>
- <description>
- The routing key used to publish messages to specific queues.
- It is only used when the exchange type is "topic" or "direct". Default
- is the value of 'rabbitmq.indexer.queue.name' property.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.mode</name>
- <value>multiple</value>
- <description>
- "single" if a message contains only one document. In this case a header
- with the action (write, update or delete) will be added.
- "multiple" if a message contains all documents. Default "multiple".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.size</name>
- <value>250</value>
- <description>
- Amount of documents to send into each message if the value of
- 'rabbitmq.indexer.commit.mode' property is "multiple". Default "250".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.static</name>
- <value></value>
- <description>
- Headers to add to each message. It must have the form key1=value1,key2=value2.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.dynamic</name>
- <value></value>
- <description>
- Document's fields to add as headers to each message. It must have the form field1,field2.
- </description>
-</property>
-
-<!--elasticsearch rest properties-->
-<property>
- <name>elastic.rest.host</name>
- <value></value>
- <description>
- The hostname or a list of comma separated hostnames to send documents to
- using Elasticsearch Jest. Both host and port must be defined.
- </description>
-</property>
-
-<property>
- <name>elastic.rest.port</name>
- <value></value>
- <description>The port to connect to using Elasticsearch Jest.</description>
-</property>
-
-<property>
- <name>elastic.rest.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.index.languages</name>
- <value></value>
- <description>
- A list of strings denoting the supported languages (e.g. `en,de,fr,it`).
- If this value is empty all documents will be sent to index ${elastic.rest.index}.
- If not empty the Rest client will distribute documents in different indices based on their `lang` property.
- Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.index.separator}${lang} (e.g. `nutch_de`).
- Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink} (e.g. `nutch_others`).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.separator</name>
- <value>_</value>
- <description>
- Default value is `_`. Is used only if `elastic.rest.index.languages` is defined to build the index name (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${lang}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.sink</name>
- <value>others</value>
- <description>
- Default value is `others`. Is used only if `elastic.rest.index.languages` is defined to build the index name where to store documents with unsupported languages (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.type</name>
- <value>doc</value>
- <description>Default type to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.size</name>
- <value>26214400</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.rest.https</name>
- <value>false</value>
- <description>
- "true" to enable https, "false" to disable https
- If you've disabled http access (by forcing https), be sure to
- set this to true, otherwise you might get "connection reset by peer".
- </description>
-</property>
-
-<property>
- <name>elastic.rest.user</name>
- <value></value>
- <description>Username for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.password</name>
- <value></value>
- <description>Password for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.trustallhostnames</name>
- <value>false</value>
- <description>
- "true" to trust elasticsearch server's certificate even if its listed domain name does not
- match the domain they are hosted on
- "false" to check if the elasticsearch server's certificate's listed domain is the same domain
- that it is hosted on, and if it doesn't, then fail to index
- (only used when https is enabled)
- </description>
-</property>
-
<!-- subcollection properties -->
<property>
diff --git a/conf/solrindex-mapping.xml b/conf/solrindex-mapping.xml
deleted file mode 100644
index 2b581bb..0000000
--- a/conf/solrindex-mapping.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<mapping>
- <!-- Simple mapping of fields created by Nutch IndexingFilters
- to fields defined (and expected) in Solr schema.xml.
-
- Any fields in NutchDocument that match a name defined
- in field/@source will be renamed to the corresponding
- field/@dest.
- Additionally, if a field name (before mapping) matches
- a copyField/@source then its values will be copied to
- the corresponding copyField/@dest.
-
- uniqueKey has the same meaning as in Solr schema.xml
- and defaults to "id" if not defined.
- -->
- <fields>
- <field dest="content" source="content"/>
- <field dest="title" source="title"/>
- <field dest="host" source="host"/>
- <field dest="segment" source="segment"/>
- <field dest="boost" source="boost"/>
- <field dest="digest" source="digest"/>
- <field dest="tstamp" source="tstamp"/>
- </fields>
- <uniqueKey>id</uniqueKey>
-</mapping>
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
index a646510..d272841 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
@@ -17,13 +17,14 @@
package org.apache.nutch.indexwriter.elastic;
public interface ElasticConstants {
- public static final String HOSTS = "host";
- public static final String PORT = "port";
- public static final String CLUSTER = "cluster";
- public static final String INDEX = "index";
- public static final String MAX_BULK_DOCS = "max.bulk.docs";
- public static final String MAX_BULK_LENGTH = "max.bulk.size";
- public static final String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
- public static final String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
- public static final String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String HOSTS = "host";
+ String PORT = "port";
+ String CLUSTER = "cluster";
+ String INDEX = "index";
+ String MAX_BULK_DOCS = "max.bulk.docs";
+ String MAX_BULK_LENGTH = "max.bulk.size";
+ String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
+ String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
+ String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String OPTIONS = "options";
}
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index a82beae..ee31527 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -17,7 +17,6 @@
package org.apache.nutch.indexwriter.elastic;
import java.lang.invoke.MethodHandles;
-import java.io.BufferedReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.AbstractMap;
@@ -143,16 +142,17 @@
Settings.Builder settingsBuilder = Settings.builder();
- BufferedReader reader = new BufferedReader(
- config.getConfResourceAsReader("elasticsearch.conf"));
- String line;
- String[] parts;
- while ((line = reader.readLine()) != null) {
- if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
- parts = line.trim().split("=");
+ String options = parameters.get(ElasticConstants.OPTIONS);
- if (parts.length == 2) {
- settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ if (options != null) {
+ String[] lines = options.trim().split(",");
+ for (String line : lines) {
+ if (StringUtils.isNotBlank(line)) {
+ String[] parts = line.trim().split("=");
+
+ if (parts.length == 2) {
+ settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ }
}
}
}
@@ -168,8 +168,8 @@
// Prefer TransportClient
if (hosts != null && port > 1) {
- @SuppressWarnings("resource")
- TransportClient transportClient = new PreBuiltTransportClient(settings);
+ @SuppressWarnings("resource") TransportClient transportClient = new PreBuiltTransportClient(
+ settings);
for (String host : hosts)
transportClient.addTransportAddress(
diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md
index 1d60acc..a5305ca 100644
--- a/src/plugin/indexer-solr/README.md
+++ b/src/plugin/indexer-solr/README.md
@@ -37,4 +37,8 @@
commitSize | Defines the number of documents to send to Solr in a single update batch. Decrease when handling very large documents to prevent Nutch from running out of memory.<br>**Note**: It does not explicitly trigger a server side commit. | 1000
auth | Whether to enable HTTP basic authentication for communicating with Solr. Use the `username` and `password` properties to configure your credentials. | false
username | The username of Solr server. | username
-password | The password of Solr server. | password
\ No newline at end of file
+password | The password of Solr server. | password
+
+## schema.xml
+
+The indexer-solr plugin distribution includes a schema.xml file. Nutch itself does not use this file; it is provided as a reference to help Solr users configure Solr.
\ No newline at end of file
diff --git a/conf/schema.xml b/src/plugin/indexer-solr/schema.xml
similarity index 100%
rename from conf/schema.xml
rename to src/plugin/indexer-solr/schema.xml