Merge branch 'master' into NUTCH-2732
diff --git a/.gitignore b/.gitignore
index 4992c58..3e3bad2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,11 +7,18 @@
runtime/
logs/
/bin/
-.classpath
-.naivebayes-model.crc
-.project
ivy/ivy-2.3.0.jar
ivy/ivy-2.4.0.jar
ivy/ivy-2.5.0-rc1.jar
naivebayes-model
+.naivebayes-model.crc
.gitconfig
+# Eclipse configuration files
+.classpath
+.project
+.settings/
+.externalToolBuilders/
+# IntelliJ IDEA configuration files
+.idea/
+*.iml
+*.swp
diff --git a/CHANGES.txt b/CHANGES.txt
index 12f5aad..ff564d3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,6 +4,9 @@
Comments
+ - schema.xml has been moved to indexer-solr plugin directory. This file is provided as a
+ reference/guide for Solr users (NUTCH-2654)
+
Breaking Changes
- The value of crawl.gen.delay is now read in milliseconds as stated in the description
diff --git a/README.md b/README.md
index 3aed205..ad091f1 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,20 @@
11. git push -u <your git username> NUTCH-xxx
12. git pull-request
```
+
+IDE setup
+=========
+
+Generate Eclipse project files
+
+```
+ant eclipse
+```
+
+and follow the instructions in [Importing existing projects](https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm).
+
+IntelliJ IDEA users can also import Eclipse projects using the ["Eclipser" plugin](https://plugins.jetbrains.com/plugin/7153-eclipser) (see this [migration tutorial](https://www.tutorialspoint.com/intellij_idea/intellij_idea_migrating_from_eclipse.htm)), see also [Importing Eclipse Projects into IntelliJ IDEA](https://www.jetbrains.com/help/idea/migrating-from-eclipse-to-intellij-idea.html#migratingEclipseProject).
+
Export Control
==============
diff --git a/conf/elasticsearch.conf b/conf/elasticsearch.conf
deleted file mode 100644
index c4c73b9..0000000
--- a/conf/elasticsearch.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Settings for Elasticsearch indexer plugin
-# Format: key=value\n
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index 268554a..808e31f 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -115,6 +115,7 @@
<param name="exponential.backoff.millis" value="100"/>
<param name="exponential.backoff.retries" value="10"/>
<param name="bulk.close.timeout" value="600"/>
+ <!--<param name="options" value="key1=value1,key2=value2"/>-->
</parameters>
<mapping>
<copy>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index fd201c7..c5359bc 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2073,376 +2073,6 @@
</description>
</property>
-<!-- solr index properties -->
-
-<property>
- <name>solr.server.type</name>
- <value>http</value>
- <description>
- Specifies the SolrServer implementation to use. This is a string value
- of one of the following 'cloud', 'concurrent', 'http' or 'lb'.
- The values represent CloudSolrServer, ConcurrentUpdateSolrServer,
- HttpSolrServer or LBHttpSolrServer respectively.
- </description>
-</property>
-
-<property>
- <name>solr.server.url</name>
- <value>http://127.0.0.1:8983/solr/</value>
- <description>
- Defines the Solr URL into which data should be indexed using the
- indexer-solr plugin.
- </description>
-</property>
-
-<property>
- <name>solr.zookeeper.url</name>
- <value></value>
- <description>
- Defines the Zookeeper URL which is an essential setting to be used
- when using SolrCloud. This should be a fully qualified URL similar to
- the property provided within 'solr.server.url' above.
- </description>
-</property>
-
-<property>
- <name>solr.loadbalance.urls</name>
- <value></value>
- <description>
- A comma-separated value representing the Solr servers to be used when
- initiating LBHttpSolrServer as the SolrServer implementation.
- </description>
-</property>
-
-<property>
- <name>solr.mapping.file</name>
- <value>solrindex-mapping.xml</value>
- <description>
- Defines the name of the file that will be used in the mapping of internal
- Nutch field names to solr index fields as specified in the target Solr schema.
- </description>
-</property>
-
-<property>
- <name>solr.commit.size</name>
- <value>250</value>
- <description>
- Defines the number of documents to send to Solr in a single update batch.
- Decrease when handling very large documents to prevent Nutch from running
- out of memory. NOTE: It does not explicitly trigger a server side commit.
- </description>
-</property>
-
-<property>
- <name>solr.commit.index</name>
- <value>true</value>
- <description>
- When closing the indexer, trigger a commit to the Solr server.
- </description>
-</property>
-
-<property>
- <name>solr.auth</name>
- <value>false</value>
- <description>
- Whether to enable HTTP basic authentication for communicating with Solr.
- Use the solr.auth.username and solr.auth.password properties to configure
- your credentials.
- </description>
-</property>
-
-<!-- Elasticsearch properties -->
-
-<property>
- <name>elastic.host</name>
- <value></value>
- <description>Comma-separated list of hostnames to send documents to using
- TransportClient. Either host and port must be defined or cluster.</description>
-</property>
-
-<property>
- <name>elastic.port</name>
- <value>9300</value>
- <description>The port to connect to using TransportClient.</description>
-</property>
-
-<property>
- <name>elastic.cluster</name>
- <value></value>
- <description>The cluster name to discover. Either host and port must be defined
- or cluster.</description>
-</property>
-
-<property>
- <name>elastic.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.size</name>
- <value>2500500</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.millis</name>
- <value>100</value>
- <description>Initial delay for the BulkProcessor's exponential backoff policy.
- </description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.retries</name>
- <value>10</value>
- <description>Number of times the BulkProcessor's exponential backoff policy
- should retry bulk operations.</description>
-</property>
-
-<property>
- <name>elastic.bulk.close.timeout</name>
- <value>600</value>
- <description>Number of seconds allowed for the BulkProcessor to complete its
- last operation.</description>
-</property>
-
-<!-- RabbitMQ indexer properties -->
-
-<property>
- <name>rabbitmq.indexer.server.uri</name>
- <value>amqp://guest:guest@localhost:5672/</value>
- <description>
- URI with connection parameters in the form
- amqp://username:password@hostname:port/virtualHost
- Where:
- username is the username for RabbitMQ server.
- password is the password for RabbitMQ server.
- hostname is where the RabbitMQ server is running.
- port is where the RabbitMQ server is listening.
- virtualHost is where where the exchange is and the user has access.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding</name>
- <value>false</value>
- <description>
- Whether the relationship between an exchange and a queue is created
- automatically. Default "false".
-
- NOTE: Binding between exchanges is not supported.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding.arguments</name>
- <value></value>
- <description>
- Arguments used in binding. It must have the form key1=value1,key2=value2.
- This value is only used when the exchange's type is headers and
- the value of 'rabbitmq.indexer.binding' property is true. In other cases
- is ignored.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.name</name>
- <value></value>
- <description>
- Name for the exchange where the messages will be sent. Default "".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.options</name>
- <value>type=direct,durable=true</value>
- <description>
- Options used when the exchange is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "type=direct,durable=true".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.name</name>
- <value>nutch.queue</value>
- <description>
- Name of the queue used to create the binding. Default "nutch.queue".
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.options</name>
- <value>durable=true,exclusive=false,auto-delete=false</value>
- <description>
- Options used when the queue is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "durable=true,exclusive=false,auto-delete=false".
-
- It must have the form
- durable={durable},exclusive={exclusive},auto-delete={auto-delete},arguments={arguments}
- where:
- durable is true or false
- exclusive is true or false
- auto-delete is true or false
- arguments must be the for {key1:value1;key2:value2}
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.routingkey</name>
- <value></value>
- <description>
- The routing key used to publish messages to specific queues.
- It is only used when the exchange type is "topic" or "direct". Default
- is the value of 'rabbitmq.indexer.queue.name' property.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.mode</name>
- <value>multiple</value>
- <description>
- "single" if a message contains only one document. In this case a header
- with the action (write, update or delete) will be added.
- "multiple" if a message contains all documents. Default "multiple".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.size</name>
- <value>250</value>
- <description>
- Amount of documents to send into each message if the value of
- 'rabbitmq.indexer.commit.mode' property is "multiple". Default "250".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.static</name>
- <value></value>
- <description>
- Headers to add to each message. It must have the form key1=value1,key2=value2.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.dynamic</name>
- <value></value>
- <description>
- Document's fields to add as headers to each message. It must have the form field1,field2.
- </description>
-</property>
-
-<!--elasticsearch rest properties-->
-<property>
- <name>elastic.rest.host</name>
- <value></value>
- <description>
- The hostname or a list of comma separated hostnames to send documents to
- using Elasticsearch Jest. Both host and port must be defined.
- </description>
-</property>
-
-<property>
- <name>elastic.rest.port</name>
- <value></value>
- <description>The port to connect to using Elasticsearch Jest.</description>
-</property>
-
-<property>
- <name>elastic.rest.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.index.languages</name>
- <value></value>
- <description>
- A list of strings denoting the supported languages (e.g. `en,de,fr,it`).
- If this value is empty all documents will be sent to index ${elastic.rest.index}.
- If not empty the Rest client will distribute documents in different indices based on their `lang` property.
- Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.index.separator}${lang} (e.g. `nutch_de`).
- Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink} (e.g. `nutch_others`).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.separator</name>
- <value>_</value>
- <description>
- Default value is `_`. Is used only if `elastic.rest.index.languages` is defined to build the index name (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${lang}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.sink</name>
- <value>others</value>
- <description>
- Default value is `others`. Is used only if `elastic.rest.index.languages` is defined to build the index name where to store documents with unsupported languages (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.type</name>
- <value>doc</value>
- <description>Default type to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.size</name>
- <value>26214400</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.rest.https</name>
- <value>false</value>
- <description>
- "true" to enable https, "false" to disable https
- If you've disabled http access (by forcing https), be sure to
- set this to true, otherwise you might get "connection reset by peer".
- </description>
-</property>
-
-<property>
- <name>elastic.rest.user</name>
- <value></value>
- <description>Username for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.password</name>
- <value></value>
- <description>Password for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.trustallhostnames</name>
- <value>false</value>
- <description>
- "true" to trust elasticsearch server's certificate even if its listed domain name does not
- match the domain they are hosted on
- "false" to check if the elasticsearch server's certificate's listed domain is the same domain
- that it is hosted on, and if it doesn't, then fail to index
- (only used when https is enabled)
- </description>
-</property>
-
<!-- subcollection properties -->
<property>
diff --git a/conf/solrindex-mapping.xml b/conf/solrindex-mapping.xml
deleted file mode 100644
index 2b581bb..0000000
--- a/conf/solrindex-mapping.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<mapping>
- <!-- Simple mapping of fields created by Nutch IndexingFilters
- to fields defined (and expected) in Solr schema.xml.
-
- Any fields in NutchDocument that match a name defined
- in field/@source will be renamed to the corresponding
- field/@dest.
- Additionally, if a field name (before mapping) matches
- a copyField/@source then its values will be copied to
- the corresponding copyField/@dest.
-
- uniqueKey has the same meaning as in Solr schema.xml
- and defaults to "id" if not defined.
- -->
- <fields>
- <field dest="content" source="content"/>
- <field dest="title" source="title"/>
- <field dest="host" source="host"/>
- <field dest="segment" source="segment"/>
- <field dest="boost" source="boost"/>
- <field dest="digest" source="digest"/>
- <field dest="tstamp" source="tstamp"/>
- </fields>
- <uniqueKey>id</uniqueKey>
-</mapping>
diff --git a/docker/Dockerfile b/docker/Dockerfile
index c5ba807..3077d1a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -13,20 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FROM ubuntu:16.04
-MAINTAINER Michael Joyce <joyce@apache.org>
+FROM ubuntu:18.04
+MAINTAINER Apache Nutch Committers <dev@nutch.apache.org>
WORKDIR /root/
# Install dependencies
RUN apt update
-RUN apt install -y ant openssh-server vim telnet git rsync curl openjdk-8-jdk-headless
+RUN apt install -y ant git openjdk-8-jdk-headless
# Set up JAVA_HOME
RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64' >> $HOME/.bashrc
-# Checkout and build the nutch trunk
+# Checkout and build the Nutch master branch (1.x)
RUN git clone https://github.com/apache/nutch.git nutch_source && cd nutch_source && ant runtime
# Convenience symlink to Nutch runtime local
diff --git a/docker/README.md b/docker/README.md
index 36fc852..25d601f 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -14,11 +14,11 @@
Current configuration of this image consists of components:
-* Nutch 1.x
+* Nutch 1.x (branch "master")
## Base Image
-* [ubuntu:14.04](https://registry.hub.docker.com/_/ubuntu/)
+* [ubuntu:18.04](https://hub.docker.com/_/ubuntu/)
## Tips
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index cbfbe0c..18e3871 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -132,46 +132,27 @@
context.write(key, (CrawlDatum) value);
}
else if (value instanceof HostDatum) {
- // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
- // extract urls and emit those
-
- // try different combinations of schemes one by one till we get rejection in all cases
- String host = key.toString();
- if((url = filterNormalize("http://" + host + "/")) == null &&
- (url = filterNormalize("https://" + host + "/")) == null &&
- (url = filterNormalize("ftp://" + host + "/")) == null &&
- (url = filterNormalize("file:/" + host + "/")) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
- return;
- }
- // We may wish to use the robots.txt content as the third parameter for .getRobotRules
- BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null);
- List<String> sitemaps = rules.getSitemaps();
-
- if (tryDefaultSitemapXml && sitemaps.size() == 0) {
- sitemaps.add(url + "sitemap.xml");
- }
- for (String sitemap : sitemaps) {
- context.getCounter("Sitemap", "sitemaps_from_hostdb").increment(1);
- sitemap = filterNormalize(sitemap);
- if (sitemap == null) {
- context.getCounter("Sitemap", "filtered_sitemaps_from_hostdb")
- .increment(1);
- } else {
- generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap),
- sitemap, context);
- }
- }
+ generateSitemapsFromHostname(key.toString(), context);
}
else if (value instanceof Text) {
- // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
- if((url = filterNormalize(key.toString())) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
- return;
- }
+ // Input can be sitemap URL or hostname
+ url = key.toString();
+ if (url.startsWith("http://") ||
+ url.startsWith("https://") ||
+ url.startsWith("ftp://") ||
+ url.startsWith("file:/")) {
+ // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
+ if((url = filterNormalize(url)) == null) {
+ context.getCounter("Sitemap", "filtered_records").increment(1);
+ return;
+ }
- context.getCounter("Sitemap", "sitemap_seeds").increment(1);
- generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context);
+ context.getCounter("Sitemap", "sitemap_seeds").increment(1);
+ generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context);
+ } else {
+ LOG.info("generateSitemapsFromHostname: " + key.toString());
+ generateSitemapsFromHostname(key.toString(), context);
+ }
}
} catch (Exception e) {
LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e));
@@ -191,6 +172,43 @@
}
return url;
}
+
+ private void generateSitemapsFromHostname(String host, Context context) {
+ try {
+ // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
+ // extract urls and emit those
+
+ // try different combinations of schemes one by one till we get rejection in all cases
+ String url;
+ if((url = filterNormalize("http://" + host + "/")) == null &&
+ (url = filterNormalize("https://" + host + "/")) == null &&
+ (url = filterNormalize("ftp://" + host + "/")) == null &&
+ (url = filterNormalize("file:/" + host + "/")) == null) {
+ context.getCounter("Sitemap", "filtered_records").increment(1);
+ return;
+ }
+ // We may wish to use the robots.txt content as the third parameter for .getRobotRules
+ BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null);
+ List<String> sitemaps = rules.getSitemaps();
+
+ if (tryDefaultSitemapXml && sitemaps.size() == 0) {
+ sitemaps.add(url + "sitemap.xml");
+ }
+ for (String sitemap : sitemaps) {
+ context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1);
+ sitemap = filterNormalize(sitemap);
+ if (sitemap == null) {
+ context.getCounter("Sitemap", "filtered_sitemaps_from_hostname")
+ .increment(1);
+ } else {
+ generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap),
+ sitemap, context);
+ }
+ }
+ } catch (Exception e) {
+ LOG.warn("Exception for record {} : {}", host, StringUtils.stringifyException(e));
+ }
+ }
private void generateSitemapUrlDatum(Protocol protocol, String url, Context context) throws Exception {
ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
@@ -399,13 +417,13 @@
if (LOG.isInfoEnabled()) {
long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
- long fromHostDb = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostdb").getValue();
+ long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
- LOG.info("SitemapProcessor: Total sitemaps from HostDb: {}", fromHostDb);
+ LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
@@ -431,7 +449,7 @@
System.err.println("\t<crawldb>\t\tpath to crawldb where the sitemap urls would be injected");
System.err.println("\t-hostdb <hostdb>\tpath of a hostdb. Sitemap(s) from these hosts would be downloaded");
- System.err.println("\t-sitemapUrls <url_dir>\tpath to sitemap urls directory");
+ System.err.println("\t-sitemapUrls <url_dir>\tpath to directory with sitemap urls or hostnames");
System.err.println("\t-threads <threads>\tNumber of threads created per mapper to fetch sitemap urls (default: 8)");
System.err.println("\t-force\t\t\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
System.err.println("\t-noStrict\t\tBy default Sitemap parser rejects invalid urls. '-noStrict' disables that.");
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
index a646510..d272841 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
@@ -17,13 +17,14 @@
package org.apache.nutch.indexwriter.elastic;
public interface ElasticConstants {
- public static final String HOSTS = "host";
- public static final String PORT = "port";
- public static final String CLUSTER = "cluster";
- public static final String INDEX = "index";
- public static final String MAX_BULK_DOCS = "max.bulk.docs";
- public static final String MAX_BULK_LENGTH = "max.bulk.size";
- public static final String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
- public static final String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
- public static final String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String HOSTS = "host";
+ String PORT = "port";
+ String CLUSTER = "cluster";
+ String INDEX = "index";
+ String MAX_BULK_DOCS = "max.bulk.docs";
+ String MAX_BULK_LENGTH = "max.bulk.size";
+ String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
+ String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
+ String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String OPTIONS = "options";
}
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index a82beae..ee31527 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -17,7 +17,6 @@
package org.apache.nutch.indexwriter.elastic;
import java.lang.invoke.MethodHandles;
-import java.io.BufferedReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.AbstractMap;
@@ -143,16 +142,17 @@
Settings.Builder settingsBuilder = Settings.builder();
- BufferedReader reader = new BufferedReader(
- config.getConfResourceAsReader("elasticsearch.conf"));
- String line;
- String[] parts;
- while ((line = reader.readLine()) != null) {
- if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
- parts = line.trim().split("=");
+ String options = parameters.get(ElasticConstants.OPTIONS);
- if (parts.length == 2) {
- settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ if (options != null) {
+ String[] lines = options.trim().split(",");
+ for (String line : lines) {
+ if (StringUtils.isNotBlank(line)) {
+ String[] parts = line.trim().split("=");
+
+ if (parts.length == 2) {
+ settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ }
}
}
}
@@ -168,8 +168,8 @@
// Prefer TransportClient
if (hosts != null && port > 1) {
- @SuppressWarnings("resource")
- TransportClient transportClient = new PreBuiltTransportClient(settings);
+ @SuppressWarnings("resource") TransportClient transportClient = new PreBuiltTransportClient(
+ settings);
for (String host : hosts)
transportClient.addTransportAddress(
diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md
index 1d60acc..a5305ca 100644
--- a/src/plugin/indexer-solr/README.md
+++ b/src/plugin/indexer-solr/README.md
@@ -37,4 +37,8 @@
commitSize | Defines the number of documents to send to Solr in a single update batch. Decrease when handling very large documents to prevent Nutch from running out of memory.<br>**Note**: It does not explicitly trigger a server side commit. | 1000
auth | Whether to enable HTTP basic authentication for communicating with Solr. Use the `username` and `password` properties to configure your credentials. | false
username | The username of Solr server. | username
-password | The password of Solr server. | password
\ No newline at end of file
+password | The password of Solr server. | password
+
+## schema.xml
+
+In the distribution of the indexer-solr plugin there is a schema.xml file available. Nutch does not use this file, but it is provided to Solr users as a reference/guide to facilitate the configuration of Solr.
\ No newline at end of file
diff --git a/conf/schema.xml b/src/plugin/indexer-solr/schema.xml
similarity index 100%
rename from conf/schema.xml
rename to src/plugin/indexer-solr/schema.xml