Merge branch 'master' into NUTCH-2732
diff --git a/.gitignore b/.gitignore
index 4992c58..3e3bad2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,11 +7,18 @@
runtime/
logs/
/bin/
-.classpath
-.naivebayes-model.crc
-.project
ivy/ivy-2.3.0.jar
ivy/ivy-2.4.0.jar
ivy/ivy-2.5.0-rc1.jar
naivebayes-model
+.naivebayes-model.crc
.gitconfig
+# Eclipse configuration files
+.classpath
+.project
+.settings/
+.externalToolBuilders/
+# IntelliJ IDEA configuration files
+.idea/
+*.iml
+*.swp
diff --git a/CHANGES.txt b/CHANGES.txt
index 12f5aad..ff564d3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,6 +4,9 @@
Comments
+ - schema.xml has been moved to indexer-solr plugin directory. This file is provided as a
+ reference/guide for Solr users (NUTCH-2654)
+
Breaking Changes
- The value of crawl.gen.delay is now read in milliseconds as stated in the description
diff --git a/README.md b/README.md
index 3aed205..ad091f1 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,20 @@
11. git push -u <your git username> NUTCH-xxx
12. git pull-request
```
+
+IDE setup
+=========
+
+Generate Eclipse project files
+
+```
+ant eclipse
+```
+
+and follow the instructions in [Importing existing projects](https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm).
+
+IntelliJ IDEA users can also import Eclipse projects using the ["Eclipser" plugin](https://plugins.jetbrains.com/plugin/7153-eclipser) (see this [migration tutorial](https://www.tutorialspoint.com/intellij_idea/intellij_idea_migrating_from_eclipse.htm)), see also [Importing Eclipse Projects into IntelliJ IDEA](https://www.jetbrains.com/help/idea/migrating-from-eclipse-to-intellij-idea.html#migratingEclipseProject).
+
Export Control
==============
diff --git a/conf/elasticsearch.conf b/conf/elasticsearch.conf
deleted file mode 100644
index c4c73b9..0000000
--- a/conf/elasticsearch.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Settings for Elasticsearch indexer plugin
-# Format: key=value\n
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index 268554a..808e31f 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -115,6 +115,7 @@
<param name="exponential.backoff.millis" value="100"/>
<param name="exponential.backoff.retries" value="10"/>
<param name="bulk.close.timeout" value="600"/>
+ <!--<param name="options" value="key1=value1,key2=value2"/>-->
</parameters>
<mapping>
<copy>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index fd201c7..c5359bc 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2073,376 +2073,6 @@
</description>
</property>
-<!-- solr index properties -->
-
-<property>
- <name>solr.server.type</name>
- <value>http</value>
- <description>
- Specifies the SolrServer implementation to use. This is a string value
- of one of the following 'cloud', 'concurrent', 'http' or 'lb'.
- The values represent CloudSolrServer, ConcurrentUpdateSolrServer,
- HttpSolrServer or LBHttpSolrServer respectively.
- </description>
-</property>
-
-<property>
- <name>solr.server.url</name>
- <value>http://127.0.0.1:8983/solr/</value>
- <description>
- Defines the Solr URL into which data should be indexed using the
- indexer-solr plugin.
- </description>
-</property>
-
-<property>
- <name>solr.zookeeper.url</name>
- <value></value>
- <description>
- Defines the Zookeeper URL which is an essential setting to be used
- when using SolrCloud. This should be a fully qualified URL similar to
- the property provided within 'solr.server.url' above.
- </description>
-</property>
-
-<property>
- <name>solr.loadbalance.urls</name>
- <value></value>
- <description>
- A comma-separated value representing the Solr servers to be used when
- initiating LBHttpSolrServer as the SolrServer implementation.
- </description>
-</property>
-
-<property>
- <name>solr.mapping.file</name>
- <value>solrindex-mapping.xml</value>
- <description>
- Defines the name of the file that will be used in the mapping of internal
- Nutch field names to solr index fields as specified in the target Solr schema.
- </description>
-</property>
-
-<property>
- <name>solr.commit.size</name>
- <value>250</value>
- <description>
- Defines the number of documents to send to Solr in a single update batch.
- Decrease when handling very large documents to prevent Nutch from running
- out of memory. NOTE: It does not explicitly trigger a server side commit.
- </description>
-</property>
-
-<property>
- <name>solr.commit.index</name>
- <value>true</value>
- <description>
- When closing the indexer, trigger a commit to the Solr server.
- </description>
-</property>
-
-<property>
- <name>solr.auth</name>
- <value>false</value>
- <description>
- Whether to enable HTTP basic authentication for communicating with Solr.
- Use the solr.auth.username and solr.auth.password properties to configure
- your credentials.
- </description>
-</property>
-
-<!-- Elasticsearch properties -->
-
-<property>
- <name>elastic.host</name>
- <value></value>
- <description>Comma-separated list of hostnames to send documents to using
- TransportClient. Either host and port must be defined or cluster.</description>
-</property>
-
-<property>
- <name>elastic.port</name>
- <value>9300</value>
- <description>The port to connect to using TransportClient.</description>
-</property>
-
-<property>
- <name>elastic.cluster</name>
- <value></value>
- <description>The cluster name to discover. Either host and port must be defined
- or cluster.</description>
-</property>
-
-<property>
- <name>elastic.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.max.bulk.size</name>
- <value>2500500</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.millis</name>
- <value>100</value>
- <description>Initial delay for the BulkProcessor's exponential backoff policy.
- </description>
-</property>
-
-<property>
- <name>elastic.exponential.backoff.retries</name>
- <value>10</value>
- <description>Number of times the BulkProcessor's exponential backoff policy
- should retry bulk operations.</description>
-</property>
-
-<property>
- <name>elastic.bulk.close.timeout</name>
- <value>600</value>
- <description>Number of seconds allowed for the BulkProcessor to complete its
- last operation.</description>
-</property>
-
-<!-- RabbitMQ indexer properties -->
-
-<property>
- <name>rabbitmq.indexer.server.uri</name>
- <value>amqp://guest:guest@localhost:5672/</value>
- <description>
- URI with connection parameters in the form
- amqp://username:password@hostname:port/virtualHost
- Where:
- username is the username for RabbitMQ server.
- password is the password for RabbitMQ server.
- hostname is where the RabbitMQ server is running.
- port is where the RabbitMQ server is listening.
- virtualHost is where where the exchange is and the user has access.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding</name>
- <value>false</value>
- <description>
- Whether the relationship between an exchange and a queue is created
- automatically. Default "false".
-
- NOTE: Binding between exchanges is not supported.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.binding.arguments</name>
- <value></value>
- <description>
- Arguments used in binding. It must have the form key1=value1,key2=value2.
- This value is only used when the exchange's type is headers and
- the value of 'rabbitmq.indexer.binding' property is true. In other cases
- is ignored.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.name</name>
- <value></value>
- <description>
- Name for the exchange where the messages will be sent. Default "".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.exchange.options</name>
- <value>type=direct,durable=true</value>
- <description>
- Options used when the exchange is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "type=direct,durable=true".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.name</name>
- <value>nutch.queue</value>
- <description>
- Name of the queue used to create the binding. Default "nutch.queue".
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.queue.options</name>
- <value>durable=true,exclusive=false,auto-delete=false</value>
- <description>
- Options used when the queue is created.
- Only used when the value of 'rabbitmq.indexer.binding' property is true.
- Default "durable=true,exclusive=false,auto-delete=false".
-
- It must have the form
- durable={durable},exclusive={exclusive},auto-delete={auto-delete},arguments={arguments}
- where:
- durable is true or false
- exclusive is true or false
- auto-delete is true or false
- arguments must be the for {key1:value1;key2:value2}
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.routingkey</name>
- <value></value>
- <description>
- The routing key used to publish messages to specific queues.
- It is only used when the exchange type is "topic" or "direct". Default
- is the value of 'rabbitmq.indexer.queue.name' property.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.mode</name>
- <value>multiple</value>
- <description>
- "single" if a message contains only one document. In this case a header
- with the action (write, update or delete) will be added.
- "multiple" if a message contains all documents. Default "multiple".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.commit.size</name>
- <value>250</value>
- <description>
- Amount of documents to send into each message if the value of
- 'rabbitmq.indexer.commit.mode' property is "multiple". Default "250".
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.static</name>
- <value></value>
- <description>
- Headers to add to each message. It must have the form key1=value1,key2=value2.
- </description>
-</property>
-
-<property>
- <name>rabbitmq.indexer.headers.dynamic</name>
- <value></value>
- <description>
- Document's fields to add as headers to each message. It must have the form field1,field2.
- </description>
-</property>
-
-<!--elasticsearch rest properties-->
-<property>
- <name>elastic.rest.host</name>
- <value></value>
- <description>
- The hostname or a list of comma separated hostnames to send documents to
- using Elasticsearch Jest. Both host and port must be defined.
- </description>
-</property>
-
-<property>
- <name>elastic.rest.port</name>
- <value></value>
- <description>The port to connect to using Elasticsearch Jest.</description>
-</property>
-
-<property>
- <name>elastic.rest.index</name>
- <value>nutch</value>
- <description>Default index to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.index.languages</name>
- <value></value>
- <description>
- A list of strings denoting the supported languages (e.g. `en,de,fr,it`).
- If this value is empty all documents will be sent to index ${elastic.rest.index}.
- If not empty the Rest client will distribute documents in different indices based on their `lang` property.
- Indices are named with the following schema: ${elastic.rest.index}${elastic.rest.index.separator}${lang} (e.g. `nutch_de`).
- Entries with an unsupported `lang` value will be added to index ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink} (e.g. `nutch_others`).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.separator</name>
- <value>_</value>
- <description>
- Default value is `_`. Is used only if `elastic.rest.index.languages` is defined to build the index name (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${lang}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.index.sink</name>
- <value>others</value>
- <description>
- Default value is `others`. Is used only if `elastic.rest.index.languages` is defined to build the index name where to store documents with unsupported languages (i.e. ${elastic.rest.index}${elastic.rest.index.separator}${elastic.rest.index.sink}).
- </description>
-</property>
-
-<property>
- <name>elastic.rest.type</name>
- <value>doc</value>
- <description>Default type to send documents to.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.docs</name>
- <value>250</value>
- <description>Maximum size of the bulk in number of documents.</description>
-</property>
-
-<property>
- <name>elastic.rest.max.bulk.size</name>
- <value>26214400</value>
- <description>Maximum size of the bulk in bytes.</description>
-</property>
-
-<property>
- <name>elastic.rest.https</name>
- <value>false</value>
- <description>
- "true" to enable https, "false" to disable https
- If you've disabled http access (by forcing https), be sure to
- set this to true, otherwise you might get "connection reset by peer".
- </description>
-</property>
-
-<property>
- <name>elastic.rest.user</name>
- <value></value>
- <description>Username for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.password</name>
- <value></value>
- <description>Password for auth credentials (only used when https is enabled)</description>
-</property>
-
-<property>
- <name>elastic.rest.trustallhostnames</name>
- <value>false</value>
- <description>
- "true" to trust elasticsearch server's certificate even if its listed domain name does not
- match the domain they are hosted on
- "false" to check if the elasticsearch server's certificate's listed domain is the same domain
- that it is hosted on, and if it doesn't, then fail to index
- (only used when https is enabled)
- </description>
-</property>
-
<!-- subcollection properties -->
<property>
diff --git a/conf/solrindex-mapping.xml b/conf/solrindex-mapping.xml
deleted file mode 100644
index 2b581bb..0000000
--- a/conf/solrindex-mapping.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<mapping>
- <!-- Simple mapping of fields created by Nutch IndexingFilters
- to fields defined (and expected) in Solr schema.xml.
-
- Any fields in NutchDocument that match a name defined
- in field/@source will be renamed to the corresponding
- field/@dest.
- Additionally, if a field name (before mapping) matches
- a copyField/@source then its values will be copied to
- the corresponding copyField/@dest.
-
- uniqueKey has the same meaning as in Solr schema.xml
- and defaults to "id" if not defined.
- -->
- <fields>
- <field dest="content" source="content"/>
- <field dest="title" source="title"/>
- <field dest="host" source="host"/>
- <field dest="segment" source="segment"/>
- <field dest="boost" source="boost"/>
- <field dest="digest" source="digest"/>
- <field dest="tstamp" source="tstamp"/>
- </fields>
- <uniqueKey>id</uniqueKey>
-</mapping>
diff --git a/docker/Dockerfile b/docker/Dockerfile
index c5ba807..3077d1a 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -13,20 +13,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FROM ubuntu:16.04
-MAINTAINER Michael Joyce <joyce@apache.org>
+FROM ubuntu:18.04
+MAINTAINER Apache Nutch Committers <dev@nutch.apache.org>
WORKDIR /root/
# Install dependencies
RUN apt update
-RUN apt install -y ant openssh-server vim telnet git rsync curl openjdk-8-jdk-headless
+RUN apt install -y ant git openjdk-8-jdk-headless
# Set up JAVA_HOME
RUN echo 'export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64' >> $HOME/.bashrc
-# Checkout and build the nutch trunk
+# Checkout and build the Nutch master branch (1.x)
RUN git clone https://github.com/apache/nutch.git nutch_source && cd nutch_source && ant runtime
# Convenience symlink to Nutch runtime local
diff --git a/docker/README.md b/docker/README.md
index 36fc852..25d601f 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -14,11 +14,11 @@
Current configuration of this image consists of components:
-* Nutch 1.x
+* Nutch 1.x (branch "master")
## Base Image
-* [ubuntu:14.04](https://registry.hub.docker.com/_/ubuntu/)
+* [ubuntu:18.04](https://hub.docker.com/_/ubuntu/)
## Tips
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index cbfbe0c..18e3871 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -132,46 +132,27 @@
context.write(key, (CrawlDatum) value);
}
else if (value instanceof HostDatum) {
- // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
- // extract urls and emit those
-
- // try different combinations of schemes one by one till we get rejection in all cases
- String host = key.toString();
- if((url = filterNormalize("http://" + host + "/")) == null &&
- (url = filterNormalize("https://" + host + "/")) == null &&
- (url = filterNormalize("ftp://" + host + "/")) == null &&
- (url = filterNormalize("file:/" + host + "/")) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
- return;
- }
- // We may wish to use the robots.txt content as the third parameter for .getRobotRules
- BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null);
- List<String> sitemaps = rules.getSitemaps();
-
- if (tryDefaultSitemapXml && sitemaps.size() == 0) {
- sitemaps.add(url + "sitemap.xml");
- }
- for (String sitemap : sitemaps) {
- context.getCounter("Sitemap", "sitemaps_from_hostdb").increment(1);
- sitemap = filterNormalize(sitemap);
- if (sitemap == null) {
- context.getCounter("Sitemap", "filtered_sitemaps_from_hostdb")
- .increment(1);
- } else {
- generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap),
- sitemap, context);
- }
- }
+ generateSitemapsFromHostname(key.toString(), context);
}
else if (value instanceof Text) {
- // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
- if((url = filterNormalize(key.toString())) == null) {
- context.getCounter("Sitemap", "filtered_records").increment(1);
- return;
- }
+ // Input can be sitemap URL or hostname
+ url = key.toString();
+ if (url.startsWith("http://") ||
+ url.startsWith("https://") ||
+ url.startsWith("ftp://") ||
+ url.startsWith("file:/")) {
+ // For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
+ if((url = filterNormalize(url)) == null) {
+ context.getCounter("Sitemap", "filtered_records").increment(1);
+ return;
+ }
- context.getCounter("Sitemap", "sitemap_seeds").increment(1);
- generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context);
+ context.getCounter("Sitemap", "sitemap_seeds").increment(1);
+ generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context);
+ } else {
+ LOG.info("generateSitemapsFromHostname: " + key.toString());
+ generateSitemapsFromHostname(key.toString(), context);
+ }
}
} catch (Exception e) {
LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e));
@@ -191,6 +172,43 @@
}
return url;
}
+
+ private void generateSitemapsFromHostname(String host, Context context) {
+ try {
+ // For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
+ // extract urls and emit those
+
+ // try different combinations of schemes one by one till we get rejection in all cases
+ String url;
+ if((url = filterNormalize("http://" + host + "/")) == null &&
+ (url = filterNormalize("https://" + host + "/")) == null &&
+ (url = filterNormalize("ftp://" + host + "/")) == null &&
+ (url = filterNormalize("file:/" + host + "/")) == null) {
+ context.getCounter("Sitemap", "filtered_records").increment(1);
+ return;
+ }
+ // We may wish to use the robots.txt content as the third parameter for .getRobotRules
+ BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null);
+ List<String> sitemaps = rules.getSitemaps();
+
+ if (tryDefaultSitemapXml && sitemaps.size() == 0) {
+ sitemaps.add(url + "sitemap.xml");
+ }
+ for (String sitemap : sitemaps) {
+ context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1);
+ sitemap = filterNormalize(sitemap);
+ if (sitemap == null) {
+ context.getCounter("Sitemap", "filtered_sitemaps_from_hostname")
+ .increment(1);
+ } else {
+ generateSitemapUrlDatum(protocolFactory.getProtocol(sitemap),
+ sitemap, context);
+ }
+ }
+ } catch (Exception e) {
+ LOG.warn("Exception for record {} : {}", host, StringUtils.stringifyException(e));
+ }
+ }
private void generateSitemapUrlDatum(Protocol protocol, String url, Context context) throws Exception {
ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
@@ -399,13 +417,13 @@
if (LOG.isInfoEnabled()) {
long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
- long fromHostDb = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostdb").getValue();
+ long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();
LOG.info("SitemapProcessor: Total records rejected by filters: {}", filteredRecords);
- LOG.info("SitemapProcessor: Total sitemaps from HostDb: {}", fromHostDb);
+ LOG.info("SitemapProcessor: Total sitemaps from host name: {}", fromHostname);
LOG.info("SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);
LOG.info("SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);
LOG.info("SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
@@ -431,7 +449,7 @@
System.err.println("\t<crawldb>\t\tpath to crawldb where the sitemap urls would be injected");
System.err.println("\t-hostdb <hostdb>\tpath of a hostdb. Sitemap(s) from these hosts would be downloaded");
- System.err.println("\t-sitemapUrls <url_dir>\tpath to sitemap urls directory");
+ System.err.println("\t-sitemapUrls <url_dir>\tpath to directory with sitemap urls or hostnames");
System.err.println("\t-threads <threads>\tNumber of threads created per mapper to fetch sitemap urls (default: 8)");
System.err.println("\t-force\t\t\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
System.err.println("\t-noStrict\t\tBy default Sitemap parser rejects invalid urls. '-noStrict' disables that.");
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
index a646510..d272841 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticConstants.java
@@ -17,13 +17,14 @@
package org.apache.nutch.indexwriter.elastic;
public interface ElasticConstants {
- public static final String HOSTS = "host";
- public static final String PORT = "port";
- public static final String CLUSTER = "cluster";
- public static final String INDEX = "index";
- public static final String MAX_BULK_DOCS = "max.bulk.docs";
- public static final String MAX_BULK_LENGTH = "max.bulk.size";
- public static final String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
- public static final String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
- public static final String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String HOSTS = "host";
+ String PORT = "port";
+ String CLUSTER = "cluster";
+ String INDEX = "index";
+ String MAX_BULK_DOCS = "max.bulk.docs";
+ String MAX_BULK_LENGTH = "max.bulk.size";
+ String EXPONENTIAL_BACKOFF_MILLIS = "exponential.backoff.millis";
+ String EXPONENTIAL_BACKOFF_RETRIES = "exponential.backoff.retries";
+ String BULK_CLOSE_TIMEOUT = "bulk.close.timeout";
+ String OPTIONS = "options";
}
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index a82beae..ee31527 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -17,7 +17,6 @@
package org.apache.nutch.indexwriter.elastic;
import java.lang.invoke.MethodHandles;
-import java.io.BufferedReader;
import java.io.IOException;
import java.net.InetAddress;
import java.util.AbstractMap;
@@ -143,16 +142,17 @@
Settings.Builder settingsBuilder = Settings.builder();
- BufferedReader reader = new BufferedReader(
- config.getConfResourceAsReader("elasticsearch.conf"));
- String line;
- String[] parts;
- while ((line = reader.readLine()) != null) {
- if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
- parts = line.trim().split("=");
+ String options = parameters.get(ElasticConstants.OPTIONS);
- if (parts.length == 2) {
- settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ if (options != null) {
+ String[] lines = options.trim().split(",");
+ for (String line : lines) {
+ if (StringUtils.isNotBlank(line)) {
+ String[] parts = line.trim().split("=");
+
+ if (parts.length == 2) {
+ settingsBuilder.put(parts[0].trim(), parts[1].trim());
+ }
}
}
}
@@ -168,8 +168,8 @@
// Prefer TransportClient
if (hosts != null && port > 1) {
- @SuppressWarnings("resource")
- TransportClient transportClient = new PreBuiltTransportClient(settings);
+ @SuppressWarnings("resource") TransportClient transportClient = new PreBuiltTransportClient(
+ settings);
for (String host : hosts)
transportClient.addTransportAddress(
diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md
index 1d60acc..a5305ca 100644
--- a/src/plugin/indexer-solr/README.md
+++ b/src/plugin/indexer-solr/README.md
@@ -37,4 +37,8 @@
commitSize | Defines the number of documents to send to Solr in a single update batch. Decrease when handling very large documents to prevent Nutch from running out of memory.<br>**Note**: It does not explicitly trigger a server side commit. | 1000
auth | Whether to enable HTTP basic authentication for communicating with Solr. Use the `username` and `password` properties to configure your credentials. | false
username | The username of Solr server. | username
-password | The password of Solr server. | password
\ No newline at end of file
+password | The password of Solr server. | password
+
+## schema.xml
+
+In the distribution of the indexer-solr plugin there is a schema.xml file available. Nutch does not use this file, but it is provided to Solr users as a reference/guide to facilitate the configuration of Solr.
\ No newline at end of file
diff --git a/conf/schema.xml b/src/plugin/indexer-solr/schema.xml
similarity index 100%
rename from conf/schema.xml
rename to src/plugin/indexer-solr/schema.xml