Merge branch 'master' into NUTCH-2688
diff --git a/.gitignore b/.gitignore
index 732ca05..61e42e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@
 ivy/ivy-2.4.0.jar
 ivy/ivy-2.5.0-rc1.jar
 naivebayes-model
+.gitconfig
diff --git a/CHANGES.txt b/CHANGES.txt
index 96bd05a..12f5aad 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,9 +6,12 @@
 
 Breaking Changes
 
-The value of crawl.gen.delay is now read in milliseconds as stated in the description
-in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for
-further information.
+    -  The value of crawl.gen.delay is now read in milliseconds as stated in the description
+       in nutch-default.xml. Previously, the value has been read in days, see NUTCH-1842 for
+       further information.
+
+    -  HostDB entries have been moved from Integer to Long in order to accommodate very large
+       hosts. Remove your existing HostDB and recreate it with bin/nutch updatehostdb.
 
 
 Nutch 1.15 Release (25/07/2018)
diff --git a/build.xml b/build.xml
index 65e8f3f..18f659a 100644
--- a/build.xml
+++ b/build.xml
@@ -999,32 +999,6 @@
     </rat:report>
   </target>
 
-  <!-- ================================================================== -->
-  <!-- SONAR targets                                                      -->
-  <!-- ================================================================== -->
-
-  <!-- Define the Sonar task if this hasn't been done in a common script -->
-  <taskdef uri="antlib:org.sonar.ant" resource="org/sonar/ant/antlib.xml">
-    <classpath path="${ant.library.dir}"/>
-    <classpath path="${mysql.library.dir}"/>
-  </taskdef>
-
-  <!-- Add the target -->
-  <target name="sonar" description="--> run SONAR analysis">
-
-    <!-- list of mandatory source directories (required) -->
-    <property name="sonar.sources" value="${src.dir}"/>
-
-    <!-- list of properties (optional) -->
-    <property name="sonar.projectName" value="Nutch Trunk 1.4 Sonar Analysis" />
-    <property name="sonar.binaries" value="${build.dir}/classes" />
-    <property name="sonar.binaries" value="${build.dir}/plugins" />
-    <property name="sonar.tests" value="${test.src.dir}" />
-
-    <sonar:sonar workDir="${base.dir}" key="org.apache.nutch:trunk"
-     version="1.4-SNAPSHOT" xmlns:sonar="antlib:org.sonar.ant"/>
-  </target>
-
 
   <!-- ================================================================== -->
   <!-- Eclipse targets                                                    -->
diff --git a/conf/host-protocol-mapping.txt.template b/conf/host-protocol-mapping.txt.template
new file mode 100644
index 0000000..a09bca6
--- /dev/null
+++ b/conf/host-protocol-mapping.txt.template
@@ -0,0 +1,16 @@
+# This file defines a hostname to protocol plugin mapping. Each line takes a
+# host name followed by a tab, followed by the ID of the protocol plugin. You
+# can find the ID in the protocol plugin's plugin.xml file.
+#
+# <hostname>\t<plugin_id>\n
+# nutch.apache.org	org.apache.nutch.protocol.httpclient.Http
+# tika.apache.org	org.apache.nutch.protocol.http.Http
+#
+# If the requested host is not mapped, Nutch can choose any of the enabled
+# plugins so you can force defaults using:
+#
+# protocol:<protocol>\t<plugin_id>\n
+#
+# This example forces httpclient for all protocols in case the host is not mapped:
+# protocol:http	org.apache.nutch.protocol.httpclient.Http
+# protocol:https	org.apache.nutch.protocol.httpclient.Http
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 00cb845..dadf30d 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -849,14 +849,14 @@
   <value>-1</value>
   <description>The maximum number of urls in a single
   fetchlist.  -1 if unlimited. The urls are counted according
-  to the value of the parameter generator.count.mode.
+  to the value of the parameter generate.count.mode.
   </description>
 </property>
 
 <property>
   <name>generate.count.mode</name>
   <value>host</value>
-  <description>Determines how the URLs are counted for generator.max.count.
+  <description>Determines how the URLs are counted for generate.max.count.
   Default value is 'host' but can be 'domain'. Note that we do not count 
   per IP in the new version of the Generator.
   </description>
@@ -1062,7 +1062,7 @@
 <property>
   <name>fetcher.throughput.threshold.retries</name>
   <value>5</value>
-  <description>The number of times the fetcher.throughput.threshold is allowed to be exceeded.
+  <description>The number of times the fetcher.throughput.threshold.pages is allowed to be exceeded.
   This settings prevents accidental slow downs from immediately killing the fetcher thread.
   </description>
 </property>
@@ -1172,6 +1172,18 @@
 	Publisher implementation specific properties</description>
 </property> 
 
+<property>
+  <name>fetcher.filter.urls</name>
+  <value>false</value>
+  <description>Whether fetcher will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+  <name>fetcher.normalize.urls</name>
+  <value>false</value>
+  <description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
 <!--  any23 plugin properties -->
 
 <property>
@@ -1207,6 +1219,15 @@
   </description>
 </property>
 
+<property>
+  <name>moreIndexingFilter.mapMimeTypes.field</name>
+  <value></value>
+  <description>It's used if moreIndexingFilter.mapMimeTypes is true. Indicates the field
+  where the mapped MIME-type must be written. If it's empty or unset, the content of the field "type"
+  will be replaced by the mapped MIME-type.
+  </description>
+</property>
+
 <!-- AnchorIndexing filter plugin properties -->
 
 <property>
@@ -1360,11 +1381,11 @@
   <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
-  In any case you need at least include the nutch-extensionpoints plugin. By
-  default Nutch includes crawling just HTML and plain text via HTTP,
-  and basic indexing and search plugins. In order to use HTTPS please enable 
-  protocol-httpclient, but be aware of possible intermittent problems with the 
-  underlying commons-httpclient library. Set parsefilter-naivebayes for classification based focused crawler.
+  By default Nutch includes plugins to crawl HTML and various other
+  document formats via HTTP/HTTPS and indexing the crawled content
+  into Solr.  More plugins are available to support more indexing
+  backends, to fetch ftp:// and file:// URLs, for focused crawling,
+  and many other use cases.
   </description>
 </property>
 
@@ -2398,6 +2419,14 @@
   </description>
 </property>
 
+<property>
+  <name>subcollection.case.insensitive</name>
+  <value>false</value>
+  <description>
+  Whether the URL prefixes are to be treated case insensitive.
+  </description>
+</property>
+
 <!-- Headings plugin properties -->
 
 <property>
@@ -2496,10 +2525,11 @@
   <description>
     A String value representing the flavour of Selenium 
     WebDriver() to use. Currently the following options
-    exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomjs' and 'remote'.
+    exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
     If 'remote' is used it is essential to also set correct properties for
     'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host',
-    'selenium.hub.protocol', 'selenium.grid.driver' and 'selenium.grid.binary'.
+    'selenium.hub.protocol', 'selenium.grid.driver', 'selenium.grid.binary'
+    and 'selenium.enable.headless'.
   </description>
 </property>
 
@@ -2531,8 +2561,9 @@
   <name>selenium.grid.driver</name>
   <value>firefox</value>
   <description>A String value representing the flavour of Selenium 
-    WebDriver() used on the selenium grid. Currently the following options
-    exist - 'firefox', 'phantomjs' </description>
+    WebDriver() used on the selenium grid. We must set `selenium.driver` to `remote` first.
+    Currently the following options
+    exist - 'firefox', 'chrome', 'random' </description>
 </property>
 
 <property>
@@ -2543,6 +2574,14 @@
  </description>
 </property>
 
+<!-- headless options for Firefox and Chrome-->
+<property>
+  <name>selenium.enable.headless</name>
+  <value>false</value>
+  <description>A Boolean value representing the headless option
+    for Firefox and Chrome drivers
+  </description>
+</property>
 <!-- selenium firefox configuration; 
      applies to protocol-selenium and protocol-interactiveselenium plugins -->
 <property>
@@ -2593,6 +2632,14 @@
   Currently this option exist for - 'firefox' </description>
 </property>
 
+<!-- selenium chrome configurations -->
+<property>
+  <name>webdriver.chrome.driver</name>
+  <value>/root/chromedriver</value>
+  <description>The path to the ChromeDriver binary</description>
+</property>
+<!-- end of selenium chrome configurations -->
+
 <!-- protocol-interactiveselenium configuration -->
 <property>
   <name>interactiveselenium.handlers</name>
diff --git a/conf/regex-urlfilter.txt.template b/conf/regex-urlfilter.txt.template
index 4319bf1..1448642 100644
--- a/conf/regex-urlfilter.txt.template
+++ b/conf/regex-urlfilter.txt.template
@@ -24,14 +24,14 @@
 # matches, the URL is ignored.
 
 # skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
 
 # skip URLs longer than 2048 characters, see also db.max.outlink.length
 #-^.{2049,}
 
 # skip image and other suffixes we can't yet parse
 # for a more extensive coverage use the urlfilter-suffix plugin
--(?i)\.(gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
+-(?i)\.(?:gif|jpg|png|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|jpeg|bmp|js)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index f1e4a80..52826bb 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -43,11 +43,11 @@
 			<exclude org="com.sun.jmx" name="jmxri" />
 		</dependency-->
 
-		<dependency org="org.apache.commons" name="commons-lang3" rev="3.7" conf="*->default" />
-		<dependency org="org.apache.commons" name="commons-collections4" rev="4.1" conf="*->master" />
-		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.5" conf="*->master" />
+		<dependency org="org.apache.commons" name="commons-lang3" rev="3.8.1" conf="*->default" />
+		<dependency org="org.apache.commons" name="commons-collections4" rev="4.2" conf="*->master" />
+		<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.5.6" conf="*->master" />
 		<dependency org="commons-codec" name="commons-codec" rev="1.11" conf="*->default" />
-		<dependency org="org.apache.commons" name="commons-compress" rev="1.16.1" conf="*->default" />
+		<dependency org="org.apache.commons" name="commons-compress" rev="1.18" conf="*->default" />
 		<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1" />
 		<dependency org="com.tdunning" name="t-digest" rev="3.2" />
 
@@ -65,7 +65,7 @@
 		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
 		<!-- End of Hadoop Dependencies -->
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.19.1" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.20" />
 		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
 
 		<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
@@ -78,14 +78,14 @@
 		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
 
 		<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.1.15" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.1.15" conf="test->default"/>
-		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.5" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.5" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.5" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7" conf="*->default"/>
 
 		<!-- WARC artifacts needed -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index bb3326c..a7d2f11 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -521,7 +521,7 @@
             qs.add(d);
           } else {
             LOG.warn(
-                "Skipping quantile {} not in range in db.stats.score.quantiles: {}",
+                "Skipping quantile {} not in range in db.stats.score.quantiles",
                 s);
           }
         } catch (NumberFormatException e) {
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index 69b86fd..feba08a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -33,6 +33,7 @@
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.StringUtil;
 
 /** Merge new page entries with existing entries. */
 public class CrawlDbReducer extends
@@ -168,7 +169,8 @@
         context.getCounter("CrawlDB status",
             CrawlDatum.getStatusName(old.getStatus())).increment(1);
       } else {
-        LOG.warn("Missing fetch and old value, signature=" + signature);
+        LOG.warn("Missing fetch and old value, signature="
+            + StringUtil.toHexString(signature));
       }
       return;
     }
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 3966be5..e4afbe9 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -254,7 +254,7 @@
           return;
 
         // consider only entries with a score superior to the threshold
-        if (scoreThreshold != Float.NaN && sort < scoreThreshold)
+        if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold)
           return;
 
         // consider only entries with a retry (or fetch) interval lower than
@@ -701,9 +701,9 @@
 
   /**
    * Generate fetchlists in one or more segments. Whether to filter URLs or not
-   * is read from the crawl.generate.filter property in the configuration files.
-   * If the property is not found, the URLs are filtered. Same for the
-   * normalisation.
+   * is read from the &quot;generate.filter&quot; property set for the job from
+   * command-line. If the property is not found, the URLs are filtered. Same for
+   * the normalisation.
    * 
    * @param dbDir
    *          Crawl database directory
diff --git a/src/java/org/apache/nutch/fetcher/QueueFeeder.java b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
index 72009ad..f5fa663 100644
--- a/src/java/org/apache/nutch/fetcher/QueueFeeder.java
+++ b/src/java/org/apache/nutch/fetcher/QueueFeeder.java
@@ -18,10 +18,15 @@
 
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.fetcher.Fetcher.FetcherRun;
+import org.apache.nutch.net.URLFilterException;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -38,6 +43,9 @@
   private FetchItemQueues queues;
   private int size;
   private long timelimit = -1;
+  private URLFilters urlFilters = null;
+  private URLNormalizers urlNormalizers = null;
+  private String urlNormalizerScope = URLNormalizers.SCOPE_DEFAULT;
 
   public QueueFeeder(FetcherRun.Context context,
       FetchItemQueues queues, int size) {
@@ -46,20 +54,43 @@
     this.size = size;
     this.setDaemon(true);
     this.setName("QueueFeeder");
+    Configuration conf = context.getConfiguration();
+    if (conf.getBoolean("fetcher.filter.urls", false)) {
+      urlFilters = new URLFilters(conf);
+    }
+    if (conf.getBoolean("fetcher.normalize.urls", false)) {
+      urlNormalizers = new URLNormalizers(conf, urlNormalizerScope);
+    }
   }
 
   public void setTimeLimit(long tl) {
     timelimit = tl;
   }
 
+  /** Filter and normalize the url */
+  private String filterNormalize(String url) {
+    if (url != null) {
+      try {
+        if (urlNormalizers != null)
+          url = urlNormalizers.normalize(url, urlNormalizerScope); // normalize the url
+        if (urlFilters != null)
+          url = urlFilters.filter(url);
+      } catch (MalformedURLException | URLFilterException e) {
+        LOG.warn("Skipping {}: {}", url, e);
+        url = null;
+      }
+    }
+    return url;
+  }
+
   public void run() {
     boolean hasMore = true;
     int cnt = 0;
     int timelimitcount = 0;
     while (hasMore) {
       if (System.currentTimeMillis() >= timelimit && timelimit != -1) {
-        // enough .. lets' simply
-        // read all the entries from the input without processing them
+        // enough ... let's simply read all the entries from the input without
+        // processing them
         try {
           hasMore = context.nextKeyValue();
           timelimitcount++;
@@ -77,33 +108,43 @@
         // queues are full - spin-wait until they have some free space
         try {
           Thread.sleep(1000);
-        } catch (Exception e) {
+        } catch (InterruptedException e) {
         }
-        ;
         continue;
-      } else {
-        LOG.debug("-feeding {} input urls ...", feed);
-        while (feed > 0 && hasMore) {
-          try {
-            hasMore = context.nextKeyValue();
-            if (hasMore) {
-              /*
-               * Need to copy key and value objects because MapReduce will reuse
-               * the original objects while the objects are stored in the queue.
-               */
-              Text url = new Text((Text)context.getCurrentKey());
-              CrawlDatum datum = new CrawlDatum();
-              datum.set((CrawlDatum)context.getCurrentValue());
-              queues.addFetchItem(url, datum);
-              cnt++;
-              feed--;
+      }
+      LOG.debug("-feeding {} input urls ...", feed);
+      while (feed > 0 && hasMore) {
+        try {
+          hasMore = context.nextKeyValue();
+          if (hasMore) {
+            Text url = context.getCurrentKey();
+            if (urlFilters != null || urlNormalizers != null) {
+              String u = filterNormalize(url.toString());
+              if (u == null) {
+                // filtered or failed to normalize
+                context.getCounter("FetcherStatus", "filtered").increment(1);
+                continue;
+              }
+              url = new Text(u);
             }
-          } catch (IOException e) {
-            LOG.error("QueueFeeder error reading input, record " + cnt, e);
-            return;
-          } catch (InterruptedException e) {
-            LOG.info("QueueFeeder interrupted, exception:", e);
+            /*
+             * Need to copy key and value objects because MapReduce will reuse
+             * the original objects while the objects are stored in the queue.
+             */
+            else {
+              url = new Text(url);
+            }
+            CrawlDatum datum = new CrawlDatum();
+            datum.set((CrawlDatum) context.getCurrentValue());
+            queues.addFetchItem(url, datum);
+            cnt++;
+            feed--;
           }
+        } catch (IOException e) {
+          LOG.error("QueueFeeder error reading input, record " + cnt, e);
+          return;
+        } catch (InterruptedException e) {
+          LOG.info("QueueFeeder interrupted, exception:", e);
         }
       }
     }
diff --git a/src/java/org/apache/nutch/hostdb/HostDatum.java b/src/java/org/apache/nutch/hostdb/HostDatum.java
index fe3b73e..2bc9244 100644
--- a/src/java/org/apache/nutch/hostdb/HostDatum.java
+++ b/src/java/org/apache/nutch/hostdb/HostDatum.java
@@ -30,7 +30,7 @@
 /**
  */
 public class HostDatum implements Writable, Cloneable {
-  protected int failures = 0;
+  protected long failures = 0;
   protected float score = 0;
   protected Date lastCheck = new Date(0);
   protected String homepageUrl = new String();
@@ -38,17 +38,17 @@
   protected MapWritable metaData = new MapWritable();
 
   // Records the number of times DNS look-up failed, may indicate host no longer exists
-  protected int dnsFailures = 0;
+  protected long dnsFailures = 0;
 
   // Records the number of connection failures, may indicate our netwerk being blocked by firewall
-  protected int connectionFailures = 0;
+  protected long connectionFailures = 0;
 
-  protected int unfetched = 0;
-  protected int fetched = 0;
-  protected int notModified = 0;
-  protected int redirTemp = 0;
-  protected int redirPerm = 0;
-  protected int gone = 0;
+  protected long unfetched = 0;
+  protected long fetched = 0;
+  protected long notModified = 0;
+  protected long redirTemp = 0;
+  protected long redirPerm = 0;
+  protected long gone = 0;
 
   public HostDatum() {
   }
@@ -68,15 +68,15 @@
   }
 
   public void resetFailures() {
-    setDnsFailures(0);
-    setConnectionFailures(0);
+    setDnsFailures(0l);
+    setConnectionFailures(0l);
   }
 
-  public void setDnsFailures(Integer dnsFailures) {
+  public void setDnsFailures(Long dnsFailures) {
     this.dnsFailures = dnsFailures;
   }
 
-  public void setConnectionFailures(Integer connectionFailures) {
+  public void setConnectionFailures(Long connectionFailures) {
     this.connectionFailures = connectionFailures;
   }
 
@@ -88,15 +88,15 @@
     this.connectionFailures++;
   }
 
-  public Integer numFailures() {
+  public Long numFailures() {
     return getDnsFailures() + getConnectionFailures();
   }
 
-  public Integer getDnsFailures() {
+  public Long getDnsFailures() {
     return dnsFailures;
   }
 
-  public Integer getConnectionFailures() {
+  public Long getConnectionFailures() {
     return connectionFailures;
   }
 
@@ -120,7 +120,7 @@
     return score;
   }
 
-  public Integer numRecords() {
+  public Long numRecords() {
     return unfetched + fetched + gone + redirPerm + redirTemp + notModified;
   }
 
@@ -140,51 +140,51 @@
     this.homepageUrl = homepageUrl;
   }
 
-  public void setUnfetched(int val) {
+  public void setUnfetched(long val) {
     unfetched = val;
   }
 
-  public int getUnfetched() {
+  public long getUnfetched() {
     return unfetched;
   }
 
-  public void setFetched(int val) {
+  public void setFetched(long val) {
     fetched = val;
   }
 
-  public int getFetched() {
+  public long getFetched() {
     return fetched;
   }
 
-  public void setNotModified(int val) {
+  public void setNotModified(long val) {
     notModified = val;
   }
 
-  public int getNotModified() {
+  public long getNotModified() {
     return notModified;
   }
 
-  public void setRedirTemp(int val) {
+  public void setRedirTemp(long val) {
     redirTemp = val;
   }
 
-  public int getRedirTemp() {
+  public long getRedirTemp() {
     return redirTemp;
   }
 
-  public void setRedirPerm(int val) {
+  public void setRedirPerm(long val) {
     redirPerm = val;
   }
 
-  public int getRedirPerm() {
+  public long getRedirPerm() {
     return redirPerm;
   }
 
-  public void setGone(int val) {
+  public void setGone(long val) {
     gone = val;
   }
 
-  public int getGone() {
+  public long getGone() {
     return gone;
   }
 
@@ -249,15 +249,15 @@
     lastCheck = new Date(in.readLong());
     homepageUrl = Text.readString(in);
 
-    dnsFailures = in.readInt();
-    connectionFailures = in.readInt();
+    dnsFailures = in.readLong();
+    connectionFailures = in.readLong();
 
-    unfetched= in.readInt();
-    fetched= in.readInt();
-    notModified= in.readInt();
-    redirTemp= in.readInt();
-    redirPerm = in.readInt();
-    gone = in.readInt();
+    unfetched= in.readLong();
+    fetched= in.readLong();
+    notModified= in.readLong();
+    redirTemp= in.readLong();
+    redirPerm = in.readLong();
+    gone = in.readLong();
 
     metaData = new org.apache.hadoop.io.MapWritable();
     metaData.readFields(in);
@@ -269,15 +269,15 @@
     out.writeLong(lastCheck.getTime());
     Text.writeString(out, homepageUrl);
 
-    out.writeInt(dnsFailures);
-    out.writeInt(connectionFailures);
+    out.writeLong(dnsFailures);
+    out.writeLong(connectionFailures);
 
-    out.writeInt(unfetched);
-    out.writeInt(fetched);
-    out.writeInt(notModified);
-    out.writeInt(redirTemp);
-    out.writeInt(redirPerm);
-    out.writeInt(gone);
+    out.writeLong(unfetched);
+    out.writeLong(fetched);
+    out.writeLong(notModified);
+    out.writeLong(redirTemp);
+    out.writeLong(redirPerm);
+    out.writeLong(gone);
 
     metaData.write(out);
   }
@@ -285,25 +285,25 @@
   @Override
   public String toString() {
     StringBuilder buf = new StringBuilder();
-    buf.append(Integer.toString(getUnfetched()));
+    buf.append(Long.toString(getUnfetched()));
     buf.append("\t");
-    buf.append(Integer.toString(getFetched()));
+    buf.append(Long.toString(getFetched()));
     buf.append("\t");
-    buf.append(Integer.toString(getGone()));
+    buf.append(Long.toString(getGone()));
     buf.append("\t");
-    buf.append(Integer.toString(getRedirTemp()));
+    buf.append(Long.toString(getRedirTemp()));
     buf.append("\t");
-    buf.append(Integer.toString(getRedirPerm()));
+    buf.append(Long.toString(getRedirPerm()));
     buf.append("\t");
-    buf.append(Integer.toString(getNotModified()));
+    buf.append(Long.toString(getNotModified()));
     buf.append("\t");
-    buf.append(Integer.toString(numRecords()));
+    buf.append(Long.toString(numRecords()));
     buf.append("\t");
-    buf.append(Integer.toString(getDnsFailures()));
+    buf.append(Long.toString(getDnsFailures()));
     buf.append("\t");
-    buf.append(Integer.toString(getConnectionFailures()));
+    buf.append(Long.toString(getConnectionFailures()));
     buf.append("\t");
-    buf.append(Integer.toString(numFailures()));
+    buf.append(Long.toString(numFailures()));
     buf.append("\t");
     buf.append(Float.toString(score));
     buf.append("\t");
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index fe66217..564e5da 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -71,7 +71,7 @@
       } else if (datum.getDnsFailures() > 0) {
         context.getCounter("UpdateHostDb", "rediscovered_host").increment(1);
         datum.setLastCheck();
-        datum.setDnsFailures(0);
+        datum.setDnsFailures(0l);
         LOG.info(host + ": rediscovered_host " + datum);
       } else {
         context.getCounter("UpdateHostDb", "existing_known_host").increment(1);
@@ -86,7 +86,7 @@
         // If the counter is empty we'll initialize with date = today and 1 failure
         if (datum.isEmpty()) {
           datum.setLastCheck();
-          datum.setDnsFailures(1);
+          datum.setDnsFailures(1l);
           context.write(hostText, datum);
           context.getCounter("UpdateHostDb", "new_unknown_host").increment(1);
           LOG.info(host + ": new_unknown_host " + datum);
@@ -108,7 +108,7 @@
         }
 
         context.getCounter("UpdateHostDb",
-          Integer.toString(datum.numFailures()) + "_times_failed").increment(1);
+          Long.toString(datum.numFailures()) + "_times_failed").increment(1);
       } catch (Exception ioe) {
         LOG.warn(StringUtils.stringifyException(ioe));
       }
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index 15a7e37..c239349 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -58,9 +58,6 @@
   protected URLFilters filters = null;
   protected URLNormalizers normalizers = null;
 
-  /**
-   * @param job
-   */
   @Override
   public void setup(Mapper<Text, Writable, Text, NutchWritable>.Context context) {
     Configuration conf = context.getConfiguration();
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index 70ce3eb..862a3c9 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -28,7 +28,7 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.FloatWritable;
-import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.Reducer;
@@ -71,8 +71,6 @@
 
   /**
     * Configures the thread pool and prestarts all resolver threads.
-    *
-    * @param job
     */
   @Override
   public void setup(Reducer<Text, NutchWritable, Text, HostDatum>.Context context) {
@@ -118,10 +116,10 @@
   public void reduce(Text key, Iterable<NutchWritable> values,
     Context context) throws IOException, InterruptedException {
 
-    Map<String,Map<String,Integer>> stringCounts = new HashMap<>();
+    Map<String,Map<String,Long>> stringCounts = new HashMap<>();
     Map<String,Float> maximums = new HashMap<>();
     Map<String,Float> sums = new HashMap<>(); // used to calc averages
-    Map<String,Integer> counts = new HashMap<>(); // used to calc averages
+    Map<String,Long> counts = new HashMap<>(); // used to calc averages
     Map<String,Float> minimums = new HashMap<>();
     Map<String,TDigest> tdigests = new HashMap<String,TDigest>();
     
@@ -146,27 +144,27 @@
         // Set the correct status field
         switch (buffer.getStatus()) {
           case CrawlDatum.STATUS_DB_UNFETCHED:
-            hostDatum.setUnfetched(hostDatum.getUnfetched() + 1);
+            hostDatum.setUnfetched(hostDatum.getUnfetched() + 1L);
             break;
 
           case CrawlDatum.STATUS_DB_FETCHED:
-            hostDatum.setFetched(hostDatum.getFetched() + 1);
+            hostDatum.setFetched(hostDatum.getFetched() + 1L);
             break;
 
           case CrawlDatum.STATUS_DB_GONE:
-            hostDatum.setGone(hostDatum.getGone() + 1);
+            hostDatum.setGone(hostDatum.getGone() + 1L);
             break;
 
           case CrawlDatum.STATUS_DB_REDIR_TEMP:
-            hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1);
+            hostDatum.setRedirTemp(hostDatum.getRedirTemp() + 1L);
             break;
 
           case CrawlDatum.STATUS_DB_REDIR_PERM:
-            hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1);
+            hostDatum.setRedirPerm(hostDatum.getRedirPerm() + 1L);
             break;
 
           case CrawlDatum.STATUS_DB_NOTMODIFIED:
-            hostDatum.setNotModified(hostDatum.getNotModified() + 1);
+            hostDatum.setNotModified(hostDatum.getNotModified() + 1L);
             break;
         }
         
@@ -193,10 +191,10 @@
                 // Does the value exist?
                 if (stringCounts.get(stringFields[i]).containsKey(metadataValue)) {
                   // Yes, increment it
-                  stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1);
+                  stringCounts.get(stringFields[i]).put(metadataValue, stringCounts.get(stringFields[i]).get(metadataValue) + 1L);
                 } else {
                   // Create it!
-                  stringCounts.get(stringFields[i]).put(metadataValue, 1);
+                  stringCounts.get(stringFields[i]).put(metadataValue, 1L);
                 }
               }
             }
@@ -247,11 +245,11 @@
                   if (sums.containsKey(numericFields[i])) {
                     // Increment
                     sums.put(numericFields[i], sums.get(numericFields[i]) + metadataValue);
-                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1);
+                    counts.put(numericFields[i], counts.get(numericFields[i]) + 1L);
                   } else {
                     // Create it!
                     sums.put(numericFields[i], metadataValue);
-                    counts.put(numericFields[i], 1);
+                    counts.put(numericFields[i], 1L);
                   }
                 } catch (Exception e) {
                   LOG.error(e.getMessage() + " when processing values for " + key.toString());
@@ -312,9 +310,9 @@
     }
     
     // Set metadata
-    for (Map.Entry<String, Map<String,Integer>> entry : stringCounts.entrySet()) {
-      for (Map.Entry<String,Integer> subEntry : entry.getValue().entrySet()) {
-        hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new IntWritable(subEntry.getValue()));
+    for (Map.Entry<String, Map<String,Long>> entry : stringCounts.entrySet()) {
+      for (Map.Entry<String,Long> subEntry : entry.getValue().entrySet()) {
+        hostDatum.getMetaData().put(new Text(entry.getKey() + "." + subEntry.getKey()), new LongWritable(subEntry.getValue()));
       }
     }
     for (Map.Entry<String, Float> entry : maximums.entrySet()) {
@@ -326,7 +324,7 @@
     for (Map.Entry<String, TDigest> entry : tdigests.entrySet()) {
       // Emit all percentiles
       for (int i = 0; i < percentiles.length; i++) {
-        hostDatum.getMetaData().put(new Text("pct" + Integer.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float)entry.getValue().quantile(0.5)));
+        hostDatum.getMetaData().put(new Text("pct" + Long.toString(percentiles[i]) + "." + entry.getKey()), new FloatWritable((float)entry.getValue().quantile(percentiles[i] / 100.0)));
       }
     }      
     for (Map.Entry<String, Float> entry : minimums.entrySet()) {
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 37c675d..022cee7 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -267,7 +267,7 @@
     
     output.append("\n"); // For readability if keepClientCnxOpen
 
-    if (getConf().getBoolean("doIndex", false) && doc != null) {
+    if (getConf().getBoolean("doIndex", false)) {
       IndexWriters writers = IndexWriters.get(getConf());
       writers.open(getConf(), "IndexingFilterChecker");
       writers.write(doc);
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 84ff627..16dd698 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -62,7 +62,7 @@
     NOT_TRUNCATED,
     /** fetch exceeded configured http.content.limit */
     LENGTH,
-    /** fetch exceeded configured http.fetch.duration */
+    /** fetch exceeded configured http.time.limit */
     TIME,
     /** network disconnect or timeout during fetch */
     DISCONNECT,
diff --git a/src/java/org/apache/nutch/parse/OutlinkExtractor.java b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
index ea77231..a9b2bb1 100644
--- a/src/java/org/apache/nutch/parse/OutlinkExtractor.java
+++ b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -128,7 +128,7 @@
     final Outlink[] retval;
 
     // create array of the Outlinks
-    if (outlinks != null && outlinks.size() > 0) {
+    if (outlinks.size() > 0) {
       retval = outlinks.toArray(new Outlink[0]);
     } else {
       retval = new Outlink[0];
diff --git a/src/java/org/apache/nutch/parse/ParseData.java b/src/java/org/apache/nutch/parse/ParseData.java
index 5ff4ea5..e88c7ac 100644
--- a/src/java/org/apache/nutch/parse/ParseData.java
+++ b/src/java/org/apache/nutch/parse/ParseData.java
@@ -149,20 +149,10 @@
       outlinks[i] = Outlink.read(in);
     }
 
-    if (version < 3) {
-      int propertyCount = in.readInt(); // read metadata
-      contentMeta.clear();
-      for (int i = 0; i < propertyCount; i++) {
-        contentMeta.add(Text.readString(in), Text.readString(in));
-      }
-    } else {
-      contentMeta.clear();
-      contentMeta.readFields(in);
-    }
-    if (version > 3) {
-      parseMeta.clear();
-      parseMeta.readFields(in);
-    }
+    contentMeta.clear();
+    contentMeta.readFields(in);
+    parseMeta.clear();
+    parseMeta.readFields(in);
   }
 
   public final void write(DataOutput out) throws IOException {
diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
index d14306a..4420111 100644
--- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java
+++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -238,7 +238,7 @@
     Map<String, String> aliases = new HashMap<>();
     NodeList aliasRoot = parsePluginsRoot.getElementsByTagName("aliases");
 
-    if (aliasRoot == null || (aliasRoot != null && aliasRoot.getLength() == 0)) {
+    if (aliasRoot == null || aliasRoot.getLength() == 0) {
       if (LOG.isWarnEnabled()) {
         LOG.warn("No aliases defined in parse-plugins.xml!");
       }
diff --git a/src/java/org/apache/nutch/plugin/Extension.java b/src/java/org/apache/nutch/plugin/Extension.java
index e73b850..be737cb 100644
--- a/src/java/org/apache/nutch/plugin/Extension.java
+++ b/src/java/org/apache/nutch/plugin/Extension.java
@@ -197,4 +197,8 @@
   public void setDescriptor(PluginDescriptor pDescriptor) {
     fDescriptor = pDescriptor;
   }
+
+  public String toString() {
+    return getId() + ", " + getClazz() + ", " + getTargetPoint();
+  }
 }
diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
index 5c2c96a..7dcc400 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -16,8 +16,13 @@
  */
 package org.apache.nutch.protocol;
 
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.net.MalformedURLException;
+import java.util.HashMap;
+import java.util.Map;
 
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.ExtensionPoint;
@@ -25,8 +30,13 @@
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.util.ObjectCache;
 
+import org.apache.commons.lang.StringUtils;
+
 import org.apache.hadoop.conf.Configuration;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Creates and caches {@link Protocol} plugins. Protocol plugins should define
  * the attribute "protocolName" with the name of the protocol that they
@@ -36,10 +46,16 @@
  */
 public class ProtocolFactory {
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   private ExtensionPoint extensionPoint;
 
   private Configuration conf;
 
+  protected Map<String, String> defaultProtocolImplMapping = new HashMap<>();
+  protected Map<String, String> hostProtocolMapping = new HashMap<>();
+
   public ProtocolFactory(Configuration conf) {
     this.conf = conf;
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
@@ -48,8 +64,35 @@
       throw new RuntimeException("x-point " + Protocol.X_POINT_ID
           + " not found.");
     }
-  }
 
+    try (BufferedReader reader = new BufferedReader(
+        conf.getConfResourceAsReader("host-protocol-mapping.txt"))) {
+      String line;
+      String parts[];
+      while ((line = reader.readLine()) != null) {
+        if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+          line = line.trim();
+          parts = line.split("\t");
+
+          // Must be at least two parts
+          if (parts.length == 2) {
+            // Is this a host to plugin mapping, or a default?
+            if (parts[0].indexOf(":") == -1) {
+              hostProtocolMapping.put(parts[0].trim(), parts[1].trim());
+            } else {
+              String[] moreParts = parts[0].split(":");
+              defaultProtocolImplMapping.put(moreParts[1].trim(), parts[1].trim());
+            }
+          } else {
+            LOG.warn("Wrong format of line: {}", line);
+            LOG.warn("Expected format: <hostname> <tab> <plugin_id> or protocol:<protocol> <tab> <plugin_id>");
+          }
+        }
+      }
+    } catch (IOException e) {
+      LOG.error("Unable to read host-protocol-mapping.txt", e);
+    }
+  }
   /**
    * Returns the appropriate {@link Protocol} implementation for a url.
    * 
@@ -82,52 +125,92 @@
    */
   public Protocol getProtocol(URL url)
       throws ProtocolNotFound {
-    ObjectCache objectCache = ObjectCache.get(conf);
     try {
-      String protocolName = url.getProtocol();
-      if (protocolName == null) {
-        throw new ProtocolNotFound(url.toString());
+      Protocol protocol = null;
+
+      // First attempt to resolve a protocol implementation by hostname
+      String host = url.getHost();
+      if (hostProtocolMapping.containsKey(host)) {
+        Extension extension = getExtensionById(hostProtocolMapping.get(host));
+        if (extension != null) {
+          protocol = getProtocolInstanceByExtension(extension);
+        }
       }
 
-      String cacheId = Protocol.X_POINT_ID + protocolName;
-      synchronized (objectCache) {
-        Protocol protocol = (Protocol) objectCache.getObject(cacheId);
-        if (protocol != null) {
-          return protocol;
+      // Nothing, see if we have defaults configured
+      if (protocol == null) {
+        // Protocol listed in default map?
+        if (defaultProtocolImplMapping.containsKey(url.getProtocol())) {
+          Extension extension = getExtensionById(defaultProtocolImplMapping.get(url.getProtocol()));
+          if (extension != null) {
+            protocol = getProtocolInstanceByExtension(extension);
+          }
         }
+      }
 
-        Extension extension = findExtension(protocolName);
-        if (extension == null) {
-          throw new ProtocolNotFound(protocolName);
+      // Still couldn't find a protocol? Attempt by protocol
+      if (protocol == null) {
+        Extension extension = findExtension(url.getProtocol(), "protocolName");
+        if (extension != null) {
+          protocol = getProtocolInstanceByExtension(extension);
         }
+      }
 
-        protocol = (Protocol) extension.getExtensionInstance();
-        objectCache.setObject(cacheId, protocol);
+      // Got anything?
+      if (protocol != null) {
         return protocol;
       }
+
+      // Nothing!
+      throw new ProtocolNotFound(url.toString());
     } catch (PluginRuntimeException e) {
       throw new ProtocolNotFound(url.toString(), e.toString());
     }
   }
 
-  private Extension findExtension(String name) throws PluginRuntimeException {
+  private Protocol getProtocolInstanceByExtension(Extension extension) throws PluginRuntimeException {
+    Protocol protocol = null;
+    String cacheId = extension.getId();
+    ObjectCache objectCache = ObjectCache.get(conf);
+    synchronized (objectCache) {
+      if (!objectCache.hasObject(cacheId)) {
+        protocol = (Protocol) extension.getExtensionInstance();
+        objectCache.setObject(cacheId, protocol);
+      }
+      protocol = (Protocol) objectCache.getObject(cacheId);
+    }
 
+    return protocol;
+  }
+
+  private Extension getExtensionById(String id) {
     Extension[] extensions = this.extensionPoint.getExtensions();
-
     for (int i = 0; i < extensions.length; i++) {
-      Extension extension = extensions[i];
+      if (id.equals(extensions[i].getId())) {
+        return extensions[i];
+      }
+    }
 
-      if (contains(name, extension.getAttribute("protocolName")))
+    return null;
+  }
+
+  private Extension findExtension(String name, String attribute) throws PluginRuntimeException {
+    for (int i = 0; i < this.extensionPoint.getExtensions().length; i++) {
+      Extension extension = this.extensionPoint.getExtensions()[i];
+
+      if (contains(name, extension.getAttribute(attribute)))
         return extension;
     }
     return null;
   }
 
   boolean contains(String what, String where) {
-    String parts[] = where.split("[, ]");
-    for (int i = 0; i < parts.length; i++) {
-      if (parts[i].equals(what))
-        return true;
+    if (where != null) {
+      String parts[] = where.split("[, ]");
+      for (int i = 0; i < parts.length; i++) {
+        if (parts[i].equals(what))
+          return true;
+      }
     }
     return false;
   }
diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java
index f4adf52..9fac5b2 100644
--- a/src/java/org/apache/nutch/segment/SegmentMerger.java
+++ b/src/java/org/apache/nutch/segment/SegmentMerger.java
@@ -392,7 +392,7 @@
         try {
           url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT); // normalize the url.
         } catch (Exception e) {
-          LOG.warn("Skipping {} :", url, e.getMessage());
+          LOG.warn("Skipping {} : {}", url, e.getMessage());
           url = null;
         }
       }
@@ -400,7 +400,7 @@
         try {
           url = filters.filter(url);
         } catch (Exception e) {
-          LOG.warn("Skipping key {} : ", url, e.getMessage());
+          LOG.warn("Skipping key {} : {}", url, e.getMessage());
           url = null;
         }
       }
diff --git a/src/java/org/apache/nutch/service/impl/LinkReader.java b/src/java/org/apache/nutch/service/impl/LinkReader.java
index e24fe1e..f3e54a3 100644
--- a/src/java/org/apache/nutch/service/impl/LinkReader.java
+++ b/src/java/org/apache/nutch/service/impl/LinkReader.java
@@ -61,7 +61,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
 
@@ -93,7 +93,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
 
@@ -127,7 +127,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
 
@@ -151,7 +151,7 @@
     } catch(FileNotFoundException fne){ 
       throw new FileNotFoundException();
     }catch (IOException e) {
-      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
     return i;
diff --git a/src/java/org/apache/nutch/service/impl/NodeReader.java b/src/java/org/apache/nutch/service/impl/NodeReader.java
index 48379c8..612fa26 100644
--- a/src/java/org/apache/nutch/service/impl/NodeReader.java
+++ b/src/java/org/apache/nutch/service/impl/NodeReader.java
@@ -61,7 +61,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, StringUtils.stringifyException(e));
+      LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
 
@@ -93,7 +93,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file, 
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
@@ -128,7 +128,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file, 
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
@@ -157,7 +157,7 @@
 
     }catch (IOException e) {
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file, 
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
diff --git a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
index c06e611..f533cd1 100644
--- a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
+++ b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
@@ -53,7 +53,7 @@
   protected void afterExecute(Runnable runnable, Throwable throwable) {
     super.afterExecute(runnable, throwable);
     synchronized (runningWorkers) {
-      runningWorkers.remove(((JobWorker) runnable).getInfo());
+      runningWorkers.remove((JobWorker) runnable);
     }
     JobWorker worker = ((JobWorker) runnable);
     addStatusToHistory(worker);
diff --git a/src/java/org/apache/nutch/service/impl/SequenceReader.java b/src/java/org/apache/nutch/service/impl/SequenceReader.java
index 638f695..26b3d55 100644
--- a/src/java/org/apache/nutch/service/impl/SequenceReader.java
+++ b/src/java/org/apache/nutch/service/impl/SequenceReader.java
@@ -64,7 +64,7 @@
     }catch (IOException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file,
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
@@ -99,7 +99,7 @@
       throw new FileNotFoundException();
     }catch (IOException e) {
       // TODO Auto-generated catch block
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file,
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
@@ -134,7 +134,7 @@
       throw new FileNotFoundException();
     }catch (IOException e) {
       // TODO Auto-generated catch block
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file,
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
@@ -161,7 +161,7 @@
       throw new FileNotFoundException();
     }catch (IOException e) {
       // TODO Auto-generated catch block
-      LOG.error("Error occurred while reading file {} : ", file, 
+      LOG.error("Error occurred while reading file {} : {}", file,
           StringUtils.stringifyException(e));
       throw new WebApplicationException();
     } 
diff --git a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
index 9489297..3b1593b 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
@@ -46,8 +46,8 @@
    *          The InputSplit of the arc file to process.
    * @param job
    *          The job configuration.
-   * @param reporter
-   *          The progress reporter.
+   * @param context
+   *          The task context.
    */
   public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split,
       Job job, Context context) throws IOException {
diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
index cf9eec0..b5f7a44 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -308,7 +308,7 @@
         return true;
       }
     } catch (Exception e) {
-      LOG.equals(StringUtils.stringifyException(e));
+      LOG.error("Failed reading ARC record: ", e);
     }
 
     // couldn't populate the record or there is no next record to read
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index e84ebe1..7a26748 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -269,8 +269,8 @@
      * and other relevant data.
      * </p>
      * 
-     * @param job
-     *          The job configuration.
+     * @param context
+     *          The task context.
      */
     public void setup(Mapper<Text, BytesWritable, Text, NutchWritable>.Context context) { 
       // set the url filters, scoring filters the parse util and the url
diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java
index 01e65e5..2b28447 100644
--- a/src/java/org/apache/nutch/util/EncodingDetector.java
+++ b/src/java/org/apache/nutch/util/EncodingDetector.java
@@ -170,10 +170,8 @@
       // will sometimes throw exceptions
       try {
         detector.enableInputFilter(filter);
-        if (data.length > MIN_LENGTH) {
-          detector.setText(data);
-          matches = detector.detectAll();
-        }
+        detector.setText(data);
+        matches = detector.detectAll();
       } catch (Exception e) {
         LOG.debug("Exception from ICU4J (ignoring): ", e);
       }
diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index bc73d68..42c8728 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -29,45 +29,47 @@
 import org.slf4j.LoggerFactory;
 
 /**
- * A collection of Jexl utilit(y|ies).
+ * Utility methods for handling JEXL expressions
  */
 public class JexlUtil {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  /**
-   * 
-   */
-  public static Pattern datePattern = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
+  /** Supported date format for parsing: yyyy-MM-dd'T'HH:mm:ss'Z' (UTC) */
+  private static final Pattern DATE_PATTERN = Pattern.compile("\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z");
 
   /**
-   * Parses the given experssion to a Jexl expression. This supports
+   * Parses the given expression to a JEXL expression. This supports
    * date parsing.
    *
-   * @param expr the Jexl expression
-   * @return parsed Jexl expression or null in case of parse error
+   * @param expr string JEXL expression
+   * @return parsed JEXL expression or null in case of parse error
    */
   public static Expression parseExpression(String expr) {
     if (expr == null) return null;
     
     try {
-      // Translate any date object into a long, dates must be specified as 20-03-2016T00:00:00Z
-      Matcher matcher = datePattern.matcher(expr);
+      // Translate any date object into a long. Dates must be in the DATE_PATTERN
+      // format. For example: 2016-03-20T00:00:00Z
+      Matcher matcher = DATE_PATTERN.matcher(expr);
+
       if (matcher.find()) {
         String date = matcher.group();
         
-        // Parse the thing and get epoch!
+        // parse the matched substring and get the epoch
         Date parsedDate = DateUtils.parseDateStrictly(date, new String[] {"yyyy-MM-dd'T'HH:mm:ss'Z'"});
         long time = parsedDate.getTime();
         
-        // Replace in the original expression
+        // replace the original string date with the numeric value
         expr = expr.replace(date, Long.toString(time));
       }
-      
+
       JexlEngine jexl = new JexlEngine();
+
       jexl.setSilent(true);
       jexl.setStrict(true);
+
       return jexl.createExpression(expr);
     } catch (Exception e) {
       LOG.error(e.getMessage());
diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java
index 203e912..17bb380 100644
--- a/src/java/org/apache/nutch/util/MimeUtil.java
+++ b/src/java/org/apache/nutch/util/MimeUtil.java
@@ -165,8 +165,7 @@
     }
 
     // if returned null, or if it's the default type then try url resolution
-    if (type == null
-        || (type != null && type.getName().equals(MimeTypes.OCTET_STREAM))) {
+    if (type == null || type.getName().equals(MimeTypes.OCTET_STREAM)) {
       // If no mime-type header, or cannot find a corresponding registered
       // mime-type, then guess a mime-type from the url pattern
       try {
diff --git a/src/java/org/apache/nutch/util/ObjectCache.java b/src/java/org/apache/nutch/util/ObjectCache.java
index 984760d..e313a6e 100644
--- a/src/java/org/apache/nutch/util/ObjectCache.java
+++ b/src/java/org/apache/nutch/util/ObjectCache.java
@@ -52,6 +52,10 @@
     return objectMap.get(key);
   }
 
+  public boolean hasObject(String key) {
+    return objectMap.containsKey(key);
+  }
+
   public synchronized void setObject(String key, Object value) {
     objectMap.put(key, value);
   }
diff --git a/src/plugin/exchange-jexl/README.md b/src/plugin/exchange-jexl/README.md
new file mode 100644
index 0000000..2d20242
--- /dev/null
+++ b/src/plugin/exchange-jexl/README.md
@@ -0,0 +1,64 @@
+exchange-jexl plugin for Nutch  
+==============================
+
+**exchange-jexl plugin** decides which index writer a document should be routed to, based on a JEXL expression.
+
+## Configuration
+
+The **exchange-jexl plugin** must be configured in the exchanges.xml file, included in the official Nutch distribution.
+
+```xml
+<exchanges>  
+  <exchange id="<exchange_id>" class="org.apache.nutch.exchange.jexl.JexlExchange">  
+    <writers>  
+      ...  
+    </writers>  
+    <params>  
+      <param name="expr" value="<jexl_expression>" />
+    </params>  
+  </exchange>  
+    ...  
+</exchanges>
+```
+
+Each `<exchange>` element has two mandatory attributes:
+
+* `<exchange_id>` is a unique identifier for each configuration. Nutch uses it to distinguish configurations, even when they refer to the same exchange implementation; this ID makes it possible to have multiple instances of the same exchange with different configurations.
+
+* `org.apache.nutch.exchange.jexl.JexlExchange` corresponds to the canonical name of the class that implements the Exchange extension point. This value must not be modified for the **exchange-jexl plugin**.
+
+## Writers section
+
+The `<writers>` element is independent for each configuration and contains a list of `<writer id="<id>">` elements, where `<id>` is the ID of the index writer to which the documents should be routed.
+
+## Params section
+
+The `<params>` element is where the parameters that the exchange needs are specified. Each parameter has the form `<param name="<name>" value="<value>"/>`.
+
+The only parameter needed by the **exchange-jexl plugin** has the `<name>` **expr**, and its `<value>` is a JEXL expression used to evaluate each document. The variable **doc** can be used in the expressions and represents the document itself. For example, the expression `doc.getFieldValue('host')=='example.org'` will match the documents where the **host** field has the value **example.org**.
+
+## Use case 1
+
+```xml
+<exchanges xmlns="http://lucene.apache.org/nutch"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="http://lucene.apache.org/nutch exchanges.xsd">
+  <exchange id="exchange_jexl_1" class="org.apache.nutch.exchange.jexl.JexlExchange">
+    <writers>
+      <writer id="indexer_solr_1" />
+      <writer id="indexer_rabbit_1" />
+    </writers>
+    <params>
+      <param name="expr" value="doc.getFieldValue('host')=='example.org'" />
+    </params>
+  </exchange>
+  <exchange id="default" class="default">
+    <writers>
+      <writer id="indexer_dummy_1" />
+    </writers>
+    <params />
+  </exchange>
+</exchanges>
+```
+
+According to this example, the documents for which the value of the **host** field is **example.org** will be sent to **indexer_solr_1** and **indexer_rabbit_1**. The remaining documents, where **host** differs from **example.org**, do not match the **exchange_jexl_1** exchange and will be sent wherever the default exchange dictates; in this case, to **indexer_dummy_1**.
\ No newline at end of file
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index ec6473c..633e0d4 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.indexer.jexl;
 
 import java.lang.invoke.MethodHandles;
+import java.util.List;
 import java.util.Map.Entry;
 
 import org.apache.commons.jexl2.Expression;
@@ -40,7 +41,6 @@
 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering of
  * documents based on a JEXL expression.
- *
  */
 public class JexlIndexingFilter implements IndexingFilter {
 
@@ -83,9 +83,12 @@
         metadataToContext(parse.getData().getParseMeta()));
 
     JexlContext context = new MapContext();
+
     for (Entry<String, NutchField> entry : doc) {
-      context.set(entry.getKey(), entry.getValue().getValues());
+      List<Object> values = entry.getValue().getValues();
+      context.set(entry.getKey(), values.size() > 1 ? values : values.get(0));
     }
+
     jcontext.set("doc", context);
 
     try {
@@ -101,16 +104,21 @@
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
-    String str = conf.get("index.jexl.filter");
-    if (str == null) {
-      LOG.warn(
+    String strExpr = conf.get("index.jexl.filter");
+
+    if (strExpr == null) {
+      LOG.error(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
+
       throw new RuntimeException(
           "The property index.jexl.filter must have a value when index-jexl-filter is used. You can use 'true' or 'false' to index all/none");
     }
-    expr = JexlUtil.parseExpression(str);
+
+    expr = JexlUtil.parseExpression(strExpr);
+
     if (expr == null) {
-      LOG.warn("Failed parsing JEXL from index.jexl.filter: {}", str);
+      LOG.error("Failed parsing JEXL from index.jexl.filter: {}", strExpr);
+
       throw new RuntimeException("Failed parsing JEXL from index.jexl.filter");
     }
   }
@@ -122,9 +130,12 @@
 
   private JexlContext metadataToContext(Metadata metadata) {
     JexlContext context = new MapContext();
+
     for (String name : metadata.names()) {
-      context.set(name, metadata.getValues(name));
+      String[] values = metadata.getValues(name);
+      context.set(name, values.length > 1 ? values : values[0]);
     }
+
     return context;
   }
 }
diff --git a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
index 0427ad4..f3cc655 100644
--- a/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/test/org/apache/nutch/indexer/jexl/TestJexlIndexingFilter.java
@@ -39,7 +39,7 @@
   @Test
   public void testAllowMatchingDocument() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -73,7 +73,7 @@
   @Test
   public void testBlockNotMatchingDocuments() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=='en'");
+    conf.set("index.jexl.filter", "doc.lang=='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     filter.setConf(conf);
@@ -115,7 +115,7 @@
   @Test
   public void testInvalidExpression() throws Exception {
     Configuration conf = NutchConfiguration.create();
-    conf.set("index.jexl.filter", "doc.lang[0]=<>:='en'");
+    conf.set("index.jexl.filter", "doc.lang=<>:='en'");
 
     JexlIndexingFilter filter = new JexlIndexingFilter();
     thrown.expect(RuntimeException.class);
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 052ebd9..45b79b7 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -81,6 +81,7 @@
   /** Map for mime-type substitution */
   private HashMap<String, String> mimeMap = null;
   private boolean mapMimes = false;
+  private String mapFieldName;
 
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -226,8 +227,11 @@
     if (mapMimes) {
       // Check if the current mime is mapped
       if (mimeMap.containsKey(mimeType)) {
-        // It's mapped, let's replace it
-        mimeType = mimeMap.get(mimeType);
+        if (mapFieldName != null) {
+          doc.add(mapFieldName, mimeMap.get(mimeType));
+        } else {
+          mimeType = mimeMap.get(mimeType);
+        }
       }
     }
 
@@ -273,7 +277,7 @@
   static {
     try {
       // order here is important
-      patterns[0] = Pattern.compile("\\bfilename=['\"](.+)['\"]");
+      patterns[0] = Pattern.compile("\\bfilename=['\"]([^\"]+)");
       patterns[1] = Pattern.compile("\\bfilename=(\\S+)\\b");
     } catch (PatternSyntaxException e) {
       // just ignore
@@ -300,9 +304,11 @@
     this.conf = conf;
     MIME = new MimeUtil(conf);
 
-    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false) == true) {
+    if (conf.getBoolean("moreIndexingFilter.mapMimeTypes", false)) {
       mapMimes = true;
 
+      mapFieldName = conf.get("moreIndexingFilter.mapMimeTypes.field");
+
       // Load the mapping
       try {
         readConfiguration();
diff --git a/src/plugin/indexer-cloudsearch/README.md b/src/plugin/indexer-cloudsearch/README.md
index 8669682..ddef693 100644
--- a/src/plugin/indexer-cloudsearch/README.md
+++ b/src/plugin/indexer-cloudsearch/README.md
@@ -3,27 +3,41 @@
 
 See [http://aws.amazon.com/cloudsearch/] for information on AWS CloudSearch.
 
-Steps to use :
+**indexer-cloudsearch plugin** is used for sending documents from one or more segments to Amazon CloudSearch. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
 
-From runtime/local/bin
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.cloudsearch.CloudSearchIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>
+</writer>
+```
 
-* Configure the AWS credentials 
+Each `<writer>` element has two mandatory attributes:
 
-Edit `~/.aws/credentials`, see [http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html] for details. Note that this should not be necessary when running Nutch on EC2.
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
 
-* Edit ../conf/nutch-site.xml and check that 'plugin.includes' contains 'indexer-cloudsearch'. 
+* `org.apache.nutch.indexwriter.cloudsearch.CloudSearchIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-cloudsearch plugin**.
 
-* (Optional) Test the indexing 
+## Mapping
 
-`./nutch indexchecker -D doIndex=true -D cloudsearch.batch.dump=true "http://nutch.apache.org/"`
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
 
-if the agent name hasn't been configured in nutch-site.xml, it can be added on the command line with `-D http.agent.name=whateverValueDescribesYouBest`
+## Parameters
 
-you should see the fields extracted for the indexing coming up on the console.
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
 
-Using the `cloudsearch.batch.dump` parameter allows to dump the batch to the local temp dir. The files has the prefix "CloudSearch_" e.g. `/tmp/CloudSearch_4822180575734804454.json`. This temp file can be used as a template when defining the fields in the domain creation (see below).
+Parameter Name | Description | Default value
+--|--|--
+endpoint | Endpoint where service requests should be submitted. | 
+region | Region name. | 
+batch.dump | **true** to store the JSON representation of the documents to a local temp dir. The files have the prefix "CloudSearch_" e.g. `/tmp/CloudSearch_4822180575734804454.json`. This temp file can be used as a template when defining the fields in the domain creation. | false
+batch.maxSize | Maximum number of documents to send as a batch to CloudSearch. | -1
 
-* Create a CloudSearch domain
+## Create a CloudSearch domain
 
 This can be done using the web console [https://eu-west-1.console.aws.amazon.com/cloudsearch/home?region=eu-west-1#]. You can use the temp file generated above to bootstrap the field definition. 
 
@@ -31,28 +45,14 @@
 
 Note that the creation of the domain can take some time. Once it is complete, note the document endpoint, or alternatively verify the region and domain name.
 
-* Edit ../conf/nutch-site.xml and add `cloudsearch.endpoint` and `cloudsearch.region`. 
+> The CloudSearchIndexWriter will log any errors while sending the batches to CloudSearch and will resume the process without breaking. This means that you might not get all the documents in the index. You should check the log files for errors. Using small batch sizes will limit the number of documents skipped in case of error.
 
-* Re-test the indexing
-
-`./nutch indexchecker -D doIndex=true "http://nutch.apache.org/"`
-
-and check in the CloudSearch console that the document has been succesfully indexed.
-
-Additional parameters
-
-* `cloudsearch.batch.maxSize` \: can be used to limit the size of the batches sent to CloudSearch to N documents. Note that the default limitations still apply.
-
-* `cloudsearch.batch.dump` \: see above. Stores the JSON representation of the document batch in the local temp dir, useful for bootstrapping the index definition.
-
-Note
-
-The CloudSearchIndexWriter will log any errors while sending the batches to CloudSearch and will resume the process without breaking. This means that you might not get all the documents in the index. You should check the log files for errors. Using small batch sizes will limit the number of documents skipped in case of error.
-
-Any fields not defined in the CloudSearch domain will be ignored by the CloudSearchIndexWriter. Again, the logs will contain a trace of any field names skipped.
+> Any fields not defined in the CloudSearch domain will be ignored by the CloudSearchIndexWriter. Again, the logs will contain a trace of any field names skipped.
 
 
 
-  
+
+
+
 
 
diff --git a/src/plugin/indexer-csv/README.md b/src/plugin/indexer-csv/README.md
new file mode 100644
index 0000000..1eadea1
--- /dev/null
+++ b/src/plugin/indexer-csv/README.md
@@ -0,0 +1,42 @@
+indexer-csv plugin for Nutch 
+============================
+
+**indexer-csv plugin** is used for writing documents to a CSV file. It does not work in distributed mode; the output is written to the local filesystem, not to HDFS, see [NUTCH-1541](https://issues.apache.org/jira/browse/NUTCH-1541). The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.csv.CSVIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>   
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.csv.CSVIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-csv plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+fields | Ordered list of fields (columns) in the CSV file | id,title,content
+charset | Encoding of CSV file | UTF-8
+separator | Separator between fields (columns) | ,
+valuesep | Separator between multiple values of one field | \|
+quotechar | Quote character used to quote fields containing separators or quotes | &quot;
+escapechar | Escape character used to escape a quote character | &quot;
+maxfieldlength | Max. length of a single field value in characters | 4096
+maxfieldvalues | Max. number of values of one field, useful for, e.g., the anchor texts field | 12
+header | Write CSV column headers | true
+outpath | Output path / directory (local filesystem path, relative to current working directory) | csvindexwriter
\ No newline at end of file
diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 4291fbf..160d03d 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -392,7 +392,7 @@
 
   /**
    * Write a value to output stream. Escape quote characters.
-   * Clip value after <code>indexer.csv.maxfieldlength</code> characters.
+   * Clip value after <code>maxfieldlength</code> characters.
    *
    * @param value
    *          String to write
@@ -418,7 +418,7 @@
 
   /**
    * Write a value to output stream. Escape quote characters. Clip value after
-   * <code>indexer.csv.maxfieldlength</code> characters.
+   * <code>maxfieldlength</code> characters.
    */
   private void writeEscaped (String value) throws IOException {
     int nextQuoteChar = quoteCharacter.find(value, 0);
diff --git a/src/plugin/indexer-dummy/README.md b/src/plugin/indexer-dummy/README.md
new file mode 100644
index 0000000..0461789
--- /dev/null
+++ b/src/plugin/indexer-dummy/README.md
@@ -0,0 +1,34 @@
+indexer-dummy plugin for Nutch 
+==============================
+
+**indexer-dummy plugin** is used for writing "action"\t"url"\n lines to a plain text file for debugging purposes. It does not work in distributed mode; the output is written to the local filesystem, not to HDFS. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.dummy.DummyIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>   
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.dummy.DummyIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-dummy plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+ path | Path where the file will be created. | ./dummy-index.txt
+ delete | If delete operations should be written to the file. | false
\ No newline at end of file
diff --git a/src/plugin/indexer-elastic-rest/README.md b/src/plugin/indexer-elastic-rest/README.md
new file mode 100644
index 0000000..e5a76c9
--- /dev/null
+++ b/src/plugin/indexer-elastic-rest/README.md
@@ -0,0 +1,45 @@
+indexer-elastic-rest plugin for Nutch 
+=====================================
+
+**indexer-elastic-rest plugin** is used for sending documents from one or more segments to Elasticsearch, but using Jest to connect with the REST API provided by Elasticsearch. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.elasticrest.ElasticRestIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>   
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.elasticrest.ElasticRestIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-elastic-rest plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+host | The hostname or a list of comma-separated hostnames to send documents to using Elasticsearch Jest. Both host and port must be defined. |  
+port | The port to connect to using Elasticsearch Jest. | 9200
+index | Default index to send documents to. | nutch
+max.bulk.docs | Maximum size of the bulk in number of documents. | 250
+max.bulk.size | Maximum size of the bulk in bytes. | 2500500
+user | Username for auth credentials (only used when https is enabled) | user
+password | Password for auth credentials (only used when https is enabled) | password
+type | Default type to send documents to. | doc
+https | **true** to enable https, **false** to disable https. If you've disabled http access (by forcing https), be sure to set this to true, otherwise you might get "connection reset by peer". | false
+trustallhostnames | **true** to trust elasticsearch server's certificate even if its listed domain name does not match the domain they are hosted or **false** to check if the elasticsearch server's certificate's listed domain is the same domain that it is hosted on, and if it doesn't, then fail to index (only used when https is enabled) | false
+languages | A list of strings denoting the supported languages (e.g. `en, de, fr, it`). If this value is empty all documents will be sent to index property. If not empty the Rest client will distribute documents in different indices based on their `languages` property. Indices are named with the following schema: `index separator language` (e.g. `nutch_de`). Entries with an unsupported `languages` value will be added to index `index separator sink` (e.g. `nutch_others`). | 
+separator | Is used only if `languages` property is defined to build the index name (i.e. `index separator lang`). | _
+sink | Is used only if `languages` property is defined to build the index name where to store documents with unsupported languages (i.e. `index separator sink`). | others 
\ No newline at end of file
diff --git a/src/plugin/indexer-elastic/README.md b/src/plugin/indexer-elastic/README.md
new file mode 100644
index 0000000..0ac4f08
--- /dev/null
+++ b/src/plugin/indexer-elastic/README.md
@@ -0,0 +1,41 @@
+indexer-elastic plugin for Nutch 
+================================
+
+**indexer-elastic plugin** is used for sending documents from one or more segments to an Elasticsearch server. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.elastic.ElasticIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>   
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.elastic.ElasticIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-elastic plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+host | Comma-separated list of hostnames to send documents to using [TransportClient](https://static.javadoc.io/org.elasticsearch/elasticsearch/5.3.0/org/elasticsearch/client/transport/TransportClient.html). Either host and port must be defined or cluster. | 
+port | The port to connect to using [TransportClient](https://static.javadoc.io/org.elasticsearch/elasticsearch/5.3.0/org/elasticsearch/client/transport/TransportClient.html). | 9300
+cluster | The cluster name to discover. Either host and port must be defined or cluster. | 
+index | Default index to send documents to. | nutch
+max.bulk.docs | Maximum size of the bulk in number of documents. | 250
+max.bulk.size | Maximum size of the bulk in bytes. | 2500500
+exponential.backoff.millis | Initial delay for the [BulkProcessor](https://static.javadoc.io/org.elasticsearch/elasticsearch/5.3.0/org/elasticsearch/action/bulk/BulkProcessor.html) exponential backoff policy. | 100
+exponential.backoff.retries | Number of times the [BulkProcessor](https://static.javadoc.io/org.elasticsearch/elasticsearch/5.3.0/org/elasticsearch/action/bulk/BulkProcessor.html) exponential backoff policy should retry bulk operations. | 10
+bulk.close.timeout | Number of seconds allowed for the [BulkProcessor](https://static.javadoc.io/org.elasticsearch/elasticsearch/5.3.0/org/elasticsearch/action/bulk/BulkProcessor.html) to complete its last operation. | 600
\ No newline at end of file
diff --git a/src/plugin/indexer-rabbit/README.md b/src/plugin/indexer-rabbit/README.md
new file mode 100644
index 0000000..ea043ed
--- /dev/null
+++ b/src/plugin/indexer-rabbit/README.md
@@ -0,0 +1,44 @@
+indexer-rabbit plugin for Nutch
+===============================
+
+**indexer-rabbit plugin** is used for sending documents from one or more segments to a RabbitMQ server. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.rabbit.RabbitIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.rabbit.RabbitIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-rabbit plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+server.uri | URI with connection parameters in the form `amqp://<username>:<password>@<hostname>:<port>/<virtualHost>`<br>Where:<ul><li>`<username>` is the username for RabbitMQ server.</li><li>`<password>` is the password for RabbitMQ server.</li><li>`<hostname>` is where the RabbitMQ server is running.</li><li>`<port>` is where the RabbitMQ server is listening.</li><li>`<virtualHost>` is where the exchange is and the user has access.</li></ul> | amqp://guest:guest@localhost:5672/
+binding | Whether the relationship between an exchange and a queue is created automatically.<br>**NOTE:** Binding between exchanges is not supported. | false
+binding.arguments | Arguments used in binding. It must have the form `key1=value1,key2=value2`. This value is only used when the exchange's type is headers and the value of the binding property is **true**. In other cases, it is ignored. | 
+exchange.name | Name for the exchange where the messages will be sent. | 
+exchange.options | Options used when the exchange is created. Only used when the value of the `binding` property is **true**. It must have the form `type=<type>,durable=<durable>`<br>Where:<ul><li>`<type>` is **direct**, **topic**, **headers** or **fanout**</li><li>`<durable>` is **true** or **false**</li></ul> | type=direct,durable=true
+queue.name | Name of the queue used to create the binding. Only used when the value of `binding` property is **true**. | nutch.queue
+queue.options | Options used when the queue is created. Only used when the value of the `binding` property is **true**. It must have the form `durable=<durable>,exclusive=<exclusive>,auto-delete=<auto-delete>,arguments=<arguments>`<br>Where:<ul><li>`<durable>` is **true** or **false**</li><li>`<exclusive>` is **true** or **false**</li><li>`<auto-delete>` is **true** or **false**</li><li>`<arguments>` must have the form `key1:value1;key2:value2`</li></ul> | durable=true,exclusive=false,auto-delete=false
+routingkey | The routing key used to route messages in the exchange. It only makes sense when the exchange type is **topic** or **direct**. | Value of `queue.name` property
+commit.mode | **single** if a message contains only one document. In this case, a header with the action (write, update or delete) will be added. **multiple** if a message contains all documents. | multiple
+commit.size | Amount of documents to send into each message if the value of `commit.mode` property is **multiple**. In **single** mode this value represents the amount of messages to be sent. | 250
+headers.static | Headers to add to each message. It must have the form `key1=value1,key2=value2`. | 
+headers.dynamic | Document's fields to add as headers to each message. It must have the form `field1,field2`. Only used when the value of `commit.mode` property is **single**. | 
\ No newline at end of file
diff --git a/src/plugin/indexer-solr/README.md b/src/plugin/indexer-solr/README.md
new file mode 100644
index 0000000..1d60acc
--- /dev/null
+++ b/src/plugin/indexer-solr/README.md
@@ -0,0 +1,40 @@
+indexer-solr plugin for Nutch 
+=============================
+
+**indexer-solr plugin** is used for sending documents from one or more segments to a Solr server. The configuration for the index writers is in the **conf/index-writers.xml** file, included in the official Nutch distribution, and it's as follows:
+
+```xml
+<writer id="<writer_id>" class="org.apache.nutch.indexwriter.solr.SolrIndexWriter">
+  <mapping>
+    ...
+  </mapping>
+  <parameters>
+    ...
+  </parameters>
+</writer>
+```
+
+Each `<writer>` element has two mandatory attributes:
+
+* `<writer_id>` is a unique identification for each configuration. This feature allows Nutch to distinguish each configuration, even when they are for the same index writer. In addition, it allows to have multiple instances for the same index writer, but with different configurations.
+
+* `org.apache.nutch.indexwriter.solr.SolrIndexWriter` corresponds to the canonical name of the class that implements the IndexWriter extension point. This value should not be modified for the **indexer-solr plugin**.
+
+## Mapping
+
+The mapping section is explained [here](https://wiki.apache.org/nutch/IndexWriters#Mapping_section). The structure of this section is general for all index writers.
+
+## Parameters
+
+Each parameter has the form `<param name="<name>" value="<value>"/>` and the parameters for this index writer are:
+
+Parameter Name | Description | Default value
+--|--|--
+type | Specifies the [SolrClient](https://lucene.apache.org/solr/5_5_0/solr-solrj/org/apache/solr/client/solrj/SolrClient.html) implementation to use. This is a string value of one of the following **cloud** or **http**. The values represent [CloudSolrServer](https://lucene.apache.org/solr/5_5_0/solr-solrj/org/apache/solr/client/solrj/impl/CloudSolrServer.html) or [HttpSolrServer](https://lucene.apache.org/solr/5_5_0/solr-solrj/org/apache/solr/client/solrj/impl/HttpSolrServer.html) respectively. | http
+url | Defines the fully qualified URL of Solr into which data should be indexed. Multiple URL can be provided using comma as a delimiter. When the value of type property is **cloud**, the URL should not include any collections or cores; just the root Solr path. | http://localhost:8983/solr/nutch
+collection | The collection used in requests. Only used when the value of type property is **cloud**. |  
+weight.field | Field's name where the weight of the documents will be written. If it is empty no field will be used. |  
+commitSize | Defines the number of documents to send to Solr in a single update batch. Decrease when handling very large documents to prevent Nutch from running out of memory.<br>**Note**: It does not explicitly trigger a server side commit. | 1000 
+auth | Whether to enable HTTP basic authentication for communicating with Solr. Use the `username` and `password` properties to configure your credentials. | false
+username | The username of Solr server. | username
+password | The password of Solr server. | password
\ No newline at end of file
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index a837c28..2cf6dc1 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -69,6 +69,14 @@
   private Configuration conf;
 
   /**
+   * Whether there are host- or domain-specific rules. If there are no specific
+   * rules host and domain name are not extracted from the URL to speed up the
+   * matching. {@link #readRules(Reader)} automatically sets this to true if
+   * host- or domain-specific rules are used in the rule file.
+   */
+  protected boolean hasHostDomainRules = false;
+
+  /**
    * Constructs a new empty RegexURLFilterBase
    */
   public RegexURLFilterBase() {
@@ -154,34 +162,33 @@
 
   // Inherited Javadoc
   public String filter(String url) {
-    String host = URLUtil.getHost(url);
+    String host = null;
     String domain = null;
-    
-    try {
-      domain = URLUtil.getDomainName(url);
-    } catch (MalformedURLException e) {
-      // shouldnt happen here right?
-    }
-    
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("URL belongs to host " + host + " and domain " + domain);
-    }
 
+    if (hasHostDomainRules) {
+      host = URLUtil.getHost(url);
+      try {
+        domain = URLUtil.getDomainName(url);
+      } catch (MalformedURLException e) {
+        // shouldnt happen here right?
+      }
+
+      LOG.debug("URL belongs to host {} and domain {}", host, domain);
+    }
+    
     for (RegexRule rule : rules) {
       // Skip the skip for rules that don't share the same host and domain
       if (rule.hostOrDomain() != null &&
             !rule.hostOrDomain().equals(host) &&
             !rule.hostOrDomain().equals(domain)) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Skipping rule [" + rule.regex() + "] for host: " + rule.hostOrDomain());
-        }
+        LOG.debug("Skipping rule [{}] for host: {}", rule.regex(),
+            rule.hostOrDomain());
 
         continue;
       }
     
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Applying rule [" + rule.regex() + "] for host: " + host + " and domain " + domain);
-      }
+      LOG.debug("Applying rule [{}] for host {} and domain {}", rule.regex(),
+          host, domain);
 
       if (rule.match(url)) {
         return rule.accept() ? url : null;
@@ -265,6 +272,7 @@
         continue;
       case '>':
         hostOrDomain = line.substring(1).trim();
+        hasHostDomainRules = true;
         continue;
       case '<':
         hostOrDomain = null;
diff --git a/src/plugin/lib-selenium/README.md b/src/plugin/lib-selenium/README.md
new file mode 100644
index 0000000..1c6b37c
--- /dev/null
+++ b/src/plugin/lib-selenium/README.md
@@ -0,0 +1,13 @@
+# Updates
+* The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info.
+* The updated code for the Safari webdriver is under development, as starting with Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation.
+* Opera's driver is now based on ChromeDriver, adapted by Opera to enable programmatic automation of Chromium-based Opera products, but it hasn't been updated since April 5, 2017. We have therefore suspended its support and removed it from the code. ([link](https://github.com/operasoftware/operachromiumdriver))
+* Headless mode has been added for Chrome and Firefox. Set `selenium.enable.headless` to `true` in nutch-default.xml or nutch-site.xml to use it.
+
+
+You can run Nutch in Docker. Check some examples at https://github.com/sbatururimi/nutch-test.
+Don't forget to update the Dockerfile to point to the original Nutch repository when updated.
+
+# Contributors
+Stas Batururimi [s.batururimi@gmail.com]
+
diff --git a/src/plugin/lib-selenium/build-ivy.xml b/src/plugin/lib-selenium/build-ivy.xml
index 3abcf6d..fe919e5 100644
--- a/src/plugin/lib-selenium/build-ivy.xml
+++ b/src/plugin/lib-selenium/build-ivy.xml
@@ -17,7 +17,7 @@
 -->
 <project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
 
-    <property name="ivy.install.version" value="2.1.0" />
+    <property name="ivy.install.version" value="2.4.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">
       <isset property="env.IVY_HOME" />
     </condition>
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
index 701b725..d70dfaf 100644
--- a/src/plugin/lib-selenium/ivy.xml
+++ b/src/plugin/lib-selenium/ivy.xml
@@ -37,16 +37,13 @@
 
   <dependencies>
     <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
-    
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.5" />
+    <!-- 
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
     </dependency>
-    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
-      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
-      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
-    </dependency>
+    -->
     <!-- end selenium dependencies -->
   </dependencies>
-  
+
 </ivy-module>
diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml
index a86d665..bf50ca0 100644
--- a/src/plugin/lib-selenium/plugin.xml
+++ b/src/plugin/lib-selenium/plugin.xml
@@ -29,147 +29,65 @@
         <export name="*"/>
      </library>
      <!-- all classes from dependent libraries are exported -->
-     <library name="cglib-nodep-2.1_3.jar">
+     <library name="animal-sniffer-annotations-1.14.jar">
        <export name="*"/>
      </library>
-     <library name="commons-codec-1.10.jar">
+     <library name="byte-buddy-1.8.15.jar">
        <export name="*"/>
      </library>
-     <library name="commons-collections-3.2.1.jar">
+     <library name="checker-compat-qual-2.0.0.jar">
        <export name="*"/>
      </library>
      <library name="commons-exec-1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-io-2.4.jar">
+     <library name="error_prone_annotations-2.1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-jxpath-1.3.jar">
+     <library name="guava-25.0-jre.jar">
        <export name="*"/>
      </library>
-     <library name="commons-lang3-3.4.jar">
+     <library name="j2objc-annotations-1.1.jar">
        <export name="*"/>
      </library>
-     <library name="commons-logging-1.2.jar">
+     <library name="jsr305-1.3.9.jar">
        <export name="*"/>
      </library>
-     <library name="cssparser-0.9.16.jar">
+     <library name="okhttp-3.11.0.jar">
        <export name="*"/>
      </library>
-     <library name="gson-2.3.1.jar">
+     <library name="okio-1.14.0.jar">
        <export name="*"/>
      </library>
-     <library name="guava-18.0.jar">
+     <library name="selenium-api-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-2.18.jar">
+     <library name="selenium-chrome-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-core-js-2.17.jar">
+     <library name="selenium-edge-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpclient-4.5.1.jar">
+     <library name="selenium-firefox-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpcore-4.4.3.jar">
+     <library name="selenium-ie-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpmime-4.5.jar">
+     <library name="selenium-java-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="ini4j-0.5.2.jar">
+     <library name="selenium-opera-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-io-9.2.12.v20150709.jar">
+     <library name="selenium-remote-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-util-9.2.12.v20150709.jar">
+     <library name="selenium-safari-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jna-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="jna-platform-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="nekohtml-1.9.22.jar">
-       <export name="*"/>
-     </library>
-     <library name="netty-3.5.2.Final.jar">
-       <export name="*"/>
-     </library>
-     <library name="operadriver-1.5.jar">
-       <export name="*"/>
-     </library>
-     <library name="operalaunchers-1.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="phantomjsdriver-1.2.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="protobuf-java-2.4.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="sac-1.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-api-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-chrome-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-edge-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-firefox-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-htmlunit-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-ie-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-java-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-leg-rc-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-remote-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-safari-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-support-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="serializer-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="webbit-0.4.14.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-api-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-client-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-common-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="xalan-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="xercesImpl-2.11.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="xml-apis-1.4.01.jar">
+     <library name="selenium-support-3.141.5.jar">
        <export name="*"/>
      </library>
    </runtime>
-
 </plugin>
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 6e137f9..6af20b0 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -24,182 +24,274 @@
 import java.io.OutputStream;
 import java.net.URL;
 import java.util.concurrent.TimeUnit;
+import java.util.Random;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
+
 import org.openqa.selenium.By;
+import org.openqa.selenium.Capabilities;
 import org.openqa.selenium.OutputType;
 import org.openqa.selenium.TakesScreenshot;
 import org.openqa.selenium.TimeoutException;
 import org.openqa.selenium.WebDriver;
+
 import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+//import org.openqa.selenium.firefox.FirefoxBinary;
 import org.openqa.selenium.firefox.FirefoxDriver;
-import org.openqa.selenium.firefox.FirefoxProfile;
+//import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.firefox.FirefoxOptions;
+
 import org.openqa.selenium.io.TemporaryFilesystem;
+
 import org.openqa.selenium.remote.DesiredCapabilities;
 import org.openqa.selenium.remote.RemoteWebDriver;
-import org.openqa.selenium.safari.SafariDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+
+//import org.openqa.selenium.safari.SafariDriver;
+
+//import org.openqa.selenium.phantomjs.PhantomJSDriver;
+//import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.opera.core.systems.OperaDriver;
+import org.openqa.selenium.opera.OperaOptions;
+import org.openqa.selenium.opera.OperaDriver;
+//import com.opera.core.systems.OperaDriver;
 
 public class HttpWebClient {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
-
-    @Override
-    protected WebDriver initialValue()
-    {
-      FirefoxProfile profile = new FirefoxProfile();
-      profile.setPreference("permissions.default.stylesheet", 2);
-      profile.setPreference("permissions.default.image", 2);
-      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
-      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
-      WebDriver driver = new FirefoxDriver(profile);
-      return driver;          
-    };
-  };
-
   public static WebDriver getDriverForPage(String url, Configuration conf) {
-      WebDriver driver = null;
-      DesiredCapabilities capabilities = null;
-      long pageLoadWait = conf.getLong("page.load.delay", 3);
+    WebDriver driver = null;
+    long pageLoadWait = conf.getLong("page.load.delay", 3);
 
-      try {
-        String driverType  = conf.get("selenium.driver", "firefox");
-        switch (driverType) {
-          case "firefox":
-          	String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
-          	long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
-          	boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
-          	int loadImage = conf.getInt("selenium.firefox.load.image", 1);
-          	int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
-    		    FirefoxProfile profile = new FirefoxProfile();
-    		    FirefoxBinary binary = new FirefoxBinary();
-    		    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
-    		    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
-    		    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
-  	      	profile.setPreference("permissions.default.image", loadImage);
-    		    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
-            driver = new FirefoxDriver(binary, profile);
-            break;
-          case "chrome":
-            driver = new ChromeDriver();
-            break;
-          case "safari":
-            driver = new SafariDriver();
-            break;
-          case "opera":
-            driver = new OperaDriver();
-            break;
-          case "phantomjs":
-            driver = new PhantomJSDriver();
-            break;
-          case "remote":
-            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
-            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
-            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
-            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
-            String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
-            String seleniumGridBinary = conf.get("selenium.grid.binary");
+    try {
+      String driverType = conf.get("selenium.driver", "firefox");
+      boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless",
+          false);
 
-            switch (seleniumGridDriver){
-              case "firefox":
-                capabilities = DesiredCapabilities.firefox();
-                capabilities.setBrowserName("firefox");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability("firefox_binary",seleniumGridBinary);
-                System.setProperty("webdriver.reap_profile", "false");
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              case "phantomjs":
-                capabilities = DesiredCapabilities.phantomjs();
-                capabilities.setBrowserName("phantomjs");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              default:
-                LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
-                break;
-            }
-            break;
-          default:
-            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-            driver = new FirefoxDriver();
-            break;
+      switch (driverType) {
+      case "firefox":
+        String geckoDriverPath = conf.get("selenium.grid.binary",
+            "/root/geckodriver");
+        driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode);
+        break;
+      case "chrome":
+        String chromeDriverPath = conf.get("selenium.grid.binary",
+            "/root/chromedriver");
+        driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode);
+        break;
+      // case "opera":
+      // // This class is provided as a convenience for easily testing the
+      // Chrome browser.
+      // String operaDriverPath = conf.get("selenium.grid.binary",
+      // "/root/operadriver");
+      // driver = createOperaWebDriver(operaDriverPath, enableHeadlessMode);
+      // break;
+      case "remote":
+        String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+        int seleniumHubPort = Integer
+            .parseInt(conf.get("selenium.hub.port", "4444"));
+        String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+        String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+        URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost,
+            seleniumHubPort, seleniumHubPath);
+
+        String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");
+
+        switch (seleniumGridDriver) {
+        case "firefox":
+          driver = createFirefoxRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        case "chrome":
+          driver = createChromeRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        case "random":
+          driver = createRandomRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        default:
+          LOG.error(
+              "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().",
+              driverType);
+          driver = createDefaultRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
         }
-        LOG.debug("Selenium {} WebDriver selected.", driverType);
-  
-        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
-        driver.get(url);
-      } catch (Exception e) {
-			  if(e instanceof TimeoutException) {
-          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
-          return driver;
-			  }
-			  cleanUpDriver(driver);
-		    throw new RuntimeException(e);
-	    } 
+        break;
+      default:
+        LOG.error(
+            "The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().",
+            driverType);
+        FirefoxOptions options = new FirefoxOptions();
+        driver = new FirefoxDriver(options);
+        break;
+      }
+      LOG.debug("Selenium {} WebDriver selected.", driverType);
 
-      return driver;
+      driver.manage().timeouts().pageLoadTimeout(pageLoadWait,
+          TimeUnit.SECONDS);
+      driver.get(url);
+    } catch (Exception e) {
+      if (e instanceof TimeoutException) {
+        LOG.error(
+            "Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+        return driver;
+      } else {
+        LOG.error(e.toString());
+      }
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+
+    return driver;
   }
 
-  public static String getHTMLContent(WebDriver driver, Configuration conf) {
-      if (conf.getBoolean("take.screenshot", false)) {
-        takeScreenshot(driver, conf);
-      }
+  public static WebDriver createFirefoxWebDriver(String firefoxDriverPath,
+      boolean enableHeadlessMode) {
+    System.setProperty("webdriver.gecko.driver", firefoxDriverPath);
+    FirefoxOptions firefoxOptions = new FirefoxOptions();
+    if (enableHeadlessMode) {
+      firefoxOptions.addArguments("--headless");
+    }
+    WebDriver driver = new FirefoxDriver(firefoxOptions);
+    return driver;
+  }
 
-      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+  public static WebDriver createChromeWebDriver(String chromeDriverPath,
+      boolean enableHeadlessMode) {
+    // if not specified, WebDriver will search your path for chromedriver
+    System.setProperty("webdriver.chrome.driver", chromeDriverPath);
+    ChromeOptions chromeOptions = new ChromeOptions();
+    chromeOptions.addArguments("--no-sandbox");
+    chromeOptions.addArguments("--disable-extensions");
+    // be sure to set selenium.enable.headless to true if no monitor attached
+    // to your server
+    if (enableHeadlessMode) {
+      chromeOptions.addArguments("--headless");
+    }
+    WebDriver driver = new ChromeDriver(chromeOptions);
+    return driver;
+  }
+
+  public static WebDriver createOperaWebDriver(String operaDriverPath,
+      boolean enableHeadlessMode) {
+    // if not specified, WebDriver will search your path for operadriver
+    System.setProperty("webdriver.opera.driver", operaDriverPath);
+    OperaOptions operaOptions = new OperaOptions();
+    // operaOptions.setBinary("/usr/bin/opera");
+    operaOptions.addArguments("--no-sandbox");
+    operaOptions.addArguments("--disable-extensions");
+    // be sure to set selenium.enable.headless to true if no monitor attached
+    // to your server
+    if (enableHeadlessMode) {
+      operaOptions.addArguments("--headless");
+    }
+    WebDriver driver = new OperaDriver(operaOptions);
+    return driver;
+  }
+
+  public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    FirefoxOptions firefoxOptions = new FirefoxOptions();
+    if (enableHeadlessMode) {
+      firefoxOptions.setHeadless(true);
+    }
+    RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl,
+        firefoxOptions);
+    return driver;
+  }
+
+  public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    ChromeOptions chromeOptions = new ChromeOptions();
+    if (enableHeadlessMode) {
+      chromeOptions.setHeadless(true);
+    }
+    RemoteWebDriver driver = new RemoteWebDriver(seleniumHubUrl, chromeOptions);
+    return driver;
+  }
+
+  public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    // we consider a possibility of generating only 2 types of browsers: Firefox
+    // and
+    // Chrome only
+    Random r = new Random();
+    int min = 0;
+    // we have actually hardcoded the maximum number of types of web driver that
+    // can
+    // be created
+    // but this must be later moved to the configuration file in order to be
+    // able
+    // to randomly choose between much more types(ex: Edge, Opera, Safari)
+    int max = 1; // for 3 types, change to 2 and update the if-clause
+    int num = r.nextInt((max - min) + 1) + min;
+    if (num == 0) {
+      return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
+    }
+
+    return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
+  }
+
+  public static RemoteWebDriver createDefaultRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
   }
 
   public static void cleanUpDriver(WebDriver driver) {
     if (driver != null) {
       try {
-	      driver.close();
+        // driver.close();
         driver.quit();
         TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
       } catch (Exception e) {
-        throw new RuntimeException(e);
+        LOG.error(e.toString());
+        // throw new RuntimeException(e);
       }
     }
   }
 
   /**
-   * Function for obtaining the HTML BODY using the selected
-   * <a href='https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium webdriver</a>
-   * There are a number of configuration properties within
-   * <code>nutch-site.xml</code> which determine whether to
-   * take screenshots of the rendered pages and persist them
-   * as timestamped .png's into HDFS.
-   * @param url the URL to fetch and render
-   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * Function for obtaining the HTML BODY using the selected <a href=
+   * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
+   * webdriver</a> There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to take screenshots of
+   * the rendered pages and persist them as timestamped .png's into HDFS.
+   * 
+   * @param url
+   *          the URL to fetch and render
+   * @param conf
+   *          the {@link org.apache.hadoop.conf.Configuration}
    * @return the rendered inner HTML page
    */
   public static String getHtmlPage(String url, Configuration conf) {
     WebDriver driver = getDriverForPage(url, conf);
-    
+
     try {
       if (conf.getBoolean("take.screenshot", false)) {
         takeScreenshot(driver, conf);
       }
 
-      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      String innerHtml = driver.findElement(By.tagName("body"))
+          .getAttribute("innerHTML");
       return innerHtml;
 
-      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+      // I'm sure this catch statement is a code smell ; borrowing it from
+      // lib-htmlunit
     } catch (Exception e) {
       TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      // throw new RuntimeException(e);
+      LOG.error("getHtmlPage(url, conf): " + e.toString());
       throw new RuntimeException(e);
     } finally {
       cleanUpDriver(driver);
@@ -213,24 +305,32 @@
   private static void takeScreenshot(WebDriver driver, Configuration conf) {
     try {
       String url = driver.getCurrentUrl();
-      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      File srcFile = ((TakesScreenshot) driver)
+          .getScreenshotAs(OutputType.FILE);
       LOG.debug("In-memory screenshot taken of: {}", url);
       FileSystem fs = FileSystem.get(conf);
       if (conf.get("screenshot.location") != null) {
-        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
+        Path screenshotPath = new Path(
+            conf.get("screenshot.location") + "/" + srcFile.getName());
         OutputStream os = null;
         if (!fs.exists(screenshotPath)) {
-          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+          LOG.debug(
+              "No existing screenshot already exists... creating new file at {} {}.",
+              screenshotPath, srcFile.getName());
           os = fs.create(screenshotPath);
         }
         InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
         IOUtils.copyBytes(is, os, conf);
-        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url,
+            screenshotPath, srcFile.getName());
       } else {
-        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
-            + "'screenshot.location' is absent from nutch-site.xml.", url);
+        LOG.warn(
+            "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+                + "'screenshot.location' is absent from nutch-site.xml.",
+            url);
       }
     } catch (Exception e) {
+      LOG.error("Error taking screenshot: ", e);
       cleanUpDriver(driver);
       throw new RuntimeException(e);
     }
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index c84489a..4e7ef14 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -80,35 +80,32 @@
             String name = nameNode.getNodeValue().toLowerCase();
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
+              String directives = contentNode.getNodeValue().toLowerCase();
+              int index = directives.indexOf("none");
 
-              if (contentNode != null) {
-                String directives = contentNode.getNodeValue().toLowerCase();
-                int index = directives.indexOf("none");
+              if (index >= 0) {
+                metaTags.setNoIndex();
+                metaTags.setNoFollow();
+              }
 
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                  metaTags.setNoFollow();
-                }
+              index = directives.indexOf("all");
+              if (index >= 0) {
+                // do nothing...
+              }
 
-                index = directives.indexOf("all");
-                if (index >= 0) {
-                  // do nothing...
-                }
+              index = directives.indexOf("noindex");
+              if (index >= 0) {
+                metaTags.setNoIndex();
+              }
 
-                index = directives.indexOf("noindex");
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                }
+              index = directives.indexOf("nofollow");
+              if (index >= 0) {
+                metaTags.setNoFollow();
+              }
 
-                index = directives.indexOf("nofollow");
-                if (index >= 0) {
-                  metaTags.setNoFollow();
-                }
-
-                index = directives.indexOf("noarchive");
-                if (index >= 0) {
-                  metaTags.setNoCache();
-                }
+              index = directives.indexOf("noarchive");
+              if (index >= 0) {
+                metaTags.setNoCache();
               }
 
             } // end if (name == robots)
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index f8bbae1..fbf7207 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -15,4 +15,23 @@
       <!-- end of dependencies of Tika (tika-parsers) -->
    with the output of the command above.
 
+4. (Optionally) remove overlapping dependencies between parse-tika and Nutch core dependencies:
+   - check for libs present both in
+       build/lib
+     and
+       build/plugins/parse-tika/
+     (possibly with different versions)
+   - duplicated libs can be added to the exclusions of transitive dependencies in
+       build/plugins/parse-tika/ivy.xml
+   - but it should be made sure that the library versions in ivy/ivy.xml correspond to
+     those required by Tika
+
+5. Remove the locally "installed" dependencies in src/plugin/parse-tika/lib/:
+
+    $ rm -rf lib/
+
+6. Build Nutch and run all unit tests:
+
+    $ cd ../../../
+    $ ant clean runtime test
 
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index 53c7775..df06f14 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,14 +36,24 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.19.1" conf="*->default">
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default">
+      <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
+      <exclude org="commons-lang" name="commons-lang" />
+      <exclude org="org.apache.commons" name="commons-lang3" />
+      <exclude org="org.apache.commons" name="commons-codec" />
+      <exclude org="commons-codec" name="commons-codec" /><!-- older versions are published with org=commons-codec -->
+      <exclude org="org.apache.commons" name="commons-collections4" />
+      <exclude org="org.apache.commons" name="commons-compress" />
+      <exclude org="org.apache.cxf" name="cxf-core" />
+      <exclude org="org.apache.cxf" name="cxf-rt-transports-http" />
+      <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+      <exclude org="com.google.protobuf" name="protobuf-java" />
       <exclude org="org.slf4j" name="slf4j-log4j12" />
       <exclude org="org.slf4j" name="slf4j-api" />
-      <exclude org="commons-lang" name="commons-lang" />
-      <exclude org="com.google.protobuf" name="protobuf-java" />
     </dependency>
   </dependencies>
   
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index 7dbe180..b89f41e 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,10 +26,9 @@
          <export name="*"/>
       </library>
       <!-- dependencies of Tika (tika-parsers) -->
-      <library name="activation-1.1.1.jar"/>
       <library name="apache-mime4j-core-0.8.2.jar"/>
       <library name="apache-mime4j-dom-0.8.2.jar"/>
-      <library name="asm-6.2.jar"/>
+      <library name="asm-7.0.jar"/>
       <library name="bcmail-jdk15on-1.60.jar"/>
       <library name="bcpkix-jdk15on-1.60.jar"/>
       <library name="bcprov-jdk15on-1.60.jar"/>
@@ -37,22 +36,22 @@
       <library name="bzip2-0.9.1.jar"/>
       <library name="c3p0-0.9.1.1.jar"/>
       <library name="cdm-4.5.5.jar"/>
-      <library name="commons-codec-1.11.jar"/>
       <library name="commons-collections4-4.2.jar"/>
       <library name="commons-compress-1.18.jar"/>
-      <library name="commons-csv-1.5.jar"/>
+      <library name="commons-csv-1.6.jar"/>
       <library name="commons-exec-1.3.jar"/>
       <library name="commons-io-2.6.jar"/>
-      <library name="commons-logging-1.2.jar"/>
-      <library name="curvesapi-1.04.jar"/>
-      <library name="cxf-core-3.2.6.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.2.6.jar"/>
-      <library name="cxf-rt-rs-client-3.2.6.jar"/>
-      <library name="cxf-rt-transports-http-3.2.6.jar"/>
+      <library name="commons-lang3-3.8.1.jar"/>
+      <library name="commons-math3-3.6.1.jar"/>
+      <library name="curvesapi-1.05.jar"/>
+      <library name="cxf-core-3.2.7.jar"/>
+      <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/>
+      <library name="cxf-rt-rs-client-3.2.7.jar"/>
+      <library name="cxf-rt-transports-http-3.2.7.jar"/>
       <library name="dec-0.1.2.jar"/>
       <library name="ehcache-core-2.6.2.jar"/>
-      <library name="FastInfoset-1.2.13.jar"/>
-      <library name="fontbox-2.0.12.jar"/>
+      <library name="FastInfoset-1.2.15.jar"/>
+      <library name="fontbox-2.0.13.jar"/>
       <library name="geoapi-3.0.1.jar"/>
       <library name="grib-4.5.5.jar"/>
       <library name="gson-2.8.5.jar"/>
@@ -60,19 +59,19 @@
       <library name="httpmime-4.5.6.jar"/>
       <library name="httpservices-4.5.5.jar"/>
       <library name="isoparser-1.1.22.jar"/>
-      <library name="istack-commons-runtime-3.0.5.jar"/>
+      <library name="istack-commons-runtime-3.0.7.jar"/>
       <library name="jackcess-2.1.12.jar"/>
       <library name="jackcess-encrypt-2.1.4.jar"/>
-      <library name="jackson-annotations-2.9.6.jar"/>
-      <library name="jackson-core-2.9.6.jar"/>
-      <library name="jackson-databind-2.9.6.jar"/>
+      <library name="jackson-annotations-2.9.7.jar"/>
+      <library name="jackson-core-2.9.7.jar"/>
+      <library name="jackson-databind-2.9.7.jar"/>
       <library name="jai-imageio-core-1.4.0.jar"/>
       <library name="java-libpst-0.8.1.jar"/>
-      <library name="javax.annotation-api-1.3.jar"/>
-      <library name="javax.ws.rs-api-2.1.jar"/>
-      <library name="jaxb-api-2.3.0.jar"/>
-      <library name="jaxb-core-2.3.0.1.jar"/>
-      <library name="jaxb-runtime-2.3.0.1.jar"/>
+      <library name="javax.activation-1.2.0.jar"/>
+      <library name="javax.annotation-api-1.3.2.jar"/>
+      <library name="javax.ws.rs-api-2.1.1.jar"/>
+      <library name="jaxb-api-2.3.1.jar"/>
+      <library name="jaxb-runtime-2.3.1.jar"/>
       <library name="jbig2-imageio-3.0.2.jar"/>
       <library name="jcip-annotations-1.0.jar"/>
       <library name="jcl-over-slf4j-1.7.25.jar"/>
@@ -81,7 +80,7 @@
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
-      <library name="jna-4.3.0.jar"/>
+      <library name="jna-5.1.0.jar"/>
       <library name="joda-time-2.2.jar"/>
       <library name="json-simple-1.1.1.jar"/>
       <library name="jsoup-1.11.3.jar"/>
@@ -92,16 +91,18 @@
       <library name="netcdf4-4.5.5.jar"/>
       <library name="openjson-1.0.10.jar"/>
       <library name="opennlp-tools-1.9.0.jar"/>
-      <library name="parso-2.0.9.jar"/>
-      <library name="pdfbox-2.0.12.jar"/>
-      <library name="pdfbox-tools-2.0.12.jar"/>
-      <library name="poi-4.0.0.jar"/>
-      <library name="poi-ooxml-4.0.0.jar"/>
-      <library name="poi-ooxml-schemas-4.0.0.jar"/>
-      <library name="poi-scratchpad-4.0.0.jar"/>
+      <library name="parso-2.0.10.jar"/>
+      <library name="pdfbox-2.0.13.jar"/>
+      <library name="pdfbox-tools-2.0.13.jar"/>
+      <library name="poi-4.0.1.jar"/>
+      <library name="poi-ooxml-4.0.1.jar"/>
+      <library name="poi-ooxml-schemas-4.0.1.jar"/>
+      <library name="poi-scratchpad-4.0.1.jar"/>
+      <library name="procyon-compilertools-0.5.32.jar"/>
+      <library name="procyon-core-0.5.32.jar"/>
       <library name="quartz-2.2.0.jar"/>
-      <library name="rome-1.5.1.jar"/>
-      <library name="rome-utils-1.5.1.jar"/>
+      <library name="rome-1.12.0.jar"/>
+      <library name="rome-utils-1.12.0.jar"/>
       <library name="sentiment-analysis-parser-0.1.jar"/>
       <library name="sis-feature-0.8.jar"/>
       <library name="sis-metadata-0.8.jar"/>
@@ -109,19 +110,19 @@
       <library name="sis-referencing-0.8.jar"/>
       <library name="sis-storage-0.8.jar"/>
       <library name="sis-utility-0.8.jar"/>
-      <library name="stax2-api-4.1.jar"/>
-      <library name="stax-ex-1.7.8.jar"/>
+      <library name="stax2-api-3.1.4.jar"/>
+      <library name="stax-ex-1.8.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.19.1.jar"/>
-      <library name="txw2-2.3.0.1.jar"/>
+      <library name="tika-parsers-1.20.jar"/>
+      <library name="txw2-2.3.1.jar"/>
       <library name="udunits-4.5.5.jar"/>
-      <library name="uimafit-core-2.2.0.jar"/>
-      <library name="uimaj-core-2.9.0.jar"/>
+      <library name="uimafit-core-2.4.0.jar"/>
+      <library name="uimaj-core-3.0.1.jar"/>
       <library name="unit-api-1.0.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
-      <library name="woodstox-core-5.1.0.jar"/>
-      <library name="xmlbeans-3.0.1.jar"/>
+      <library name="woodstox-core-5.0.3.jar"/>
+      <library name="xmlbeans-3.0.2.jar"/>
       <library name="xmlschema-core-2.2.3.jar"/>
       <library name="xmpcore-5.1.3.jar"/>
       <library name="xz-1.8.jar"/>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 9359575..58f93ac 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -81,35 +81,32 @@
             String name = nameNode.getNodeValue().toLowerCase();
             metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
             if ("robots".equals(name)) {
+              String directives = contentNode.getNodeValue().toLowerCase();
+              int index = directives.indexOf("none");
 
-              if (contentNode != null) {
-                String directives = contentNode.getNodeValue().toLowerCase();
-                int index = directives.indexOf("none");
+              if (index >= 0) {
+                metaTags.setNoIndex();
+                metaTags.setNoFollow();
+              }
 
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                  metaTags.setNoFollow();
-                }
+              index = directives.indexOf("all");
+              if (index >= 0) {
+                // do nothing...
+              }
 
-                index = directives.indexOf("all");
-                if (index >= 0) {
-                  // do nothing...
-                }
+              index = directives.indexOf("noindex");
+              if (index >= 0) {
+                metaTags.setNoIndex();
+              }
 
-                index = directives.indexOf("noindex");
-                if (index >= 0) {
-                  metaTags.setNoIndex();
-                }
+              index = directives.indexOf("nofollow");
+              if (index >= 0) {
+                metaTags.setNoFollow();
+              }
 
-                index = directives.indexOf("nofollow");
-                if (index >= 0) {
-                  metaTags.setNoFollow();
-                }
-
-                index = directives.indexOf("noarchive");
-                if (index >= 0) {
-                  metaTags.setNoCache();
-                }
+              index = directives.indexOf("noarchive");
+              if (index >= 0) {
+                metaTags.setNoCache();
               }
 
             } // end if (name == robots)
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index e346940..7440333 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -42,6 +42,7 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.html.HtmlMapper;
@@ -70,6 +71,8 @@
   private String cachingPolicy;
   private HtmlMapper HTMLMapper;
   private boolean upperCaseElementNames = true;
+  private String boilerpipeExtractorName;
+  private boolean useBoilerpipe;
 
   public ParseResult getParse(Content content) {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -83,59 +86,59 @@
   ParseResult getParse(Content content, HTMLDocumentImpl doc,
       DocumentFragment root) {
     String mimeType = content.getContentType();
-    
-    boolean useBoilerpipe = getConf().get("tika.extractor", "none").equals("boilerpipe");
-    String boilerpipeExtractorName = getConf().get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
 
     URL base;
     try {
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
+      return new ParseStatus(e).getEmptyParseResult(content.getUrl(),
+          getConf());
     }
 
     // get the right parser using the mime type as a clue
-    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
-    byte[] raw = content.getContent();
-
+    CompositeParser compositeParser = (CompositeParser) tikaConfig.getParser();
+    Parser parser = compositeParser.getParsers().get(MediaType.parse(mimeType));
     if (parser == null) {
       String message = "Can't retrieve Tika parser for mime-type " + mimeType;
       LOG.error(message);
-      return new ParseStatus(ParseStatus.FAILED, message).getEmptyParseResult(
-          content.getUrl(), getConf());
+      return new ParseStatus(ParseStatus.FAILED, message)
+          .getEmptyParseResult(content.getUrl(), getConf());
     }
 
-    LOG.debug("Using Tika parser " + parser.getClass().getName()
-        + " for mime-type " + mimeType);
+    LOG.debug("Using Tika parser {} for mime-type {}.",
+        parser.getClass().getName(), mimeType);
 
+    byte[] raw = content.getContent();
     Metadata tikamd = new Metadata();
 
     ContentHandler domHandler;
-    
+
     // Check whether to use Tika's BoilerplateContentHandler
     if (useBoilerpipe) {
-      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler((ContentHandler)new DOMBuilder(doc, root),
-      BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
+      BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
+          (ContentHandler) new DOMBuilder(doc, root),
+          BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
       bpHandler.setIncludeMarkup(true);
-      domHandler = (ContentHandler)bpHandler;
+      domHandler = (ContentHandler) bpHandler;
     } else {
       DOMBuilder domBuilder = new DOMBuilder(doc, root);
       domBuilder.setUpperCaseElementNames(upperCaseElementNames);
       domBuilder.setDefaultNamespaceURI(XHTMLContentHandler.XHTML);
-      domHandler = (ContentHandler)domBuilder;
+      domHandler = (ContentHandler) domBuilder;
     }
 
     LinkContentHandler linkContentHandler = new LinkContentHandler();
 
     ParseContext context = new ParseContext();
-    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler, linkContentHandler);
-    
+    TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler,
+        linkContentHandler);
+
     if (HTMLMapper != null)
       context.set(HtmlMapper.class, HTMLMapper);
     tikamd.set(Metadata.CONTENT_TYPE, mimeType);
     try {
-      parser.parse(new ByteArrayInputStream(raw), (ContentHandler)teeContentHandler, tikamd, context);
+      parser.parse(new ByteArrayInputStream(raw),
+          (ContentHandler) teeContentHandler, tikamd, context);
     } catch (Exception e) {
       LOG.error("Error parsing " + content.getUrl(), e);
       return new ParseStatus(ParseStatus.FAILED, e.getMessage())
@@ -186,16 +189,16 @@
       if (LOG.isTraceEnabled()) {
         LOG.trace("Getting links (base URL = {}) ...", baseTag);
       }
-      
+
       // pre-1233 outlink extraction
-      //utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
+      // utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
       // Get outlinks from Tika
       List<Link> tikaExtractedOutlinks = linkContentHandler.getLinks();
       utils.getOutlinks(baseTag, l, tikaExtractedOutlinks);
       outlinks = l.toArray(new Outlink[l.size()]);
       if (LOG.isTraceEnabled()) {
-        LOG.trace("found " + outlinks.length + " outlinks in "
-            + content.getUrl());
+        LOG.trace(
+            "found " + outlinks.length + " outlinks in " + content.getUrl());
       }
     }
 
@@ -251,7 +254,8 @@
         // see if a Tika config file can be found in the job file
         URL customTikaConfig = conf.getResource(customConfFile);
         if (customTikaConfig != null)
-          tikaConfig = new TikaConfig(customTikaConfig, this.getClass().getClassLoader());
+          tikaConfig = new TikaConfig(customTikaConfig,
+              this.getClass().getClassLoader());
       } catch (Exception e1) {
         String message = "Problem loading custom Tika configuration from "
             + customConfFile;
@@ -277,20 +281,26 @@
           throw new RuntimeException("Class " + htmlmapperClassName
               + " does not implement HtmlMapper");
         }
-        HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor().newInstance();
+        HTMLMapper = (HtmlMapper) HTMLMapperClass.getConstructor()
+            .newInstance();
       } catch (Exception e) {
-        LOG.error("Can't generate instance for class " + htmlmapperClassName);
-        throw new RuntimeException("Can't generate instance for class "
-            + htmlmapperClassName);
+        String message = "Can't generate instance for class "
+            + htmlmapperClassName;
+        LOG.error(message);
+        throw new RuntimeException(message);
       }
     }
 
-    this.htmlParseFilters = new HtmlParseFilters(getConf());
-    this.utils = new DOMContentUtils(conf);
-    this.cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+    htmlParseFilters = new HtmlParseFilters(getConf());
+    utils = new DOMContentUtils(conf);
+    cachingPolicy = getConf().get("parser.caching.forbidden.policy",
         Nutch.CACHING_FORBIDDEN_CONTENT);
-    this.upperCaseElementNames = getConf().getBoolean(
-        "tika.uppercase.element.names", true);
+    upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+        true);
+    useBoilerpipe = getConf().get("tika.extractor", "none")
+        .equals("boilerpipe");
+    boilerpipeExtractorName = getConf()
+        .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
   }
 
   public Configuration getConf() {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 8de2f59..25efb5e 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -154,7 +154,7 @@
         socket = sslsocket;
       }
 
-      if (sockAddr != null && http.isStoreIPAddress()) {
+      if (http.isStoreIPAddress()) {
         headers.add("_ip_", sockAddr.getAddress().getHostAddress());
       }
 
@@ -452,7 +452,7 @@
     byte[] bytes = new byte[Http.BUFFER_SIZE];
     ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
 
-    while (!doneChunks) {
+    while (true) {
       if (Http.LOG.isTraceEnabled()) {
         Http.LOG.trace("Http: starting chunk");
       }
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 4f3ae28..22183c0 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -236,34 +236,34 @@
       String httpProtocol = response.protocol().toString()
           .toUpperCase(Locale.ROOT);
       if (useHttp2 && "H2".equals(httpProtocol)) {
-        // back-warc compatible protocol name
+        // backward-compatible protocol name
         httpProtocol = "HTTP/2";
       }
 
-      StringBuilder resquestverbatim = null;
+      StringBuilder requestverbatim = null;
       StringBuilder responseverbatim = null;
 
       if (storeHttpRequest) {
-        resquestverbatim = new StringBuilder();
+        requestverbatim = new StringBuilder();
 
-        resquestverbatim.append(request.method()).append(' ');
-        resquestverbatim.append(request.url().encodedPath());
+        requestverbatim.append(request.method()).append(' ');
+        requestverbatim.append(request.url().encodedPath());
         String query = request.url().encodedQuery();
         if (query != null) {
-          resquestverbatim.append('?').append(query);
+          requestverbatim.append('?').append(query);
         }
-        resquestverbatim.append(' ').append(httpProtocol).append("\r\n");
+        requestverbatim.append(' ').append(httpProtocol).append("\r\n");
 
         Headers headers = request.headers();
 
         for (int i = 0, size = headers.size(); i < size; i++) {
           String key = headers.name(i);
           String value = headers.value(i);
-          resquestverbatim.append(key).append(": ").append(value)
+          requestverbatim.append(key).append(": ").append(value)
               .append("\r\n");
         }
 
-        resquestverbatim.append("\r\n");
+        requestverbatim.append("\r\n");
       }
 
       if (storeHttpHeaders) {
@@ -294,9 +294,9 @@
         builder = builder.header(Response.IP_ADDRESS, ipAddress);
       }
 
-      if (resquestverbatim != null) {
+      if (requestverbatim != null) {
         byte[] encodedBytesRequest = Base64.getEncoder()
-            .encode(resquestverbatim.toString().getBytes());
+            .encode(requestverbatim.toString().getBytes());
         builder = builder.header(Response.REQUEST,
             new String(encodedBytesRequest));
       }
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index 4d88f74..0d32e19 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -72,6 +72,9 @@
   public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
       ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
       CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    if (targets.isEmpty()) {
+      return adjust;
+    }
     String depthString = parseData.getMeta(DEPTH_KEY);
     if (depthString == null) {
       LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
index 7900259..9c7a3f3 100644
--- a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
+++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
@@ -62,10 +62,14 @@
   /**
    * Used for orphan control.
    *
-   * @param Text url of the record
-   * @param CrawlDatum old CrawlDatum
-   * @param CrawlDatum new CrawlDatum
-   * @param List<CrawlDatum> list of inlinked CrawlDatums
+   * @param url
+   *          of the record
+   * @param old
+   *          CrawlDatum
+   * @param datum
+   *          new CrawlDatum
+   * @param inLinks
+   *          list of inlinked CrawlDatums
    * @return void
    */
   public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
diff --git a/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
index b82829d..f49a996 100644
--- a/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
+++ b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
@@ -91,7 +91,7 @@
             + CrawlDatum.getStatusName(datum.getStatus()),
         CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
 
-    // Wait until mark.gone.after
+    // Wait until scoring.orphan.mark.gone.after
     try {
       Thread.sleep(5000);
     } catch (Exception e) {
@@ -106,7 +106,7 @@
             + CrawlDatum.getStatusName(datum.getStatus()),
         CrawlDatum.STATUS_DB_GONE, datum.getStatus());
 
-    // Wait until mark.orphan.after
+    // Wait until scoring.orphan.mark.orphan.after
     try {
       Thread.sleep(5000);
     } catch (Exception e) {
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
index 13064eb..8478390 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
@@ -69,6 +69,11 @@
    * SubCollection blacklist as String
    */
   String blString;
+  
+  /**
+   * Whether the white and black lists are case sensitive
+   */
+  boolean caseInsensitive = false;
 
   /**
    * public Constructor
@@ -95,10 +100,12 @@
     this.id = id;
     this.key = key;
     this.name = name;
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
 
   public Subcollection(Configuration conf) {
     super(conf);
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
 
   /**
@@ -231,7 +238,11 @@
 
     while (st.hasMoreElements()) {
       String line = (String) st.nextElement();
-      list.add(line.trim());
+      line = line.trim();
+      if (caseInsensitive) {
+        line = line.toLowerCase();
+      }
+      list.add(line);
     }
   }
 
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index 898d314..767d54d 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -36,6 +36,7 @@
     IndexingFilter {
 
   private Configuration conf;
+  private boolean caseInsensitive = false;
 
   public SubcollectionIndexingFilter() {
     super(NutchConfiguration.create());
@@ -52,7 +53,9 @@
     this.conf = conf;
     fieldName = conf.get("subcollection.default.fieldname", "subcollection");
     metadataSource = conf.get("subcollection.metadata.source", "subcollection");
+    caseInsensitive = conf.getBoolean("subcollection.case.insensitive", false);
   }
+  
 
   /**
    * @return Configuration
@@ -102,6 +105,9 @@
     }
     
     String sUrl = url.toString();
+    if (caseInsensitive) {
+      sUrl = sUrl.toLowerCase();
+    }
     addSubCollectionField(doc, sUrl);
     return doc;
   }
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.rules b/src/plugin/urlfilter-regex/sample/Benchmarks.rules
index c8901e2..6a85118 100644
--- a/src/plugin/urlfilter-regex/sample/Benchmarks.rules
+++ b/src/plugin/urlfilter-regex/sample/Benchmarks.rules
@@ -9,18 +9,18 @@
 # matches, the URL is ignored.
 
 # skip file:, ftp:, & mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
 
 # skip .fr .org and .net domains
--^.*//.*\.fr/
--^.*//.*\.org/
--^.*//.*\.net/
+-^[^/]*//[^/]*\.fr/
+-^[^/]*//[^/]*\.org/
+-^[^/]*//[^/]*\.net/
 
-# skip everything else
+# accept everything else
 +.
diff --git a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
index 705bdb2..e651dd5 100644
--- a/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
+++ b/src/plugin/urlfilter-regex/sample/IntranetCrawling.rules
@@ -9,10 +9,10 @@
 # matches, the URL is ignored.
 
 # skip file:, ftp:, & mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
@@ -21,7 +21,7 @@
 -.*(/.+?)/.*?\1/.*?\1/
 
 # accept hosts in MY.DOMAIN.NAME
-+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/
++^https?://(?:[a-z0-9]*\.)*MY.DOMAIN.NAME/
 
 # skip everything else
 -.
diff --git a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
index 8778921..ac9ad60 100644
--- a/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
+++ b/src/plugin/urlfilter-regex/sample/WholeWebCrawling.rules
@@ -7,10 +7,10 @@
 # matches, the URL is ignored.
 
 # skip file: ftp: and mailto: urls
--^(file|ftp|mailto):
+-^(?:file|ftp|mailto):
 
 # skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)$
+-(?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe)$
 
 # skip URLs containing certain characters as probable queries, etc.
 -[?*!@=]
diff --git a/src/test/org/apache/nutch/crawl/TestGenerator.java b/src/test/org/apache/nutch/crawl/TestGenerator.java
index 9a21146..ad05f21 100644
--- a/src/test/org/apache/nutch/crawl/TestGenerator.java
+++ b/src/test/org/apache/nutch/crawl/TestGenerator.java
@@ -131,7 +131,7 @@
   }
 
   /**
-   * Test that generator obeys the property "generate.max.per.host".
+   * Test that generator obeys the property "generate.max.count".
    * 
    * @throws Exception
    */
@@ -195,8 +195,8 @@
   }
 
   /**
-   * Test that generator obeys the property "generator.max.count" and
-   * "generator.count.per.domain".
+   * Test that generator obeys the property "generate.max.count" and
+   * "generate.count.mode".
    * 
    * @throws Exception
    */
diff --git a/src/test/org/apache/nutch/protocol/TestProtocolFactory.java b/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
index 394c303..7cab623 100644
--- a/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
+++ b/src/test/org/apache/nutch/protocol/TestProtocolFactory.java
@@ -59,12 +59,6 @@
       Assert.fail("Must not throw any other exception");
     }
 
-    // cache key
-    Object protocol = ObjectCache.get(conf).getObject(
-        Protocol.X_POINT_ID + "http");
-    Assert.assertNotNull(protocol);
-    Assert.assertEquals(httpProtocol, protocol);
-
     // test same object instance
     try {
       Assert.assertTrue(httpProtocol == factory.getProtocol("http://somehost"));