Merge pull request #440 from sebastian-nagel/NUTCH-2696-segment-reader-output-charset

NUTCH-2696 Nutch SegmentReader does not dump non-ASCII characters with Hadoop 3.x
diff --git a/.gitignore b/.gitignore
index 732ca05..61e42e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@
 ivy/ivy-2.4.0.jar
 ivy/ivy-2.5.0-rc1.jar
 naivebayes-model
+.gitconfig
diff --git a/build.xml b/build.xml
index 65e8f3f..f50395e 100644
--- a/build.xml
+++ b/build.xml
@@ -192,6 +192,7 @@
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+      <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
       <packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -230,6 +231,7 @@
       <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -350,6 +352,17 @@
     <artifact:mvn>
       <arg value="test"/>
       <arg value="-e"/>
+      <arg value="-o"/>
+      <!-- run offline (-o): must not download dependencies as this is
+           done from http://repo1.maven.org/ hardwired in
+           maven-ant-tasks-2.1.3.jar, see NUTCH-2722.
+
+           Dependencies and plugins need to be resolved and cached locally beforehand
+           by running
+             `mvn dependency:resolve`
+           resp.
+             `mvn dependency:resolve-plugins`
+           after the pom.xml has been generated. -->
     </artifact:mvn>
   </target>
 
@@ -688,6 +701,7 @@
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic-rest/src/java/"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java/" />
+      <packageset dir="${plugins.dir}/indexer-kafka/src/java/" />
       <packageset dir="${plugins.dir}/indexer-rabbit/src/java"/>
       <packageset dir="${plugins.dir}/indexer-solr/src/java"/>
       <packageset dir="${plugins.dir}/language-identifier/src/java"/>
@@ -726,6 +740,7 @@
       <packageset dir="${plugins.dir}/urlfilter-automaton/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domain/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-domainblacklist/src/java"/>
+      <packageset dir="${plugins.dir}/urlfilter-fast/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-ignoreexempt/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-prefix/src/java"/>
       <packageset dir="${plugins.dir}/urlfilter-regex/src/java"/>
@@ -999,32 +1014,6 @@
     </rat:report>
   </target>
 
-  <!-- ================================================================== -->
-  <!-- SONAR targets                                                      -->
-  <!-- ================================================================== -->
-
-  <!-- Define the Sonar task if this hasn't been done in a common script -->
-  <taskdef uri="antlib:org.sonar.ant" resource="org/sonar/ant/antlib.xml">
-    <classpath path="${ant.library.dir}"/>
-    <classpath path="${mysql.library.dir}"/>
-  </taskdef>
-
-  <!-- Add the target -->
-  <target name="sonar" description="--> run SONAR analysis">
-
-    <!-- list of mandatory source directories (required) -->
-    <property name="sonar.sources" value="${src.dir}"/>
-
-    <!-- list of properties (optional) -->
-    <property name="sonar.projectName" value="Nutch Trunk 1.4 Sonar Analysis" />
-    <property name="sonar.binaries" value="${build.dir}/classes" />
-    <property name="sonar.binaries" value="${build.dir}/plugins" />
-    <property name="sonar.tests" value="${test.src.dir}" />
-
-    <sonar:sonar workDir="${base.dir}" key="org.apache.nutch:trunk"
-     version="1.4-SNAPSHOT" xmlns:sonar="antlib:org.sonar.ant"/>
-  </target>
-
 
   <!-- ================================================================== -->
   <!-- Eclipse targets                                                    -->
@@ -1114,6 +1103,7 @@
         <source path="${plugins.dir}/indexer-elastic-rest/src/java/"/>
         <source path="${plugins.dir}/indexer-elastic/src/java/" />
         <source path="${plugins.dir}/indexer-elastic/src/test/" />
+        <source path="${plugins.dir}/indexer-kafka/src/java/" />
         <source path="${plugins.dir}/indexer-rabbit/src/java/" />
         <source path="${plugins.dir}/indexer-solr/src/java/" />
         <source path="${plugins.dir}/language-identifier/src/java/" />
@@ -1173,6 +1163,8 @@
         <source path="${plugins.dir}/urlfilter-domain/src/test/" />
         <source path="${plugins.dir}/urlfilter-domainblacklist/src/java/" />
         <source path="${plugins.dir}/urlfilter-domainblacklist/src/test/" />
+        <source path="${plugins.dir}/urlfilter-fast/src/java/"/>
+        <source path="${plugins.dir}/urlfilter-fast/src/test/"/>
         <source path="${plugins.dir}/urlfilter-ignoreexempt/src/java/" />
         <source path="${plugins.dir}/urlfilter-prefix/src/java/" />
         <source path="${plugins.dir}/urlfilter-prefix/src/test/" />
diff --git a/conf/cookies.txt b/conf/cookies.txt
new file mode 100644
index 0000000..f75f220
--- /dev/null
+++ b/conf/cookies.txt
@@ -0,0 +1,3 @@
+# Optional per-host configurable cookies. Format:
+#
+# <host>\t<cookie>
diff --git a/conf/fast-urlfilter.txt.template b/conf/fast-urlfilter.txt.template
new file mode 100644
index 0000000..99bb5c9
--- /dev/null
+++ b/conf/fast-urlfilter.txt.template
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Rule file for the plugin urlfilter-fast
+
+# Used to filter a large number of domain and host-specific regular
+# expressions
+
+#
+# `Domain` rules are applied to all hosts and subdomains of a domain, e.g.
+#
+#   Domain example.org
+#     DenyPath (?i)%7c                # matches against just the path part of URL
+#     DenyPathQuery ^/resource\?x=1   # matches against path + query
+#
+#
+# To match against a single hostname:
+#
+#   Host www.example.com
+#     DenyPath (?i)%7c
+#
+#
+# Global rules are defined using the domain name `.`:
+#
+#   Domain .
+#     (/[^/]+)/[^/]+\1/[^/]+\1/
+#     # skips URLs with slash-delimited segment that repeats 3+ times, to break loops
+#
+#
+# Comments start with the `#` character and reach until the end of the line.
+#
+#
+# For more details, see
+#  - src/plugin/urlfilter-fast/README.md
+#  - src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
+#
+
diff --git a/conf/index-writers.xml.template b/conf/index-writers.xml.template
index eaa5870..268554a 100644
--- a/conf/index-writers.xml.template
+++ b/conf/index-writers.xml.template
@@ -161,4 +161,21 @@
       <remove />
     </mapping>
   </writer>
+  <writer id="indexer_kafka_1" class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter">
+    <parameters>
+      <param name="host" value=""/>
+      <param name="port" value="9092"/>
+      <param name="topic" value=""/>
+      <param name="key.serializer" value="org.apache.kafka.common.serialization.ByteArraySerializer"/>
+      <param name="value.serializer" value="org.apache.kafka.connect.json.JsonSerializer"/>
+      <param name="max.doc.count" value="100"/>
+    </parameters>
+    <mapping>
+      <copy>
+        <field source="title" dest="search"/>
+      </copy>
+      <rename />
+      <remove />
+    </mapping>
+  </writer>
 </writers>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 97e1801..fd201c7 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -38,7 +38,7 @@
 
 <property>
   <name>file.content.limit</name>
-  <value>65536</value>
+  <value>1048576</value>
   <description>The length limit for downloaded content using the file://
   protocol, in bytes. If this value is nonnegative (>=0), content longer
   than it will be truncated; otherwise, no truncation at all. Do not
@@ -190,6 +190,14 @@
 </property>
 
 <property>
+  <name>http.agent.host.cookie.file</name>
+  <value>cookies.txt</value>
+  <description>
+    File containing per-host configured cookies.
+  </description>
+</property>
+
+<property>
   <name>http.agent.host</name>
   <value></value>
   <description>Name or IP address of the host on which the Nutch crawler
@@ -205,19 +213,10 @@
 </property>
 
 <property>
-  <name>http.max.delays</name>
-  <value>100</value>
-  <description>The number of times a thread will delay when trying to
-  fetch a page.  Each time it finds that a host is busy, it will wait
-  fetcher.server.delay.  After http.max.delays attempts, it will give
-  up on the page for now.</description>
-</property>
-
-<property>
   <name>http.content.limit</name>
-  <value>65536</value>
-  <description>The length limit for downloaded content using the http://
-  protocol, in bytes. If this value is nonnegative (>=0), content longer
+  <value>1048576</value>
+  <description>The length limit for downloaded content using the http/https
+  protocols, in bytes. If this value is nonnegative (>=0), content longer
   than it will be truncated; otherwise, no truncation at all. Do not
   confuse this setting with the file.content.limit setting.
   </description>
@@ -332,12 +331,6 @@
 </property>
 
 <property>
-  <name>http.verbose</name>
-  <value>false</value>
-  <description>If true, HTTP will log more verbosely.</description>
-</property>
-
-<property>
   <name>http.redirect.max</name>
   <value>0</value>
   <description>The maximum number of redirects the fetcher will follow when
@@ -440,7 +433,7 @@
 
 <property>
   <name>ftp.content.limit</name>
-  <value>65536</value> 
+  <value>1048576</value>
   <description>The length limit for downloaded content, in bytes.
   If this value is nonnegative (>=0), content longer than it will be truncated;
   otherwise, no truncation at all.
@@ -1004,6 +997,17 @@
 </property>
 
 <property>
+  <name>http.log.exceptions.suppress.stack</name>
+  <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value>
+  <description>Comma-separated list of exceptions not shown with full
+  stack trace in logs of fetcher and HTTP protocol implementations.
+  The logs may shrink in size significantly, e.g., when for a large
+  unrestricted web crawl unknown hosts are logged briefly without full
+  stack trace.  The full class name of the exception class (extending
+  Throwable) including the package path must be specified.</description>
+</property>
+
+<property>
   <name>fetcher.parse</name>
   <value>false</value>
   <description>If true, fetcher will parse content. Default is false, which means
@@ -1310,6 +1314,20 @@
   </description>
 </property>
 
+<property>
+  <name>indexer.indexwriters.file</name>
+  <value>index-writers.xml</value>
+  <description>The configuration file for index writers.</description>
+</property>
+
+<!-- Exchanges properties -->
+
+<property>
+  <name>exchanges.exchanges.file</name>
+  <value>exchanges.xml</value>
+  <description>The configuration file used by the Exchange component.</description>
+</property>
+
 <!-- URL normalizer properties -->
 
 <property>
@@ -1600,6 +1618,15 @@
   </description>
 </property>
 
+<property>
+  <name>tika.extractor.boilerpipe.mime.types</name>
+  <value>text/html,application/xhtml+xml</value>
+  <description>
+    Comma-separated list of MIME types accepted for Boilerpipe extraction,
+    documents of other MIME types are not passed to the Boilerpipe extractor.
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>
@@ -1638,6 +1665,13 @@
 </property>
 
 <property>
+  <name>urlfilter.fast.file</name>
+  <value>fast-urlfilter.txt</value>
+  <description>Name of file on CLASSPATH containing regular expressions
+  used by urlfilter-fast (FastURLFilter) plugin.</description>
+</property>
+
+<property>
   <name>urlfilter.order</name>
   <value></value>
   <description>The order by which url filters are applied.
@@ -2525,10 +2559,11 @@
   <description>
     A String value representing the flavour of Selenium 
     WebDriver() to use. Currently the following options
-    exist - 'firefox', 'chrome', 'safari', 'opera', 'phantomjs' and 'remote'.
+    exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
     If 'remote' is used it is essential to also set correct properties for
     'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host',
-    'selenium.hub.protocol', 'selenium.grid.driver' and 'selenium.grid.binary'.
+    'selenium.hub.protocol', 'selenium.grid.driver', 'selenium.grid.binary'
+    and 'selenium.enable.headless'.
   </description>
 </property>
 
@@ -2560,8 +2595,9 @@
   <name>selenium.grid.driver</name>
   <value>firefox</value>
   <description>A String value representing the flavour of Selenium 
-    WebDriver() used on the selenium grid. Currently the following options
-    exist - 'firefox', 'phantomjs' </description>
+    WebDriver() used on the selenium grid. Requires `selenium.driver` to be set to `remote`.
+    Currently the following options
+    exist - 'firefox', 'chrome', 'random' </description>
 </property>
 
 <property>
@@ -2572,6 +2608,14 @@
  </description>
 </property>
 
+<!-- headless options for Firefox and Chrome-->
+<property>
+  <name>selenium.enable.headless</name>
+  <value>false</value>
+  <description>A Boolean value representing the headless option
+    for Firefox and Chrome drivers
+  </description>
+</property>
 <!-- selenium firefox configuration; 
      applies to protocol-selenium and protocol-interactiveselenium plugins -->
 <property>
@@ -2622,6 +2666,14 @@
   Currently this option exist for - 'firefox' </description>
 </property>
 
+<!-- selenium chrome configurations -->
+<property>
+  <name>webdriver.chrome.driver</name>
+  <value>/root/chromedriver</value>
+  <description>The path to the ChromeDriver binary</description>
+</property>
+<!-- end of selenium chrome configurations -->
+
 <!-- protocol-interactiveselenium configuration -->
 <property>
   <name>interactiveselenium.handlers</name>
diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template
index 30af37d..571a606 100644
--- a/conf/tika-config.xml.template
+++ b/conf/tika-config.xml.template
@@ -16,5 +16,5 @@
   limitations under the License.
 -->
 <properties>
-    <service-loader initializableProblemHandler="ignore"/>
+    <service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
 </properties>
diff --git a/default.properties b/default.properties
index bb987d9..899f33d 100644
--- a/default.properties
+++ b/default.properties
@@ -104,6 +104,7 @@
    org.apache.nutch.urlfilter.automaton*:\
    org.apache.nutch.urlfilter.domain*:\
    org.apache.nutch.urlfilter.domainblacklist*:\
+   org.apache.nutch.urlfilter.fast*:\
    org.apache.nutch.urlfilter.ignoreexempt*:\
    org.apache.nutch.urlfilter.prefix*:\
    org.apache.nutch.urlfilter.regex*:\
@@ -198,6 +199,7 @@
    org.apache.nutch.indexwriter.elastic*:\
    org.apache.nutch.indexwriter.elasticrest*:\
    org.apache.nutch.indexwriter.rabbit*:\
+   org.apache.nutch.indexwriter.kafka*:\
    org.apache.nutch.indexwriter.solr*
 
 #
diff --git a/ivy/ivy.xml b/ivy/ivy.xml
index 52826bb..2ffeac4 100644
--- a/ivy/ivy.xml
+++ b/ivy/ivy.xml
@@ -52,7 +52,7 @@
 		<dependency org="com.tdunning" name="t-digest" rev="3.2" />
 
 		<!-- Hadoop Dependencies -->
-		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.7.4" conf="*->default">
+		<dependency org="org.apache.hadoop" name="hadoop-common" rev="2.9.2" conf="*->default">
 			<exclude org="hsqldb" name="hsqldb" />
 			<exclude org="net.sf.kosmosfs" name="kfs" />
 			<exclude org="net.java.dev.jets3t" name="jets3t" />
@@ -60,32 +60,35 @@
 			<exclude org="org.mortbay.jetty" name="jsp-*" />
 			<exclude org="ant" name="ant" />
 		</dependency>
-		<dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.7.4" conf="*->default"/>
-		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.7.4" conf="*->default"/>
-		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.7.4" conf="*->default"/>
+		<dependency org="org.apache.hadoop" name="hadoop-hdfs" rev="2.9.2" conf="*->default"/>
+		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-core" rev="2.9.2" conf="*->default"/>
+		<dependency org="org.apache.hadoop" name="hadoop-mapreduce-client-jobclient" rev="2.9.2" conf="*->default"/>
 		<!-- End of Hadoop Dependencies -->
 
-		<dependency org="org.apache.tika" name="tika-core" rev="1.20" />
-		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
+		<dependency org="org.apache.tika" name="tika-core" rev="1.22" />
 
-		<dependency org="xerces" name="xercesImpl" rev="2.11.0" />
-		<dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" />
+		<dependency org="xml-apis" name="xml-apis" rev="1.4.01"/><!-- force this version as it is required by Tika -->
+		<dependency org="xerces" name="xercesImpl" rev="2.12.0" />
+
+		<dependency org="com.ibm.icu" name="icu4j" rev="61.1" />
 
 		<dependency org="com.google.guava" name="guava" rev="25.0-jre" />
 
-		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="0.10" />
+		<dependency org="com.github.crawler-commons" name="crawler-commons" rev="1.0" />
 
-		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0" />
+		<dependency org="com.martinkl.warc" name="warc-hadoop" rev="0.1.0">
+			<exclude module="hadoop-client" />
+		</dependency>
 
-		<!--dependency org="org.apache.cxf" name="cxf" rev="3.0.4" conf="*->default"/-->
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.2.7" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.2.7" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.2.7" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.2.7" conf="*->default"/>
-		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.2.7" conf="test->default"/>
-		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.7" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.7" conf="*->default"/>
-		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.7" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxws" rev="3.3.3" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" rev="3.3.3" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http" rev="3.3.3" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-transports-http-jetty" rev="3.3.3" conf="*->default"/>
+		<dependency org="org.apache.cxf" name="cxf-rt-rs-client" rev="3.3.3" conf="test->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-databind" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.core" name="jackson-annotations" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.dataformat" name="jackson-dataformat-cbor" rev="2.9.9" conf="*->default"/>
+		<dependency org="com.fasterxml.jackson.jaxrs" name="jackson-jaxrs-json-provider" rev="2.9.9" conf="*->default"/>
 
 		<!-- WARC artifacts needed -->
 		<dependency org="org.netpreserve.commons" name="webarchive-commons" rev="1.1.5" conf="*->default">
@@ -139,11 +142,9 @@
 			<exclude org="org.json"/>
 		</dependency>
 
-
 		<!-- RabbitMQ dependencies -->
 		<dependency org="com.rabbitmq" name="amqp-client" rev="5.2.0" conf="*->default" />
 
-
 		<!--Added Because of Elasticsearch JEST client-->
 		<!--TODO refactor these to indexer-elastic-rest plugin somehow, currently doesn't resolve correctly-->
 		<dependency org="org.apache.httpcomponents" name="httpcore-nio" rev="4.4.9"/>
diff --git a/ivy/ivysettings.xml b/ivy/ivysettings.xml
index a2dc700..18038a5 100644
--- a/ivy/ivysettings.xml
+++ b/ivy/ivysettings.xml
@@ -16,20 +16,11 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 -->
-  <!-- you can override this property to use mirrors
-          http://repo1.maven.org/maven2/
-          http://mirrors.dotsrc.org/maven2
-          http://ftp.ggi-project.org/pub/packages/maven2
-          http://mirrors.sunsite.dk/maven2
-          http://public.planetmirror.com/pub/maven2
-          http://ibiblio.lsu.edu/main/pub/packages/maven2
-          http://www.ibiblio.net/pub/packages/maven2
-  -->
   <property name="oss.sonatype.org" 
-    value="http://oss.sonatype.org/content/repositories/releases/" 
+    value="https://oss.sonatype.org/content/repositories/releases/" 
     override="false"/>
   <property name="repo.maven.org"
-    value="http://repo1.maven.org/maven2/"
+    value="https://repo1.maven.org/maven2/"
     override="false"/>
   <property name="repository.apache.org"
     value="https://repository.apache.org/content/repositories/snapshots/"
@@ -38,14 +29,6 @@
     value="[organisation]/[module]/[revision]/[module]-[revision](-[classifier])"/>
   <property name="maven2.pattern.ext"
     value="${maven2.pattern}.[ext]"/>
-  <!-- define packaging.type=jar to work around the failing dependency download of
-         javax.ws.rs-api.jar
-       required by Tika (1.19 and higher), cf.
-         https://github.com/eclipse-ee4j/jaxrs-api/issues/572
-         https://github.com/jax-rs/api/pull/576
-  -->
-  <property name="packaging.type"
-    value="jar"/>
   <!-- pull in the local repository -->
   <include url="${ivy.default.conf.dir}/ivyconf-local.xml"/>
   <settings defaultResolver="default"/>
diff --git a/ivy/mvn.template b/ivy/mvn.template
index 3809fd0..6d22c84 100644
--- a/ivy/mvn.template
+++ b/ivy/mvn.template
@@ -36,7 +36,7 @@
 
   <scm>
     <developerConnection>scm:git:https://github.com/apache/nutch.git</developerConnection>
-    <connection>scm:git:http://github.com/apache/nutch.git</connection>
+    <connection>scm:git:https://github.com/apache/nutch.git</connection>
     <url>https://github.com/apache/nutch.git</url>
   </scm>
 
diff --git a/src/bin/crawl b/src/bin/crawl
index ff5e456..81d30cc 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -338,7 +338,7 @@
 
   # fetching the segment
   echo "Fetching : $SEGMENT"
-  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $NUM_THREADS
+  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS
 
   # parsing the segment
   echo "Parsing : $SEGMENT"
diff --git a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
index a320989..25570c6 100644
--- a/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
index 73655a0..006c900 100644
--- a/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/AdaptiveFetchSchedule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index b57fc0b..66a6fff 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 333a7b6..8cd5e3e 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index 56bc482..5da9951 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index 97730a3..7c6ef93 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index 623932a..a7d2f11 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.DataOutputStream;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
index 7c9de1d..feba08a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/crawl/DeduplicationJob.java b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
index 8887b4f..9b01411 100644
--- a/src/java/org/apache/nutch/crawl/DeduplicationJob.java
+++ b/src/java/org/apache/nutch/crawl/DeduplicationJob.java
@@ -24,26 +24,24 @@
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.BytesWritable;
 import org.apache.hadoop.io.Text;
-
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.CounterGroup;
 import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.mapreduce.CounterGroup;
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.CrawlDb;
 import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -76,14 +74,13 @@
       
     private String groupMode;
 
+    @Override
     public void setup(Mapper<Text, CrawlDatum, BytesWritable, CrawlDatum>.Context context) {
       Configuration arg0 = context.getConfiguration();
       groupMode = arg0.get(DEDUPLICATION_GROUP_MODE);
     }
 
-    public void close() throws IOException {
-    }
-
+    @Override
     public void map(Text key, CrawlDatum value,
         Context context)
         throws IOException, InterruptedException {
@@ -118,7 +115,7 @@
         }
         // add the URL as a temporary MD
         value.getMetaData().put(urlKey, key);
-        // reduce on the signature optionall grouped on host or domain or not at all
+        // reduce on the signature optionally grouped on host or domain or not at all
         context.write(sig, value);
       }
     }
@@ -129,9 +126,10 @@
 
     private String[] compareOrder;
     
+    @Override
     public void setup(Reducer<BytesWritable, CrawlDatum, Text, CrawlDatum>.Context context) {
-      Configuration arg0 = context.getConfiguration();
-      compareOrder = arg0.get(DEDUPLICATION_COMPARE_ORDER).split(",");
+      Configuration conf = context.getConfiguration();
+      compareOrder = conf.get(DEDUPLICATION_COMPARE_ORDER).split(",");
     }
 
     private void writeOutAsDuplicate(CrawlDatum datum,
@@ -144,79 +142,90 @@
       context.write(key, datum);
     }
 
+    @Override
     public void reduce(BytesWritable key, Iterable<CrawlDatum> values,
-        Context context)
-        throws IOException, InterruptedException {
+        Context context) throws IOException, InterruptedException {
       CrawlDatum existingDoc = null;
 
-      outerloop:
       for (CrawlDatum newDoc : values) {
         if (existingDoc == null) {
           existingDoc = new CrawlDatum();
           existingDoc.set(newDoc);
           continue;
         }
-
-        for (int i = 0; i < compareOrder.length; i++) {
-          switch (compareOrder[i]) {
-            case "score":
-              // compare based on score
-              if (existingDoc.getScore() < newDoc.getScore()) {
-                writeOutAsDuplicate(existingDoc, context);
-                existingDoc = new CrawlDatum();
-                existingDoc.set(newDoc);
-                continue outerloop;
-              } else if (existingDoc.getScore() > newDoc.getScore()) {
-                // mark new one as duplicate
-                writeOutAsDuplicate(newDoc, context);
-                continue outerloop;
-              }
-              break;
-            case "fetchTime":
-              // same score? delete the one which is oldest
-              if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
-                // mark new one as duplicate
-                writeOutAsDuplicate(newDoc, context);
-                continue outerloop;
-              } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
-                // mark existing one as duplicate
-                writeOutAsDuplicate(existingDoc, context);
-                existingDoc = new CrawlDatum();
-                existingDoc.set(newDoc);
-                continue outerloop;
-              }
-              break;
-            case "urlLength":
-              // same time? keep the one which has the shortest URL
-              String urlExisting;
-              String urlnewDoc;
-              try {
-                urlExisting = URLDecoder.decode(existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
-                urlnewDoc = URLDecoder.decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
-              } catch (UnsupportedEncodingException e) {
-                LOG.error("Error decoding: " + urlKey);
-                throw new IOException("UnsupportedEncodingException for " + urlKey);
-              }
-              if (urlExisting.length() < urlnewDoc.length()) {
-                // mark new one as duplicate
-                writeOutAsDuplicate(newDoc, context);
-                continue outerloop;
-              } else if (urlExisting.length() > urlnewDoc.length()) {
-                // mark existing one as duplicate
-                writeOutAsDuplicate(existingDoc, context);
-                existingDoc = new CrawlDatum();
-                existingDoc.set(newDoc);
-                continue outerloop;
-              }
-              break;
+        CrawlDatum duplicate = getDuplicate(existingDoc, newDoc);
+        if (duplicate != null) {
+          writeOutAsDuplicate(duplicate, context);
+          if (duplicate == existingDoc) {
+            // keep new
+            existingDoc.set(newDoc);
           }
         }
-
       }
     }
 
-    public void close() throws IOException {
-
+    private CrawlDatum getDuplicate(CrawlDatum existingDoc, CrawlDatum newDoc)
+        throws IOException {
+      for (int i = 0; i < compareOrder.length; i++) {
+        switch (compareOrder[i]) {
+        case "score":
+          // compare based on score
+          if (existingDoc.getScore() < newDoc.getScore()) {
+            return existingDoc;
+          } else if (existingDoc.getScore() > newDoc.getScore()) {
+            // mark new one as duplicate
+            return newDoc;
+          }
+          break;
+        case "fetchTime":
+          // same score? delete the one which is oldest
+          if (existingDoc.getFetchTime() > newDoc.getFetchTime()) {
+            // mark new one as duplicate
+            return newDoc;
+          } else if (existingDoc.getFetchTime() < newDoc.getFetchTime()) {
+            // mark existing one as duplicate
+            return existingDoc;
+          }
+          break;
+        case "httpsOverHttp":
+          // prefer https:// over http:// if URLs are identical except for the
+          // protocol
+          String url1 = existingDoc.getMetaData().get(urlKey).toString();
+          String url2 = newDoc.getMetaData().get(urlKey).toString();
+          if (url1.startsWith("https://") && url2.startsWith("http://")
+              && url1.substring(8).equals(url2.substring(7))) {
+            // existingDoc with https://, mark newDoc as duplicate
+            return newDoc;
+          } else if (url2.startsWith("https://") && url1.startsWith("http://")
+              && url2.substring(8).equals(url1.substring(7))) {
+            // newDoc with https://, mark existingDoc as duplicate
+            return existingDoc;
+          }
+          break;
+        case "urlLength":
+          // same time? keep the one which has the shortest URL
+          String urlExisting;
+          String urlnewDoc;
+          try {
+            urlExisting = URLDecoder.decode(
+                existingDoc.getMetaData().get(urlKey).toString(), "UTF8");
+            urlnewDoc = URLDecoder
+                .decode(newDoc.getMetaData().get(urlKey).toString(), "UTF8");
+          } catch (UnsupportedEncodingException e) {
+            LOG.error("Error decoding: " + urlKey);
+            throw new IOException("UnsupportedEncodingException for " + urlKey);
+          }
+          if (urlExisting.length() < urlnewDoc.length()) {
+            // mark new one as duplicate
+            return newDoc;
+          } else if (urlExisting.length() > urlnewDoc.length()) {
+            // mark existing one as duplicate
+            return existingDoc;
+          }
+          break;
+        }
+      }
+      return null; // no decision possible
     }
   }
 
@@ -224,15 +233,14 @@
   public static class StatusUpdateReducer extends
       Reducer<Text, CrawlDatum, Text, CrawlDatum> {
 
+    @Override
     public void setup(Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context) {
     }
 
-    public void close() {
-    }
-
     private CrawlDatum old = new CrawlDatum();
     private CrawlDatum duplicate = new CrawlDatum();
 
+    @Override
     public void reduce(Text key, Iterable<CrawlDatum> values,
         Context context)
         throws IOException, InterruptedException {
@@ -260,7 +268,7 @@
 
   public int run(String[] args) throws IOException {
     if (args.length < 1) {
-      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<urlLength>]");
+      System.err.println("Usage: DeduplicationJob <crawldb> [-group <none|host|domain>] [-compareOrder <score>,<fetchTime>,<httpsOverHttp>,<urlLength>]");
       return 1;
     }
 
diff --git a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
index d979e84..b654353 100644
--- a/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/DefaultFetchSchedule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.io.Text;
diff --git a/src/java/org/apache/nutch/crawl/FetchSchedule.java b/src/java/org/apache/nutch/crawl/FetchSchedule.java
index 1d4961f..384c2d6 100644
--- a/src/java/org/apache/nutch/crawl/FetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/FetchSchedule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configurable;
diff --git a/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java b/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
index e437800..e07d771 100644
--- a/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
+++ b/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.slf4j.Logger;
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 429eca6..bc6a3aa 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.DataInput;
@@ -56,7 +55,7 @@
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.MapFile;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
@@ -73,6 +72,7 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.apache.nutch.util.NutchTool;
+import org.apache.nutch.util.SegmentReaderUtil;
 import org.apache.nutch.util.TimingUtil;
 import org.apache.nutch.util.URLUtil;
 
@@ -293,7 +293,7 @@
       private boolean byDomain = false;
       private URLNormalizers normalizers;
       private static boolean normalise;
-      private MapFile.Reader[] hostdbReaders = null;
+      private SequenceFile.Reader[] hostdbReaders = null;
       private Expression maxCountExpr = null;
       private Expression fetchDelayExpr = null;
 
@@ -301,7 +301,7 @@
         if (conf.get(GENERATOR_HOSTDB) != null) {
           try {
             Path path = new Path(conf.get(GENERATOR_HOSTDB), "current");
-            hostdbReaders = MapFileOutputFormat.getReaders(path, conf);
+            hostdbReaders = SegmentReaderUtil.getReaders(path, conf);
           } catch (IOException e) {
             LOG.error("Error reading HostDB because {}", e.getMessage());
           }
diff --git a/src/java/org/apache/nutch/crawl/Injector.java b/src/java/org/apache/nutch/crawl/Injector.java
index 1f30989..7d4ee84 100644
--- a/src/java/org/apache/nutch/crawl/Injector.java
+++ b/src/java/org/apache/nutch/crawl/Injector.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/crawl/Inlink.java b/src/java/org/apache/nutch/crawl/Inlink.java
index 631f8bf..6010d07 100644
--- a/src/java/org/apache/nutch/crawl/Inlink.java
+++ b/src/java/org/apache/nutch/crawl/Inlink.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/crawl/Inlinks.java b/src/java/org/apache/nutch/crawl/Inlinks.java
index 42dd9db..40ac6e2 100644
--- a/src/java/org/apache/nutch/crawl/Inlinks.java
+++ b/src/java/org/apache/nutch/crawl/Inlinks.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/crawl/LinkDb.java b/src/java/org/apache/nutch/crawl/LinkDb.java
index 2d11602..b32e64f 100644
--- a/src/java/org/apache/nutch/crawl/LinkDb.java
+++ b/src/java/org/apache/nutch/crawl/LinkDb.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/crawl/LinkDbFilter.java b/src/java/org/apache/nutch/crawl/LinkDbFilter.java
index 757cfde..33895f9 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
index 45e9e94..d5942be 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index 9ee57ba..2d2a901 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/crawl/MD5Signature.java b/src/java/org/apache/nutch/crawl/MD5Signature.java
index 6aae872..3d163f3 100644
--- a/src/java/org/apache/nutch/crawl/MD5Signature.java
+++ b/src/java/org/apache/nutch/crawl/MD5Signature.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.io.MD5Hash;
diff --git a/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java b/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
index 92a4ab9..1f03013 100644
--- a/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/MimeAdaptiveFetchSchedule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.BufferedReader;
diff --git a/src/java/org/apache/nutch/crawl/Signature.java b/src/java/org/apache/nutch/crawl/Signature.java
index 21dfe07..b444326 100644
--- a/src/java/org/apache/nutch/crawl/Signature.java
+++ b/src/java/org/apache/nutch/crawl/Signature.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.nutch.parse.Parse;
diff --git a/src/java/org/apache/nutch/crawl/SignatureComparator.java b/src/java/org/apache/nutch/crawl/SignatureComparator.java
index d217d93..36af1ca 100644
--- a/src/java/org/apache/nutch/crawl/SignatureComparator.java
+++ b/src/java/org/apache/nutch/crawl/SignatureComparator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.util.Comparator;
diff --git a/src/java/org/apache/nutch/crawl/SignatureFactory.java b/src/java/org/apache/nutch/crawl/SignatureFactory.java
index 82e6709..e017cf4 100644
--- a/src/java/org/apache/nutch/crawl/SignatureFactory.java
+++ b/src/java/org/apache/nutch/crawl/SignatureFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.slf4j.Logger;
diff --git a/src/java/org/apache/nutch/crawl/TextMD5Signature.java b/src/java/org/apache/nutch/crawl/TextMD5Signature.java
index b88cfa6..1fe0c6c 100644
--- a/src/java/org/apache/nutch/crawl/TextMD5Signature.java
+++ b/src/java/org/apache/nutch/crawl/TextMD5Signature.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.io.MD5Hash;
diff --git a/src/java/org/apache/nutch/crawl/TextProfileSignature.java b/src/java/org/apache/nutch/crawl/TextProfileSignature.java
index 5a709c8..c831be5 100644
--- a/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+++ b/src/java/org/apache/nutch/crawl/TextProfileSignature.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.BufferedReader;
diff --git a/src/java/org/apache/nutch/crawl/URLPartitioner.java b/src/java/org/apache/nutch/crawl/URLPartitioner.java
index 3d44376..80b4fab 100644
--- a/src/java/org/apache/nutch/crawl/URLPartitioner.java
+++ b/src/java/org/apache/nutch/crawl/URLPartitioner.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/exchange/Exchanges.java b/src/java/org/apache/nutch/exchange/Exchanges.java
index 1f443d4..1e0518b 100644
--- a/src/java/org/apache/nutch/exchange/Exchanges.java
+++ b/src/java/org/apache/nutch/exchange/Exchanges.java
@@ -96,8 +96,10 @@
    * @return An array with each exchange's configuration.
    */
   private ExchangeConfig[] loadConfigurations(Configuration conf) {
+    String filename = conf.get("exchanges.exchanges.file",
+        "exchanges.xml");
     InputSource inputSource = new InputSource(
-        conf.getConfResourceAsInputStream("exchanges.xml"));
+        conf.getConfResourceAsInputStream(filename));
 
     final List<ExchangeConfig> configList = new LinkedList<>();
 
@@ -120,7 +122,7 @@
       }
 
     } catch (SAXException | IOException | ParserConfigurationException e) {
-      LOG.warn(e.toString());
+      LOG.error(e.toString());
     }
 
     return configList.toArray(new ExchangeConfig[0]);
diff --git a/src/java/org/apache/nutch/fetcher/FetchNode.java b/src/java/org/apache/nutch/fetcher/FetchNode.java
index 48ab594..e7da731 100644
--- a/src/java/org/apache/nutch/fetcher/FetchNode.java
+++ b/src/java/org/apache/nutch/fetcher/FetchNode.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/fetcher/FetchNodeDb.java b/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
index 5fdde70..695dd90 100644
--- a/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
+++ b/src/java/org/apache/nutch/fetcher/FetchNodeDb.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index fe9e71e..792ee49 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -19,7 +19,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
-import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -427,10 +426,9 @@
 
     checkConfiguration();
 
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
     if (LOG.isInfoEnabled()) {
-      LOG.info("Fetcher: starting at {}", sdf.format(start));
+      LOG.info("Fetcher: starting at {}", TimingUtil.logDateMillis(start));
       LOG.info("Fetcher: segment: {}", segment);
     }
 
@@ -440,7 +438,8 @@
     long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
     if (timelimit != -1) {
       timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
-      LOG.info("Fetcher Timelimit set for : {}", timelimit);
+      LOG.info("Fetcher Timelimit set for : {}  ({})", timelimit,
+          TimingUtil.logDateMillis(timelimit));
       getConf().setLong("fetcher.timelimit", timelimit);
     }
 
@@ -507,8 +506,8 @@
     }
 
     long end = System.currentTimeMillis();
-    LOG.info("Fetcher: finished at {}, elapsed: {}", sdf.format(end),
-        TimingUtil.elapsedTime(start, end));
+    LOG.info("Fetcher: finished at {}, elapsed: {}",
+        TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end));
   }
 
   /** Run the fetcher. */
diff --git a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
index 9feb745..6d27327 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.fetcher;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index 395a141..e52b9ea 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -43,6 +43,7 @@
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
@@ -145,6 +146,8 @@
   private FetcherThreadPublisher publisher;
   private boolean activatePublisher;
 
+  private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
   public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQueues fetchQueues, 
       QueueFeeder feeder, AtomicInteger spinWaiting, AtomicLong lastRequestStart, FetcherRun.Context context,
       AtomicInteger errors, String segmentName, boolean parsing, boolean storingContent, 
@@ -174,6 +177,8 @@
     this.pages = pages;
     this.bytes = bytes;
 
+    this.logUtil.setConf(conf);
+
     // NUTCH-2413 Apply filters and normalizers on outlinks
     // when parsing only if configured
     if (parsing) {
@@ -457,7 +462,15 @@
         } catch (Throwable t) { // unexpected exception
           // unblock
           ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
-          logError(fit.url, StringUtils.stringifyException(t));
+          String message;
+          if (LOG.isDebugEnabled()) {
+            message = StringUtils.stringifyException(t);
+          } else if (logUtil.logShort(t)) {
+            message = t.getClass().getName();
+          } else {
+            message = StringUtils.stringifyException(t);
+          }
+          logError(fit.url, message);
           output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
               CrawlDatum.STATUS_FETCH_RETRY);
         }
diff --git a/src/java/org/apache/nutch/indexer/IndexWriterParams.java b/src/java/org/apache/nutch/indexer/IndexWriterParams.java
index 952dc9e..e7b3152 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriterParams.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriterParams.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import org.apache.hadoop.util.StringUtils;
diff --git a/src/java/org/apache/nutch/indexer/IndexWriters.java b/src/java/org/apache/nutch/indexer/IndexWriters.java
index 9fac2e2..5778997 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriters.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriters.java
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.indexer;
 
-import de.vandermeer.asciitable.AT_ColumnWidthCalculator;
 import de.vandermeer.asciitable.AT_Row;
 import de.vandermeer.asciitable.AsciiTable;
 import de.vandermeer.skb.interfaces.document.TableRowType;
@@ -115,8 +114,10 @@
    * @param conf Nutch configuration instance.
    */
   private IndexWriterConfig[] loadWritersConfiguration(Configuration conf) {
+    String filename = conf.get("indexer.indexwriters.file",
+        "index-writers.xml");
     InputStream ssInputStream = conf
-        .getConfResourceAsInputStream("index-writers.xml");
+        .getConfResourceAsInputStream(filename);
     InputSource inputSource = new InputSource(ssInputStream);
 
     try {
@@ -136,7 +137,7 @@
 
       return indexWriterConfigs;
     } catch (SAXException | IOException | ParserConfigurationException e) {
-      LOG.warn(e.toString());
+      LOG.error(e.toString());
       return new IndexWriterConfig[0];
     }
   }
@@ -218,6 +219,10 @@
 
   public void write(NutchDocument doc) throws IOException {
     for (String indexWriterId : getIndexWriters(doc)) {
+      if (!this.indexWriters.containsKey(indexWriterId)) {
+        LOG.warn("Index writer {} is not present. Maybe the plugin is not in plugin.includes or there is a misspelling.", indexWriterId);
+        continue;
+      }
       NutchDocument mappedDocument = mapDocument(doc,
           this.indexWriters.get(indexWriterId).getIndexWriterConfig()
               .getMapping());
@@ -228,6 +233,10 @@
 
   public void update(NutchDocument doc) throws IOException {
     for (String indexWriterId : getIndexWriters(doc)) {
+      if (!this.indexWriters.containsKey(indexWriterId)) {
+        LOG.warn("Index writer {} is not present. Maybe the plugin is not in plugin.includes or there is a misspelling.", indexWriterId);
+        continue;
+      }
       NutchDocument mappedDocument = mapDocument(doc,
           this.indexWriters.get(indexWriterId).getIndexWriterConfig()
               .getMapping());
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 6fa2032..fedfeb7 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -22,6 +22,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.StringUtils;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -394,7 +395,10 @@
         String binary;
         if (base64) {
           // optionally encode as base64
-          binary = Base64.encodeBase64String(content.getContent());
+          // Note: we need a form which works with many versions of commons-codec (1.4, 1.11 and upwards),
+          // cf. NUTCH-2706.  The following returns a chunked string for commons-codec 1.4:
+          //   binary = Base64.encodeBase64String(content.getContent());
+          binary = StringUtils.newStringUtf8(Base64.encodeBase64(content.getContent(), false, false));
         } else {
           binary = new String(content.getContent());
         }
diff --git a/src/java/org/apache/nutch/indexer/IndexingException.java b/src/java/org/apache/nutch/indexer/IndexingException.java
index adfefeb..28882bb 100644
--- a/src/java/org/apache/nutch/indexer/IndexingException.java
+++ b/src/java/org/apache/nutch/indexer/IndexingException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 @SuppressWarnings("serial")
diff --git a/src/java/org/apache/nutch/indexer/IndexingFilter.java b/src/java/org/apache/nutch/indexer/IndexingFilter.java
index b34b9b7..2494167 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFilter.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import org.apache.hadoop.conf.Configurable;
diff --git a/src/java/org/apache/nutch/indexer/IndexingFilters.java b/src/java/org/apache/nutch/indexer/IndexingFilters.java
index 5ebdd7f..8985297 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFilters.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import org.apache.nutch.plugin.PluginRepository;
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 08c85c3..a43ccb1 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import java.lang.invoke.MethodHandles;
@@ -59,6 +58,7 @@
   protected URLNormalizers normalizers = null;
   protected boolean dumpText = false;
   protected boolean followRedirects = false;
+  protected boolean doIndex = false;
   // used to simulate the metadata propagated from injection
   protected HashMap<String, String> metadata = new HashMap<>();
 
@@ -68,7 +68,24 @@
   public int run(String[] args) throws Exception {
     String url = null;
 
-    usage = "Usage: IndexingFiltersChecker [-normalize] [-followRedirects] [-dumpText] [-md key=value] (-stdin | -listen <port> [-keepClientCnxOpen])";
+    String usage = "Usage:\n" //
+        + "  IndexingFiltersChecker [OPTIONS] <url>\n" //
+        + "    Fetch single URL and index it\n" //
+        + "  IndexingFiltersChecker [OPTIONS] -stdin\n" //
+        + "    Read URLs to be indexed from stdin\n" //
+        + "  IndexingFiltersChecker [OPTIONS] -listen <port> [-keepClientCnxOpen]\n" //
+        + "    Listen on <port> for URLs to be indexed\n" //
+        + "Options:\n" //
+        + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
+        + "                  \t(a generic Hadoop option to be passed\n" //
+        + "                  \t before other command-specific options)\n"
+        + "  -normalize      \tnormalize URLs\n" //
+        + "  -followRedirects\tfollow redirects when fetching URL\n" //
+        + "  -dumpText       \tshow the entire plain-text content,\n" //
+        + "                  \tnot only the first 100 characters\n" //
+        + "  -doIndex        \tpass document to configured index writers\n" //
+        + "                  \tand let them index it\n" //
+        + "  -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
 
     // Print help when no args given
     if (args.length < 1) {
@@ -76,6 +93,9 @@
       System.exit(-1);
     }
 
+    // read property "doIndex" for backward compatibility
+    doIndex = getConf().getBoolean("doIndex", false);
+
     int numConsumed;
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-normalize")) {
@@ -84,6 +104,8 @@
         followRedirects = true;
       } else if (args[i].equals("-dumpText")) {
         dumpText = true;
+      } else if (args[i].equals("-doIndex")) {
+        doIndex = true;
       } else if (args[i].equals("-md")) {
         String k = null, v = null;
         String nextOne = args[++i];
@@ -268,7 +290,7 @@
     
     output.append("\n"); // For readability if keepClientCnxOpen
 
-    if (getConf().getBoolean("doIndex", false)) {
+    if (doIndex) {
       IndexWriters writers = IndexWriters.get(getConf());
       writers.open(getConf(), "IndexingFilterChecker");
       writers.write(doc);
diff --git a/src/java/org/apache/nutch/indexer/NutchField.java b/src/java/org/apache/nutch/indexer/NutchField.java
index de76e23..68f6d70 100644
--- a/src/java/org/apache/nutch/indexer/NutchField.java
+++ b/src/java/org/apache/nutch/indexer/NutchField.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/metadata/CreativeCommons.java b/src/java/org/apache/nutch/metadata/CreativeCommons.java
index 37a36a9..45f1422 100644
--- a/src/java/org/apache/nutch/metadata/CreativeCommons.java
+++ b/src/java/org/apache/nutch/metadata/CreativeCommons.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/metadata/DublinCore.java b/src/java/org/apache/nutch/metadata/DublinCore.java
index 9724d80..61c7d6c 100644
--- a/src/java/org/apache/nutch/metadata/DublinCore.java
+++ b/src/java/org/apache/nutch/metadata/DublinCore.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/metadata/Feed.java b/src/java/org/apache/nutch/metadata/Feed.java
index 2697da6..3493ae9 100644
--- a/src/java/org/apache/nutch/metadata/Feed.java
+++ b/src/java/org/apache/nutch/metadata/Feed.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.metadata;
 
 /**
diff --git a/src/java/org/apache/nutch/metadata/HttpHeaders.java b/src/java/org/apache/nutch/metadata/HttpHeaders.java
index b7700e5..33eb07e 100644
--- a/src/java/org/apache/nutch/metadata/HttpHeaders.java
+++ b/src/java/org/apache/nutch/metadata/HttpHeaders.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/metadata/MetaWrapper.java b/src/java/org/apache/nutch/metadata/MetaWrapper.java
index a43fa9d..0fe72c9 100644
--- a/src/java/org/apache/nutch/metadata/MetaWrapper.java
+++ b/src/java/org/apache/nutch/metadata/MetaWrapper.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.metadata;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
index 02dfa72..d28808d 100644
--- a/src/java/org/apache/nutch/metadata/Nutch.java
+++ b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilter.java b/src/java/org/apache/nutch/net/URLExemptionFilter.java
index 03b3f61..4956b0a 100644
--- a/src/java/org/apache/nutch/net/URLExemptionFilter.java
+++ b/src/java/org/apache/nutch/net/URLExemptionFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import org.apache.hadoop.conf.Configurable;
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java b/src/java/org/apache/nutch/net/URLExemptionFilters.java
index 3a95a48..c61f43f 100644
--- a/src/java/org/apache/nutch/net/URLExemptionFilters.java
+++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/net/URLFilter.java b/src/java/org/apache/nutch/net/URLFilter.java
index 7fabc5f..afbd1e0 100644
--- a/src/java/org/apache/nutch/net/URLFilter.java
+++ b/src/java/org/apache/nutch/net/URLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import org.apache.hadoop.conf.Configurable;
diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java
index 52e557f..c1d1093 100644
--- a/src/java/org/apache/nutch/net/URLFilterChecker.java
+++ b/src/java/org/apache/nutch/net/URLFilterChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import org.apache.hadoop.util.ToolRunner;
diff --git a/src/java/org/apache/nutch/net/URLFilterException.java b/src/java/org/apache/nutch/net/URLFilterException.java
index b367b56..c860718 100644
--- a/src/java/org/apache/nutch/net/URLFilterException.java
+++ b/src/java/org/apache/nutch/net/URLFilterException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 @SuppressWarnings("serial")
diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java
index 4f5bf36..f8f8186 100644
--- a/src/java/org/apache/nutch/net/URLFilters.java
+++ b/src/java/org/apache/nutch/net/URLFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/net/URLNormalizer.java b/src/java/org/apache/nutch/net/URLNormalizer.java
index 78ccb27..e2905c0 100644
--- a/src/java/org/apache/nutch/net/URLNormalizer.java
+++ b/src/java/org/apache/nutch/net/URLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import java.net.MalformedURLException;
diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
index bd3ca5e..ee25f2f 100644
--- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java
+++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,9 +14,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
+import java.net.MalformedURLException;
+
 import org.apache.hadoop.util.ToolRunner;
 
 import org.apache.nutch.util.AbstractChecker;
@@ -36,7 +37,8 @@
         + "\n             \t(if not given all configured URL normalizers are applied)"
         + "\n  -scope     \tone of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink"
         + "\n  -stdin     \ttool reads a list of URLs from stdin, one URL per line"
-        + "\n  -listen <port>\trun tool as Telnet server listening on <port>\n";
+        + "\n  -listen <port>\trun tool as Telnet server listening on <port>"
+        + "\n\nAn empty line is added to the output if a URL fails to normalize (MalformedURLException or null returned).\n";
 
     // Print help when no args given
     if (args.length < 1) {
@@ -72,7 +74,16 @@
   }
 
   protected int process(String line, StringBuilder output) throws Exception {
-    output.append(normalizers.normalize(line, scope));
+    try {
+      String norm = normalizers.normalize(line, scope);
+      if (norm == null) {
+        output.append("");
+      } else {
+        output.append(norm);
+      }
+    } catch (MalformedURLException e) {
+      output.append("");
+    }
     return 0;
   }
 
diff --git a/src/java/org/apache/nutch/net/URLNormalizers.java b/src/java/org/apache/nutch/net/URLNormalizers.java
index 3b49aac..4ec904d 100644
--- a/src/java/org/apache/nutch/net/URLNormalizers.java
+++ b/src/java/org/apache/nutch/net/URLNormalizers.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
index 5f4115b..fbd45a2 100644
--- a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
+++ b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.protocols;
 
 import java.util.Calendar;
diff --git a/src/java/org/apache/nutch/net/protocols/ProtocolException.java b/src/java/org/apache/nutch/net/protocols/ProtocolException.java
index 0ae3776..97d1f7f 100644
--- a/src/java/org/apache/nutch/net/protocols/ProtocolException.java
+++ b/src/java/org/apache/nutch/net/protocols/ProtocolException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.protocols;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
new file mode 100644
index 0000000..28d8894
--- /dev/null
+++ b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.lang.invoke.MethodHandles;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class ProtocolLogUtil implements Configurable {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String HTTP_LOG_SUPPRESSION = "http.log.exceptions.suppress.stack";
+
+  private Configuration config;
+
+  /**
+   * Set of exceptions logged as a short message without the full Java stack
+   * trace, see property <code>http.log.exceptions.suppress.stack</code>.
+   */
+  private Set<Class<? extends Throwable>> exceptionsLogShort = new HashSet<>();
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+    for (String exceptClassName : conf.getTrimmedStrings(HTTP_LOG_SUPPRESSION,
+        "java.net.UnknownHostException", "java.net.NoRouteToHostException")) {
+      Class<?> clazz = conf.getClassByNameOrNull(exceptClassName);
+      if (clazz == null) {
+        LOG.warn("Class {} configured for log stack suppression not found.",
+            exceptClassName);
+        continue;
+      }
+      if (!Throwable.class.isAssignableFrom(clazz)) {
+        LOG.warn(
+            "Class {} configured for log stack suppression does not extend Throwable.",
+            exceptClassName);
+        continue;
+      }
+      exceptionsLogShort.add(clazz.asSubclass(Throwable.class));
+    }
+  }
+
+  /**
+   * Return true if exception is configured to be logged as short message
+   * without stack trace, usually done for frequent exceptions with obvious
+   * reasons (e.g., UnknownHostException), configurable by
+   * <code>http.log.exceptions.suppress.stack</code>
+   */
+  public boolean logShort(Throwable t) {
+    if (exceptionsLogShort.contains(t.getClass())) {
+      return true;
+    }
+    return false;
+  }
+
+}
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 779650c..16dd698 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/net/protocols/package-info.java b/src/java/org/apache/nutch/net/protocols/package-info.java
index 8823f5b..199e1e4 100644
--- a/src/java/org/apache/nutch/net/protocols/package-info.java
+++ b/src/java/org/apache/nutch/net/protocols/package-info.java
@@ -17,7 +17,7 @@
 
 /**
  * Helper classes related to the {@link org.apache.nutch.protocol.Protocol Protocol}
- * interface, sea also {@link org.apache.nutch.protocol}.
+ * interface, see also {@link org.apache.nutch.protocol}.
  */
 package org.apache.nutch.net.protocols;
 
diff --git a/src/java/org/apache/nutch/parse/HTMLMetaTags.java b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
index c36c036..7c301e1 100644
--- a/src/java/org/apache/nutch/parse/HTMLMetaTags.java
+++ b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.net.URL;
diff --git a/src/java/org/apache/nutch/parse/HtmlParseFilter.java b/src/java/org/apache/nutch/parse/HtmlParseFilter.java
index 2238949..cc64c8e 100644
--- a/src/java/org/apache/nutch/parse/HtmlParseFilter.java
+++ b/src/java/org/apache/nutch/parse/HtmlParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.w3c.dom.DocumentFragment;
diff --git a/src/java/org/apache/nutch/parse/HtmlParseFilters.java b/src/java/org/apache/nutch/parse/HtmlParseFilters.java
index 9dd9aad..95e23fd 100644
--- a/src/java/org/apache/nutch/parse/HtmlParseFilters.java
+++ b/src/java/org/apache/nutch/parse/HtmlParseFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.nutch.protocol.Content;
diff --git a/src/java/org/apache/nutch/parse/Outlink.java b/src/java/org/apache/nutch/parse/Outlink.java
index 3ee0354..71e53ab 100644
--- a/src/java/org/apache/nutch/parse/Outlink.java
+++ b/src/java/org/apache/nutch/parse/Outlink.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/parse/OutlinkExtractor.java b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
index 3e5ecc9..a9b2bb1 100644
--- a/src/java/org/apache/nutch/parse/OutlinkExtractor.java
+++ b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/parse/Parse.java b/src/java/org/apache/nutch/parse/Parse.java
index 9a33445..118178f 100644
--- a/src/java/org/apache/nutch/parse/Parse.java
+++ b/src/java/org/apache/nutch/parse/Parse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 /**
diff --git a/src/java/org/apache/nutch/parse/ParseCallable.java b/src/java/org/apache/nutch/parse/ParseCallable.java
index 87b0b3b..3668c6e 100644
--- a/src/java/org/apache/nutch/parse/ParseCallable.java
+++ b/src/java/org/apache/nutch/parse/ParseCallable.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.util.concurrent.Callable;
diff --git a/src/java/org/apache/nutch/parse/ParseData.java b/src/java/org/apache/nutch/parse/ParseData.java
index d1debf7..e88c7ac 100644
--- a/src/java/org/apache/nutch/parse/ParseData.java
+++ b/src/java/org/apache/nutch/parse/ParseData.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/parse/ParseException.java b/src/java/org/apache/nutch/parse/ParseException.java
index 3f27e33..4505abf 100644
--- a/src/java/org/apache/nutch/parse/ParseException.java
+++ b/src/java/org/apache/nutch/parse/ParseException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 @SuppressWarnings("serial")
diff --git a/src/java/org/apache/nutch/parse/ParseImpl.java b/src/java/org/apache/nutch/parse/ParseImpl.java
index 77dbe7b..9eb3272 100644
--- a/src/java/org/apache/nutch/parse/ParseImpl.java
+++ b/src/java/org/apache/nutch/parse/ParseImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index 82e3c9a..4bc0853 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.text.NumberFormat;
diff --git a/src/java/org/apache/nutch/parse/ParsePluginList.java b/src/java/org/apache/nutch/parse/ParsePluginList.java
index b4355a4..510c92b 100644
--- a/src/java/org/apache/nutch/parse/ParsePluginList.java
+++ b/src/java/org/apache/nutch/parse/ParsePluginList.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/parse/ParsePluginsReader.java b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
index 0a57363..4420111 100644
--- a/src/java/org/apache/nutch/parse/ParsePluginsReader.java
+++ b/src/java/org/apache/nutch/parse/ParsePluginsReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/parse/ParseResult.java b/src/java/org/apache/nutch/parse/ParseResult.java
index 9987e80..ef42692 100644
--- a/src/java/org/apache/nutch/parse/ParseResult.java
+++ b/src/java/org/apache/nutch/parse/ParseResult.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 1d64463..9a92ced 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.slf4j.Logger;
diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java
index b9d5959..f5fb487 100644
--- a/src/java/org/apache/nutch/parse/ParseStatus.java
+++ b/src/java/org/apache/nutch/parse/ParseStatus.java
@@ -14,11 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/*
- * Created on Apr 28, 2005
- * Author: Andrzej Bialecki &lt;ab@getopt.org&gt;
- *
- */
 package org.apache.nutch.parse;
 
 import java.io.DataInput;
diff --git a/src/java/org/apache/nutch/parse/ParseText.java b/src/java/org/apache/nutch/parse/ParseText.java
index 024911c..b93bc47 100644
--- a/src/java/org/apache/nutch/parse/ParseText.java
+++ b/src/java/org/apache/nutch/parse/ParseText.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java
index 169be0f..bc6d752 100644
--- a/src/java/org/apache/nutch/parse/ParseUtil.java
+++ b/src/java/org/apache/nutch/parse/ParseUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/parse/Parser.java b/src/java/org/apache/nutch/parse/Parser.java
index c86a958..fbff009 100644
--- a/src/java/org/apache/nutch/parse/Parser.java
+++ b/src/java/org/apache/nutch/parse/Parser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.hadoop.conf.Configurable;
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 454068b..e880485 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import java.lang.invoke.MethodHandles;
@@ -91,7 +90,7 @@
         + "Options:\n" //
         + "  -D<property>=<value>\tset/overwrite Nutch/Hadoop properties\n" //
         + "                  \t(a generic Hadoop option to be passed\n" //
-        + "                  \t before other command-specific options)"
+        + "                  \t before other command-specific options)\n"
         + "  -normalize      \tnormalize URLs\n" //
         + "  -followRedirects\tfollow redirects when fetching URL\n" //
         + "  -dumpText       \talso show the plain-text extracted by parsers\n" //
diff --git a/src/java/org/apache/nutch/parse/ParserFactory.java b/src/java/org/apache/nutch/parse/ParserFactory.java
index 6c7eac9..d02fed4 100644
--- a/src/java/org/apache/nutch/parse/ParserFactory.java
+++ b/src/java/org/apache/nutch/parse/ParserFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/parse/ParserNotFound.java b/src/java/org/apache/nutch/parse/ParserNotFound.java
index 2857efa..f60ba7e 100644
--- a/src/java/org/apache/nutch/parse/ParserNotFound.java
+++ b/src/java/org/apache/nutch/parse/ParserNotFound.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/plugin/PluginRuntimeException.java b/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
index acccda2..29d659a 100644
--- a/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
+++ b/src/java/org/apache/nutch/plugin/PluginRuntimeException.java
@@ -1,5 +1,4 @@
 /*
-/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/protocol/Content.java b/src/java/org/apache/nutch/protocol/Content.java
index 2b49f7d..c513159 100644
--- a/src/java/org/apache/nutch/protocol/Content.java
+++ b/src/java/org/apache/nutch/protocol/Content.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.io.ByteArrayInputStream;
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index 9835744..2287487 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.util.List;
diff --git a/src/java/org/apache/nutch/protocol/ProtocolException.java b/src/java/org/apache/nutch/protocol/ProtocolException.java
index fc4add5..952cfed 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolException.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 @SuppressWarnings("serial")
diff --git a/src/java/org/apache/nutch/protocol/ProtocolFactory.java b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
index 7f900b2..a545a4c 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolFactory.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.io.BufferedReader;
@@ -216,4 +215,20 @@
     return false;
   }
 
+  /**
+   * Get a {@link Protocol} instance of the specified extension ID.
+   *
+   * @param id
+   *          protocol plugin ID, e.g., {@code org.apache.nutch.protocol.http}
+   * @return protocol instance for the given ID
+   * @throws PluginRuntimeException
+   *           if plugin not found or failed to instantiate
+   */
+  public Protocol getProtocolById(String id) throws PluginRuntimeException {
+    Extension extension = getExtensionById(id);
+    if (extension != null) { // a null lookup result is reported as "not found"
+      return getProtocolInstanceByExtension(extension);
+    }
+    throw new PluginRuntimeException("ID " + id + " not found");
+  }
 }
diff --git a/src/java/org/apache/nutch/protocol/ProtocolNotFound.java b/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
index 8cadc23..ef10f91 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolNotFound.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 @SuppressWarnings("serial")
diff --git a/src/java/org/apache/nutch/protocol/ProtocolOutput.java b/src/java/org/apache/nutch/protocol/ProtocolOutput.java
index f743b3f..810cbd9 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolOutput.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolOutput.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.text.ParseException;
diff --git a/src/java/org/apache/nutch/protocol/ProtocolStatus.java b/src/java/org/apache/nutch/protocol/ProtocolStatus.java
index 46f9730..d9e7e3d 100644
--- a/src/java/org/apache/nutch/protocol/ProtocolStatus.java
+++ b/src/java/org/apache/nutch/protocol/ProtocolStatus.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.io.DataInput;
@@ -51,7 +50,7 @@
   /** Temporary failure. Application may retry immediately. */
   public static final int RETRY = 15;
   /**
-   * Unspecified exception occured. Further information may be provided in args.
+   * Unspecified exception occurred. Further information may be provided in args.
    */
   public static final int EXCEPTION = 16;
   /** Access denied - authorization required, but missing/incorrect. */
@@ -69,8 +68,10 @@
    * expected number of milliseconds to wait before retry may be provided in
    * args.
    */
+  @Deprecated
   public static final int WOULDBLOCK = 22;
   /** Thread was blocked http.max.delays times during fetching. */
+  @Deprecated
   public static final int BLOCKED = 23;
 
   // Useful static instances for status codes that don't usually require any
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 1cddeea..0671a8f 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/publisher/NutchPublishers.java b/src/java/org/apache/nutch/publisher/NutchPublishers.java
index 83b5234..bb60897 100644
--- a/src/java/org/apache/nutch/publisher/NutchPublishers.java
+++ b/src/java/org/apache/nutch/publisher/NutchPublishers.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.publisher;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
index cd59274..94e1732 100644
--- a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring;
 
 import java.util.Collection;
diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java
index 2941980..bc74fcb 100644
--- a/src/java/org/apache/nutch/scoring/ScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/scoring/ScoringFilters.java b/src/java/org/apache/nutch/scoring/ScoringFilters.java
index f9d2f1b..6c38355 100644
--- a/src/java/org/apache/nutch/scoring/ScoringFilters.java
+++ b/src/java/org/apache/nutch/scoring/ScoringFilters.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring;
 
 import java.util.Collection;
diff --git a/src/java/org/apache/nutch/segment/SegmentChecker.java b/src/java/org/apache/nutch/segment/SegmentChecker.java
index 31204ef..991fe72 100644
--- a/src/java/org/apache/nutch/segment/SegmentChecker.java
+++ b/src/java/org/apache/nutch/segment/SegmentChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/segment/SegmentPart.java b/src/java/org/apache/nutch/segment/SegmentPart.java
index d5ca370..9433066 100644
--- a/src/java/org/apache/nutch/segment/SegmentPart.java
+++ b/src/java/org/apache/nutch/segment/SegmentPart.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index a64439c..bcf99b8 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/ConfManager.java b/src/java/org/apache/nutch/service/ConfManager.java
index c71cfa9..fb4ec87 100644
--- a/src/java/org/apache/nutch/service/ConfManager.java
+++ b/src/java/org/apache/nutch/service/ConfManager.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service;
 
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/JobManager.java b/src/java/org/apache/nutch/service/JobManager.java
index 20346fc..ad734cd 100644
--- a/src/java/org/apache/nutch/service/JobManager.java
+++ b/src/java/org/apache/nutch/service/JobManager.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service;
 
 import java.util.Collection;
diff --git a/src/java/org/apache/nutch/service/NutchReader.java b/src/java/org/apache/nutch/service/NutchReader.java
index d988b69..98d7141 100644
--- a/src/java/org/apache/nutch/service/NutchReader.java
+++ b/src/java/org/apache/nutch/service/NutchReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/NutchServer.java b/src/java/org/apache/nutch/service/NutchServer.java
index be5653b..9468670 100644
--- a/src/java/org/apache/nutch/service/NutchServer.java
+++ b/src/java/org/apache/nutch/service/NutchServer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/service/SeedManager.java b/src/java/org/apache/nutch/service/SeedManager.java
index a96c4ac..11ddedb 100644
--- a/src/java/org/apache/nutch/service/SeedManager.java
+++ b/src/java/org/apache/nutch/service/SeedManager.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service;
 
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
index d75df2e..34c07d3 100644
--- a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/JobFactory.java b/src/java/org/apache/nutch/service/impl/JobFactory.java
index a9bf8af..60bbb25 100644
--- a/src/java/org/apache/nutch/service/impl/JobFactory.java
+++ b/src/java/org/apache/nutch/service/impl/JobFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.impl;
 
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
index a915457..aae40b4 100644
--- a/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/JobManagerImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/JobWorker.java b/src/java/org/apache/nutch/service/impl/JobWorker.java
index 6f3b09a..8ee9344 100644
--- a/src/java/org/apache/nutch/service/impl/JobWorker.java
+++ b/src/java/org/apache/nutch/service/impl/JobWorker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/LinkReader.java b/src/java/org/apache/nutch/service/impl/LinkReader.java
index 64aa7e6..f3e54a3 100644
--- a/src/java/org/apache/nutch/service/impl/LinkReader.java
+++ b/src/java/org/apache/nutch/service/impl/LinkReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/NodeReader.java b/src/java/org/apache/nutch/service/impl/NodeReader.java
index 13422ff..612fa26 100644
--- a/src/java/org/apache/nutch/service/impl/NodeReader.java
+++ b/src/java/org/apache/nutch/service/impl/NodeReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
index 4118527..f533cd1 100644
--- a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
+++ b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
index c7b7607..46d1bba 100644
--- a/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/SeedManagerImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.impl;
 
 import java.util.HashMap;
diff --git a/src/java/org/apache/nutch/service/impl/SequenceReader.java b/src/java/org/apache/nutch/service/impl/SequenceReader.java
index 1e7e865..26b3d55 100644
--- a/src/java/org/apache/nutch/service/impl/SequenceReader.java
+++ b/src/java/org/apache/nutch/service/impl/SequenceReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/impl/ServiceWorker.java b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
index f63fd41..f86acad 100644
--- a/src/java/org/apache/nutch/service/impl/ServiceWorker.java
+++ b/src/java/org/apache/nutch/service/impl/ServiceWorker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.impl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/java/org/apache/nutch/service/model/request/DbQuery.java b/src/java/org/apache/nutch/service/model/request/DbQuery.java
index bf40240..4b707df 100644
--- a/src/java/org/apache/nutch/service/model/request/DbQuery.java
+++ b/src/java/org/apache/nutch/service/model/request/DbQuery.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/model/request/JobConfig.java b/src/java/org/apache/nutch/service/model/request/JobConfig.java
index 1088ab7..76a43e0 100644
--- a/src/java/org/apache/nutch/service/model/request/JobConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/JobConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.model.request;
 
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/model/request/NutchConfig.java b/src/java/org/apache/nutch/service/model/request/NutchConfig.java
index ffa9e3e..7049463 100644
--- a/src/java/org/apache/nutch/service/model/request/NutchConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/NutchConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/model/request/ReaderConfig.java b/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
index 81d7440..3e44f87 100644
--- a/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/ReaderConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/model/request/SeedList.java b/src/java/org/apache/nutch/service/model/request/SeedList.java
index 5ba60da..5bd3c4f 100644
--- a/src/java/org/apache/nutch/service/model/request/SeedList.java
+++ b/src/java/org/apache/nutch/service/model/request/SeedList.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.service.model.request;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/service/model/request/SeedUrl.java b/src/java/org/apache/nutch/service/model/request/SeedUrl.java
index b1c93a8..f05e4d0 100644
--- a/src/java/org/apache/nutch/service/model/request/SeedUrl.java
+++ b/src/java/org/apache/nutch/service/model/request/SeedUrl.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.service.model.request;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/service/model/request/ServiceConfig.java b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
index ab88491..85d6a3e 100644
--- a/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/ServiceConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.model.request;
 
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java b/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
index bac0924..21887ad 100644
--- a/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/FetchNodeDbInfo.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/model/response/JobInfo.java b/src/java/org/apache/nutch/service/model/response/JobInfo.java
index c2e185d..e952126 100644
--- a/src/java/org/apache/nutch/service/model/response/JobInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/JobInfo.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/model/response/ServiceInfo.java b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
index 655e3f8..456f8c5 100644
--- a/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/ServiceInfo.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.model.response;
 
 import java.util.List;
diff --git a/src/java/org/apache/nutch/service/resources/AbstractResource.java b/src/java/org/apache/nutch/service/resources/AbstractResource.java
index ebe4138..b277a75 100644
--- a/src/java/org/apache/nutch/service/resources/AbstractResource.java
+++ b/src/java/org/apache/nutch/service/resources/AbstractResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/resources/AdminResource.java b/src/java/org/apache/nutch/service/resources/AdminResource.java
index 1b01b67..8e1b4af 100644
--- a/src/java/org/apache/nutch/service/resources/AdminResource.java
+++ b/src/java/org/apache/nutch/service/resources/AdminResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/resources/ConfigResource.java b/src/java/org/apache/nutch/service/resources/ConfigResource.java
index e625c20..c6372ee 100644
--- a/src/java/org/apache/nutch/service/resources/ConfigResource.java
+++ b/src/java/org/apache/nutch/service/resources/ConfigResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,10 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.resources;
 
-
 import java.util.Map;
 import java.util.Set;
 
diff --git a/src/java/org/apache/nutch/service/resources/DbResource.java b/src/java/org/apache/nutch/service/resources/DbResource.java
index 67771d4..dc7049a 100644
--- a/src/java/org/apache/nutch/service/resources/DbResource.java
+++ b/src/java/org/apache/nutch/service/resources/DbResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -16,7 +16,6 @@
  */
 package org.apache.nutch.service.resources;
 
-
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
diff --git a/src/java/org/apache/nutch/service/resources/JobResource.java b/src/java/org/apache/nutch/service/resources/JobResource.java
index b142d73..3111b85 100644
--- a/src/java/org/apache/nutch/service/resources/JobResource.java
+++ b/src/java/org/apache/nutch/service/resources/JobResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/resources/ReaderResouce.java b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
index 030999e..b1f9775 100644
--- a/src/java/org/apache/nutch/service/resources/ReaderResouce.java
+++ b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java
index 8489e3e..875968d 100644
--- a/src/java/org/apache/nutch/service/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/service/resources/SeedResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/service/resources/ServicesResource.java b/src/java/org/apache/nutch/service/resources/ServicesResource.java
index e4224f1..c129652 100644
--- a/src/java/org/apache/nutch/service/resources/ServicesResource.java
+++ b/src/java/org/apache/nutch/service/resources/ServicesResource.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service.resources;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
index f693a97..1dde478 100644
--- a/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/AbstractCommonCrawlFormat.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java
index c8b4a94..df57e9c 100644
--- a/src/java/org/apache/nutch/tools/Benchmark.java
+++ b/src/java/org/apache/nutch/tools/Benchmark.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.OutputStream;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlConfig.java b/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
index d8c06c0..49d9c31 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index c013059..0fe6606 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -1,20 +1,19 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.BufferedOutputStream;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
index 3eb19b4..aa2f351 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -1,20 +1,19 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import org.apache.nutch.metadata.Metadata;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
index 72b24f5..e468532 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
index 0d6cae2..9ecf4bc 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatJackson.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.ByteArrayOutputStream;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
index 93ac118..169606c 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatJettinson.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
index 9e934d3..4310749 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatSimple.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
index 27f1198..f401041 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.ByteArrayInputStream;
@@ -193,6 +192,7 @@
     ByteArrayOutputStream output = new ByteArrayOutputStream();
 
     String httpHeaders = metadata.get("_response.headers_");
+    httpHeaders = WARCUtils.fixHttpHeaders(httpHeaders, content.getContent().length);
 
     if (StringUtils.isNotBlank(httpHeaders)) {
       output.write(httpHeaders.getBytes());
diff --git a/src/java/org/apache/nutch/tools/DmozParser.java b/src/java/org/apache/nutch/tools/DmozParser.java
index fa7e7d6..63dbde8 100644
--- a/src/java/org/apache/nutch/tools/DmozParser.java
+++ b/src/java/org/apache/nutch/tools/DmozParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.BufferedInputStream;
diff --git a/src/java/org/apache/nutch/tools/FileDumper.java b/src/java/org/apache/nutch/tools/FileDumper.java
index d09ad74..316b977 100644
--- a/src/java/org/apache/nutch/tools/FileDumper.java
+++ b/src/java/org/apache/nutch/tools/FileDumper.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.DataOutputStream;
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index ab5109e..4bec975 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java b/src/java/org/apache/nutch/tools/WARCUtils.java
index dab3ba7..1af6533 100644
--- a/src/java/org/apache/nutch/tools/WARCUtils.java
+++ b/src/java/org/apache/nutch/tools/WARCUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 import java.io.ByteArrayInputStream;
@@ -25,6 +24,7 @@
 import java.net.UnknownHostException;
 import java.util.Date;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.NutchDocument;
@@ -47,6 +47,11 @@
   public final static String CONFORMS_TO = "conformsTo";
   public final static String IP = "ip";
   public final static UUIDGenerator generator = new UUIDGenerator();
+  public static final String CRLF = "\r\n";
+  public static final String COLONSP = ": ";
+  protected static final Pattern PROBLEMATIC_HEADERS = Pattern
+      .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)");
+  protected static final String X_HIDE_HEADER = "X-Crawler-";
 
   public static final ANVLRecord getWARCInfoContent(Configuration conf) {
     ANVLRecord record = new ANVLRecord();
@@ -168,4 +173,110 @@
 
     return record;
   }
+  
+  /**
+   * Modify verbatim HTTP response headers: fix, remove or replace headers
+   * <code>Content-Length</code>, <code>Content-Encoding</code> and
+   * <code>Transfer-Encoding</code> which may confuse WARC readers. Ensure that
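+   * the returned header ends with a single empty line (<code>\r\n\r\n</code>).
+   *
+   * @param headers HTTP 1.1 or 1.0 response header string, CR-LF-separated
+   *          lines, first line is the status line
+   * @param contentLength effective (uncompressed and unchunked) content length
+   * @return safe HTTP response header, or <code>null</code> if headers is null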
+   */
+  public static final String fixHttpHeaders(String headers, int contentLength) {
+    if (headers==null) {
+      return null;
+    }
+    int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0;
+    StringBuilder replace = new StringBuilder();
+    while (start < headers.length()) {
+      lineEnd = headers.indexOf(CRLF, start);
+      trailingCrLf = 1;
+      if (lineEnd == -1) {
+        lineEnd = headers.length();
+        trailingCrLf = 0;
+      }
+      int colonPos = -1;
+      for (int i = start; i < lineEnd; i++) {
+        if (headers.charAt(i) == ':') {
+          colonPos = i;
+          break;
+        }
+      }
+      if (colonPos == -1) {
+        boolean valid = true;
+        if (start == 0) {
+          // status line (without colon)
+          // TODO: http/2
+        } else if ((lineEnd + 4) == headers.length()
+            && headers.endsWith(CRLF + CRLF)) {
+          // ok, trailing empty line
+          trailingCrLf = 2;
+        } else {
+          valid = false;
+        }
+        if (!valid) {
+          if (last < start) {
+            replace.append(headers.substring(last, start));
+          }
+          last = lineEnd + 2 * trailingCrLf;
+        }
+        start = lineEnd + 2 * trailingCrLf;
+        /*
+         * skip over invalid header line, no further check for problematic
+         * headers required
+         */
+        continue;
+      }
+      String name = headers.substring(start, colonPos);
+      if (PROBLEMATIC_HEADERS.matcher(name).matches()) {
+        boolean needsFix = true;
+        if (name.equalsIgnoreCase("content-length")) {
+          String value = headers.substring(colonPos + 1, lineEnd).trim();
+          try {
+            int l = Integer.parseInt(value);
+            if (l == contentLength) {
+              needsFix = false;
+            }
+          } catch (NumberFormatException e) {
+            // needs to be fixed
+          }
+        }
+        if (needsFix) {
+          if (last < start) {
+            replace.append(headers.substring(last, start));
+          }
+          last = lineEnd + 2 * trailingCrLf;
+          replace.append(X_HIDE_HEADER)
+              .append(headers.substring(start, lineEnd + 2 * trailingCrLf));
+          if (trailingCrLf == 0) {
+            replace.append(CRLF);
+            trailingCrLf = 1;
+          }
+          if (name.equalsIgnoreCase("content-length")) {
+            // add effective uncompressed and unchunked length of content
+            replace.append("Content-Length").append(COLONSP)
+                .append(contentLength).append(CRLF);
+          }
+        }
+      }
+      start = lineEnd + 2 * trailingCrLf;
+    }
+    if (last > 0 || trailingCrLf != 2) {
+      if (last < headers.length()) {
+        // append trailing headers
+        replace.append(headers.substring(last));
+      }
+      while (trailingCrLf < 2) {
+        replace.append(CRLF);
+        trailingCrLf++;
+      }
+      return replace.toString();
+    }
+    return headers;
+  }
+
+  
 }
diff --git a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
index e4dd2c2..3b1593b 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
index aa021b3..b5f7a44 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index 8c8e08f..7a26748 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index a7e08c7..d307000 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools.warc;
 
 import java.io.ByteArrayInputStream;
@@ -52,6 +51,7 @@
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.tools.WARCUtils;
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -145,6 +145,7 @@
 
         // were the headers stored as is? Can write a response element then
         String headersVerbatim = content.getMetadata().get("_response.headers_");
+        headersVerbatim = WARCUtils.fixHttpHeaders(headersVerbatim, content.getContent().length);
         byte[] httpheaders = new byte[0];
         if (StringUtils.isNotBlank(headersVerbatim)) {
           // check that ends with an empty line
@@ -242,7 +243,7 @@
           WARCRecord record = new WARCRecord(in);
           context.write(NullWritable.get(), new WARCWritable(record));
           context.getCounter("WARCExporter", "records generated").increment(1);
-        } catch (IOException exception) {
+        } catch (IOException | IllegalStateException exception) {
           LOG.error("Exception when generating WARC record for {} : {}", key,
               exception.getMessage());
           context.getCounter("WARCExporter", "exception").increment(1);
diff --git a/src/java/org/apache/nutch/tools/warc/package-info.java b/src/java/org/apache/nutch/tools/warc/package-info.java
index 44e1a94..4f87b4f 100644
--- a/src/java/org/apache/nutch/tools/warc/package-info.java
+++ b/src/java/org/apache/nutch/tools/warc/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements. See the NOTICE file distributed with this
  * work for additional information regarding copyright ownership. The ASF
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index e0af36d..b41bbc9 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.BufferedReader;
diff --git a/src/java/org/apache/nutch/util/CommandRunner.java b/src/java/org/apache/nutch/util/CommandRunner.java
index 6c870c0..ae0a224 100644
--- a/src/java/org/apache/nutch/util/CommandRunner.java
+++ b/src/java/org/apache/nutch/util/CommandRunner.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index 4208b5c..c138e61 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/util/DeflateUtils.java b/src/java/org/apache/nutch/util/DeflateUtils.java
index 558762c..11bb29f 100644
--- a/src/java/org/apache/nutch/util/DeflateUtils.java
+++ b/src/java/org/apache/nutch/util/DeflateUtils.java
@@ -1,9 +1,10 @@
-/**
- * Copyright 2005 The Apache Software Foundation
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
@@ -13,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.ByteArrayOutputStream;
diff --git a/src/java/org/apache/nutch/util/DumpFileUtil.java b/src/java/org/apache/nutch/util/DumpFileUtil.java
index be9ec87..a9ad195 100644
--- a/src/java/org/apache/nutch/util/DumpFileUtil.java
+++ b/src/java/org/apache/nutch/util/DumpFileUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.apache.commons.codec.digest.DigestUtils;
diff --git a/src/java/org/apache/nutch/util/GZIPUtils.java b/src/java/org/apache/nutch/util/GZIPUtils.java
index 392eb13..dc40a7f 100644
--- a/src/java/org/apache/nutch/util/GZIPUtils.java
+++ b/src/java/org/apache/nutch/util/GZIPUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.ByteArrayOutputStream;
diff --git a/src/java/org/apache/nutch/util/JexlUtil.java b/src/java/org/apache/nutch/util/JexlUtil.java
index b480033..42c8728 100644
--- a/src/java/org/apache/nutch/util/JexlUtil.java
+++ b/src/java/org/apache/nutch/util/JexlUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/util/LockUtil.java b/src/java/org/apache/nutch/util/LockUtil.java
index 5cfd2e3..4093e7a 100644
--- a/src/java/org/apache/nutch/util/LockUtil.java
+++ b/src/java/org/apache/nutch/util/LockUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/util/MimeUtil.java b/src/java/org/apache/nutch/util/MimeUtil.java
index 2fd13b7..17bb380 100644
--- a/src/java/org/apache/nutch/util/MimeUtil.java
+++ b/src/java/org/apache/nutch/util/MimeUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/util/NutchConfiguration.java b/src/java/org/apache/nutch/util/NutchConfiguration.java
index ac71a93..4089aec 100644
--- a/src/java/org/apache/nutch/util/NutchConfiguration.java
+++ b/src/java/org/apache/nutch/util/NutchConfiguration.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.util.Map.Entry;
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 1ba4a21..991e506 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java
index fc0d3e4..f7b0b76 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/util/ObjectCache.java b/src/java/org/apache/nutch/util/ObjectCache.java
index f1b14c8..e313a6e 100644
--- a/src/java/org/apache/nutch/util/ObjectCache.java
+++ b/src/java/org/apache/nutch/util/ObjectCache.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
index 6ca48c8..3be0fd7 100644
--- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,11 +14,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
 
 /**
  * A class for efficiently matching <code>String</code>s against a set of
@@ -103,8 +105,9 @@
   }
 
   public static final void main(String[] argv) {
-    PrefixStringMatcher matcher = new PrefixStringMatcher(new String[] {
-        "abcd", "abc", "aac", "baz", "foo", "foobar" });
+    String[] prefixes = new String[] { "abcd", "abc", "aac", "baz", "foo",
+        "foobar" };
+    PrefixStringMatcher matcher = new PrefixStringMatcher(prefixes);
 
     String[] tests = { "a", "ab", "abc", "abcdefg", "apple", "aa", "aac",
         "aaccca", "abaz", "baz", "bazooka", "fo", "foobar", "kite", };
@@ -115,5 +118,23 @@
       System.out.println("  shortest: " + matcher.shortestMatch(tests[i]));
       System.out.println("   longest: " + matcher.longestMatch(tests[i]));
     }
+
+    int iterations = 1000;
+    System.out.println("Testing thread-safety (NUTCH-2585) with " + iterations
+        + " iterations:");
+    List<String> testsList = Arrays.asList(tests);
+    for (int i = 0; i < iterations; i++) {
+      matcher = new PrefixStringMatcher(prefixes);
+      Collections.shuffle(testsList);
+      try {
+        long count = testsList.parallelStream().filter(matcher::matches).count();
+        System.out.print(String.format("Cycle %4d : %d matches\r", i, count));
+      } catch (Exception e) {
+        // flush output
+        System.out.println("");
+        throw e;
+      }
+    }
+    System.out.println("");
   }
 }
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index 2082fe5..3b6cc48 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/util/SegmentReaderUtil.java b/src/java/org/apache/nutch/util/SegmentReaderUtil.java
index 3732211..fa9a7c8 100644
--- a/src/java/org/apache/nutch/util/SegmentReaderUtil.java
+++ b/src/java/org/apache/nutch/util/SegmentReaderUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,8 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
-
 package org.apache.nutch.util;
 
 import java.util.Arrays;
diff --git a/src/java/org/apache/nutch/util/SitemapProcessor.java b/src/java/org/apache/nutch/util/SitemapProcessor.java
index 0762ae4..cbfbe0c 100644
--- a/src/java/org/apache/nutch/util/SitemapProcessor.java
+++ b/src/java/org/apache/nutch/util/SitemapProcessor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/util/StringUtil.java b/src/java/org/apache/nutch/util/StringUtil.java
index 149269f..b63364d 100644
--- a/src/java/org/apache/nutch/util/StringUtil.java
+++ b/src/java/org/apache/nutch/util/StringUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 /**
diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
index 6e070b9..46df52a 100644
--- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.util.Collection;
diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java
index e6ccbbc..1414d15 100644
--- a/src/java/org/apache/nutch/util/TableUtil.java
+++ b/src/java/org/apache/nutch/util/TableUtil.java
@@ -1,4 +1,4 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -13,7 +13,7 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.util;
 
 import org.apache.commons.lang.StringUtils;
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 77b09d9..3f3e74e 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,13 +14,28 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
 import java.util.concurrent.TimeUnit;
 
 public class TimingUtil {
 
+  /** Shared formatter for log timestamps (pattern yyyy-MM-dd HH:mm:ss); final so it cannot be reassigned */
+  public static final DateTimeFormatter logDateFormat = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");
+
+  /**
+   * Format epoch milliseconds ({@link System#currentTimeMillis()}) as the date
+   * string used for logging, in the system default time zone.
+   */
+  public static String logDateMillis(long millis) {
+    return logDateFormat.format(
+        LocalDateTime.ofInstant(Instant.ofEpochMilli(millis), ZoneId.systemDefault()));
+  }
+
   /**
    * Calculate the elapsed time between two times specified in milliseconds.
    * 
diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java
index 95086fe..d974ecb 100644
--- a/src/java/org/apache/nutch/util/TrieStringMatcher.java
+++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.util.Arrays;
@@ -24,6 +23,8 @@
 /**
  * TrieStringMatcher is a base class for simple tree-based string matching.
  * 
+ * This class is thread-safe during string matching but not when adding strings
+ * to the trie.
  */
 public abstract class TrieStringMatcher {
   protected TrieNode root;
@@ -104,9 +105,7 @@
      */
     TrieNode getChild(char nextChar) {
       if (children == null) {
-        children = childrenList.toArray(new TrieNode[childrenList.size()]);
-        childrenList = null;
-        Arrays.sort(children);
+        compile();
       }
 
       int min = 0;
@@ -138,6 +137,18 @@
       // if (this.nodeChar > other.nodeChar)
       return 1;
     }
+
+    /** Prepare node for matching; synchronized since matching threads may race here. */
+    synchronized void compile() {
+      if (childrenList != null) {
+        // Sort a local array and assign it last: getChild() tests "children == null"
+        // outside the lock and must not search a still unsorted array.
+        TrieNode[] sorted = childrenList.toArray(new TrieNode[childrenList.size()]);
+        Arrays.sort(sorted);
+        children = sorted;
+        childrenList = null;
+      }
+    }
   }
 
   /**
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index 525f14b..e500f5a 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.net.IDN;
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index 32ba10f..4354ffc 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util.domain;
 
 import java.io.File;
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffix.java b/src/java/org/apache/nutch/util/domain/DomainSuffix.java
index d40ebe9..05162aa 100644
--- a/src/java/org/apache/nutch/util/domain/DomainSuffix.java
+++ b/src/java/org/apache/nutch/util/domain/DomainSuffix.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util.domain;
 
 /**
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
index faf59ef..9047ecf 100644
--- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
+++ b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util.domain;
 
 import java.io.InputStream;
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java b/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
index ae88736..69e212d 100644
--- a/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
+++ b/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util.domain;
 
 import java.io.IOException;
diff --git a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
index f442d1f..2e9cddb 100644
--- a/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
+++ b/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util.domain;
 
 /**
diff --git a/src/java/org/apache/nutch/webui/NutchUiApplication.java b/src/java/org/apache/nutch/webui/NutchUiApplication.java
index 6fd2396..67ac281 100644
--- a/src/java/org/apache/nutch/webui/NutchUiApplication.java
+++ b/src/java/org/apache/nutch/webui/NutchUiApplication.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/NutchUiServer.java b/src/java/org/apache/nutch/webui/NutchUiServer.java
index d534b8f..4af3915 100644
--- a/src/java/org/apache/nutch/webui/NutchUiServer.java
+++ b/src/java/org/apache/nutch/webui/NutchUiServer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/NutchClient.java b/src/java/org/apache/nutch/webui/client/NutchClient.java
index 3f8887d..bd8072e 100644
--- a/src/java/org/apache/nutch/webui/client/NutchClient.java
+++ b/src/java/org/apache/nutch/webui/client/NutchClient.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/NutchClientFactory.java b/src/java/org/apache/nutch/webui/client/NutchClientFactory.java
index 32da00e..e51ae5a 100644
--- a/src/java/org/apache/nutch/webui/client/NutchClientFactory.java
+++ b/src/java/org/apache/nutch/webui/client/NutchClientFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java b/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
index 6a4f2f0..4ddcfbb 100644
--- a/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
+++ b/src/java/org/apache/nutch/webui/client/impl/CrawlingCycle.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java b/src/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
index c2abde5..86c916e 100644
--- a/src/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
+++ b/src/java/org/apache/nutch/webui/client/impl/CrawlingCycleListener.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java b/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
index 1a577f9..671f735 100644
--- a/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
+++ b/src/java/org/apache/nutch/webui/client/impl/NutchClientImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
index ea19a8a..173223e 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
index d6b1767..ceb59b1 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandBuilder.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
index 9e4d1a8..30fcf4d 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandExecutor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
index cef56a5..00ff13f 100644
--- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
+++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommandsBatchFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/model/ConnectionStatus.java b/src/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
index c5083f2..66dec88 100644
--- a/src/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
+++ b/src/java/org/apache/nutch/webui/client/model/ConnectionStatus.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/model/Crawl.java b/src/java/org/apache/nutch/webui/client/model/Crawl.java
index 6057f7f..bb818c5 100644
--- a/src/java/org/apache/nutch/webui/client/model/Crawl.java
+++ b/src/java/org/apache/nutch/webui/client/model/Crawl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/client/model/JobConfig.java b/src/java/org/apache/nutch/webui/client/model/JobConfig.java
index 80df279..c15444b 100644
--- a/src/java/org/apache/nutch/webui/client/model/JobConfig.java
+++ b/src/java/org/apache/nutch/webui/client/model/JobConfig.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.webui.client.model;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/webui/client/model/JobInfo.java b/src/java/org/apache/nutch/webui/client/model/JobInfo.java
index 312118a..e293f4d 100644
--- a/src/java/org/apache/nutch/webui/client/model/JobInfo.java
+++ b/src/java/org/apache/nutch/webui/client/model/JobInfo.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.webui.client.model;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/webui/client/model/NutchStatus.java b/src/java/org/apache/nutch/webui/client/model/NutchStatus.java
index 0c5c425..3c8c137 100644
--- a/src/java/org/apache/nutch/webui/client/model/NutchStatus.java
+++ b/src/java/org/apache/nutch/webui/client/model/NutchStatus.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.webui.client.model;
 
 import java.io.Serializable;
diff --git a/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java b/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java
index 09c2d6a..26fbece 100644
--- a/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java
+++ b/src/java/org/apache/nutch/webui/config/CustomDaoFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/config/CustomTableCreator.java b/src/java/org/apache/nutch/webui/config/CustomTableCreator.java
index 9b31d73..87a0e64 100644
--- a/src/java/org/apache/nutch/webui/config/CustomTableCreator.java
+++ b/src/java/org/apache/nutch/webui/config/CustomTableCreator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java b/src/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
index 8b76440..82cab4c 100644
--- a/src/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
+++ b/src/java/org/apache/nutch/webui/config/NutchGuiConfiguration.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/config/SpringConfiguration.java b/src/java/org/apache/nutch/webui/config/SpringConfiguration.java
index 1687cee..7783d78 100644
--- a/src/java/org/apache/nutch/webui/config/SpringConfiguration.java
+++ b/src/java/org/apache/nutch/webui/config/SpringConfiguration.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/model/NutchConfig.java b/src/java/org/apache/nutch/webui/model/NutchConfig.java
index 7a2111e..7acf9ad 100644
--- a/src/java/org/apache/nutch/webui/model/NutchConfig.java
+++ b/src/java/org/apache/nutch/webui/model/NutchConfig.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/model/NutchInstance.java b/src/java/org/apache/nutch/webui/model/NutchInstance.java
index 2c1f1c5..3800df2 100644
--- a/src/java/org/apache/nutch/webui/model/NutchInstance.java
+++ b/src/java/org/apache/nutch/webui/model/NutchInstance.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/model/SeedList.java b/src/java/org/apache/nutch/webui/model/SeedList.java
index 72d3d75..a963e94 100644
--- a/src/java/org/apache/nutch/webui/model/SeedList.java
+++ b/src/java/org/apache/nutch/webui/model/SeedList.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/model/SeedUrl.java b/src/java/org/apache/nutch/webui/model/SeedUrl.java
index 5f89241..b45164c 100644
--- a/src/java/org/apache/nutch/webui/model/SeedUrl.java
+++ b/src/java/org/apache/nutch/webui/model/SeedUrl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java b/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java
index 5611d74..0a89c0f 100644
--- a/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java
+++ b/src/java/org/apache/nutch/webui/pages/AbstractBasePage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/DashboardPage.java b/src/java/org/apache/nutch/webui/pages/DashboardPage.java
index 50586b9..ae6d760 100644
--- a/src/java/org/apache/nutch/webui/pages/DashboardPage.java
+++ b/src/java/org/apache/nutch/webui/pages/DashboardPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/LogOutPage.java b/src/java/org/apache/nutch/webui/pages/LogOutPage.java
index 9d0298f..a9bb883 100644
--- a/src/java/org/apache/nutch/webui/pages/LogOutPage.java
+++ b/src/java/org/apache/nutch/webui/pages/LogOutPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/SchedulingPage.java b/src/java/org/apache/nutch/webui/pages/SchedulingPage.java
index 54876a4..39b07a3 100644
--- a/src/java/org/apache/nutch/webui/pages/SchedulingPage.java
+++ b/src/java/org/apache/nutch/webui/pages/SchedulingPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/SearchPage.java b/src/java/org/apache/nutch/webui/pages/SearchPage.java
index 4a5a736..a390c01 100644
--- a/src/java/org/apache/nutch/webui/pages/SearchPage.java
+++ b/src/java/org/apache/nutch/webui/pages/SearchPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/StatisticsPage.java b/src/java/org/apache/nutch/webui/pages/StatisticsPage.java
index 048fb3c..c676b40 100644
--- a/src/java/org/apache/nutch/webui/pages/StatisticsPage.java
+++ b/src/java/org/apache/nutch/webui/pages/StatisticsPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java b/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
index e7c1b28..9ae1c7b 100644
--- a/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
+++ b/src/java/org/apache/nutch/webui/pages/UrlsUploadPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java b/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java
index 3e64963..5a95bba 100644
--- a/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java
+++ b/src/java/org/apache/nutch/webui/pages/UserSettingsPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java b/src/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
index 52fe98e..dbc2be5 100644
--- a/src/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
+++ b/src/java/org/apache/nutch/webui/pages/assets/NutchUiCssReference.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java b/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
index ab24652..8518c0b 100644
--- a/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
+++ b/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabel.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java b/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
index 274eeae..d12b594 100644
--- a/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
+++ b/src/java/org/apache/nutch/webui/pages/components/ColorEnumLabelBuilder.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java b/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
index 2175c67..9ffa77b 100644
--- a/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
+++ b/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java b/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
index 887b49e..e778f83 100644
--- a/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
+++ b/src/java/org/apache/nutch/webui/pages/crawls/CrawlPanel.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java b/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
index 366b005..df9419b 100644
--- a/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
+++ b/src/java/org/apache/nutch/webui/pages/crawls/CrawlsPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java b/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
index cc54a7b..09ac71a 100644
--- a/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
+++ b/src/java/org/apache/nutch/webui/pages/instances/InstancePanel.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.webui.pages.instances;
 
 import org.apache.nutch.webui.model.NutchInstance;
diff --git a/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java b/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
index bcdaa4d..0085e46 100644
--- a/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
+++ b/src/java/org/apache/nutch/webui/pages/menu/VerticalMenu.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java b/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
index 79370cd..b837438 100644
--- a/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
+++ b/src/java/org/apache/nutch/webui/pages/seed/SeedListsPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java b/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java
index b395905..fd9c418 100644
--- a/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java
+++ b/src/java/org/apache/nutch/webui/pages/seed/SeedPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java b/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
index baf341c..dd3780b 100644
--- a/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
+++ b/src/java/org/apache/nutch/webui/pages/settings/SettingsPage.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.webui.pages.settings;
 
 import java.util.Iterator;
diff --git a/src/java/org/apache/nutch/webui/service/CrawlService.java b/src/java/org/apache/nutch/webui/service/CrawlService.java
index c742b48..c23c0da 100644
--- a/src/java/org/apache/nutch/webui/service/CrawlService.java
+++ b/src/java/org/apache/nutch/webui/service/CrawlService.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/NutchInstanceService.java b/src/java/org/apache/nutch/webui/service/NutchInstanceService.java
index 23f27e8..8607539 100644
--- a/src/java/org/apache/nutch/webui/service/NutchInstanceService.java
+++ b/src/java/org/apache/nutch/webui/service/NutchInstanceService.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/NutchService.java b/src/java/org/apache/nutch/webui/service/NutchService.java
index 643236a..6a145c4 100644
--- a/src/java/org/apache/nutch/webui/service/NutchService.java
+++ b/src/java/org/apache/nutch/webui/service/NutchService.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/SeedListService.java b/src/java/org/apache/nutch/webui/service/SeedListService.java
index dda8c71..731ab6c 100644
--- a/src/java/org/apache/nutch/webui/service/SeedListService.java
+++ b/src/java/org/apache/nutch/webui/service/SeedListService.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java b/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
index 09798ea..1d9120e 100644
--- a/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
+++ b/src/java/org/apache/nutch/webui/service/impl/CrawlServiceImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java b/src/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
index e100054..ac8b815 100644
--- a/src/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
+++ b/src/java/org/apache/nutch/webui/service/impl/NutchInstanceServiceImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java b/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
index 8e34866..322b864 100644
--- a/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
+++ b/src/java/org/apache/nutch/webui/service/impl/NutchServiceImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java b/src/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
index fced2d3..d54d3ff 100644
--- a/src/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
+++ b/src/java/org/apache/nutch/webui/service/impl/SeedListServiceImpl.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index 1acda93..950732f 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index 6cfd421..9980ae8 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java b/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java
index dce9061..802a33c 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/package-info.java
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 /**
  * This packages uses the <a href="http://any23.apache.org">Apache Any23</a> library
  * for parsing and extracting structured data in RDF format from a
diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
index ad7d8cf..1367e19 100644
--- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
+++ b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23IndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
index 251dfaf..3f0ace3 100644
--- a/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
+++ b/src/plugin/any23/src/test/org/apache/nutch/any23/TestAny23ParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d8826e8..51c3fe7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -54,6 +54,7 @@
     <ant dir="indexer-dummy" target="deploy"/>
     <ant dir="indexer-elastic" target="deploy"/>
     <ant dir="indexer-elastic-rest" target="deploy"/>
+    <ant dir="indexer-kafka" target="deploy"/>
     <ant dir="indexer-rabbit" target="deploy"/>
     <ant dir="indexer-solr" target="deploy"/>
     <ant dir="language-identifier" target="deploy"/>
@@ -88,6 +89,7 @@
     <ant dir="urlfilter-automaton" target="deploy"/>
     <ant dir="urlfilter-domain" target="deploy" />
     <ant dir="urlfilter-domainblacklist" target="deploy" />
+    <ant dir="urlfilter-fast" target="deploy"/>
     <ant dir="urlfilter-prefix" target="deploy"/>
     <ant dir="urlfilter-regex" target="deploy"/>
     <ant dir="urlfilter-suffix" target="deploy"/>
@@ -145,6 +147,7 @@
      <ant dir="urlfilter-automaton" target="test"/>
      <ant dir="urlfilter-domain" target="test"/>
      <ant dir="urlfilter-domainblacklist" target="test"/>
+     <ant dir="urlfilter-fast" target="test"/>
      <!--ant dir="urlfilter-ignoreexempt" target="test"/-->
      <ant dir="urlfilter-prefix" target="test"/>
      <ant dir="urlfilter-regex" target="test"/>
@@ -191,6 +194,7 @@
     <ant dir="indexer-dummy" target="clean"/>
     <ant dir="indexer-elastic" target="clean"/>
     <ant dir="indexer-elastic-rest" target="clean"/>
+    <ant dir="indexer-kafka" target="clean"/>
     <ant dir="indexer-rabbit" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>
     <ant dir="language-identifier" target="clean"/>
@@ -232,6 +236,7 @@
     <ant dir="urlfilter-automaton" target="clean"/>
     <ant dir="urlfilter-domain" target="clean" />
     <ant dir="urlfilter-domainblacklist" target="clean" />
+    <ant dir="urlfilter-fast" target="clean"/>
     <ant dir="urlfilter-ignoreexempt" target="clean"/>
     <ant dir="urlfilter-prefix" target="clean"/>
     <ant dir="urlfilter-regex" target="clean"/>
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
index 71e344b..e639746 100644
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ b/src/plugin/creativecommons/conf/nutch-site.xml
@@ -26,13 +26,6 @@
 </property>
 
 <property>
-  <name>http.max.delays</name>
-  <value>3</value>
-  <description>The CC crawl visits a large number of different
-  hosts, so we should not need to delay much.</description>
-</property>
-
-<property>
   <name>creativecommons.exclude.unlicensed</name>
   <value>true</value>
   <description>Exclude HTML content which does not contain a CC license.
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
index cfaac1f..8636580 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.creativecommons.nutch;
 
 import org.apache.nutch.metadata.CreativeCommons;
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
index 7b20a28..3c9a8b2 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.creativecommons.nutch;
 
 import org.apache.nutch.metadata.CreativeCommons;
diff --git a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
index 41be9ed..de18968 100644
--- a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
+++ b/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.creativecommons.nutch;
 
 import org.apache.nutch.metadata.Metadata;
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
index 2a067d7..5a2fa77 100644
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
+++ b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.feed;
 
 import java.util.Date;
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
index feac070..646c4f9 100644
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
+++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
index 9243009..915ee54 100644
--- a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
+++ b/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.feed;
 
 import java.util.Iterator;
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
index beddfc3..d955001 100644
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.headings;
 
 import java.util.ArrayList;
diff --git a/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java b/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
index 082b5f4..0795a9a 100644
--- a/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
+++ b/src/plugin/headings/src/test/org/apache/nutch/parse/headings/TestHeadingsParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.headings;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
index 0e62fea..94cd1fc 100644
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
+++ b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.basic;
 
 import org.apache.nutch.metadata.Nutch;
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index 38e75b1..8e48529 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
index 19e1f74..8bdc9fa 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
index e0b3904..350fb6f 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 /**
  * <p>This plugin implements an indexing filter which takes 
  * advantage of the 
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
index 890020a..633e0d4 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/JexlIndexingFilter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.jexl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java
index 809f716..7ca13ef 100644
--- a/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java
+++ b/src/plugin/index-jexl-filter/src/java/org/apache/nutch/indexer/jexl/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 /**
  * <p>This plugin implements a dynamic indexing filter which uses JEXL 
  * expressions to allow filtering based on the page's metadata 
diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
index 2b52230..35370f2 100644
--- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
+++ b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
index 0b22a98..0f8b660 100644
--- a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
+++ b/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index 74d9eb1..be56377 100644
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.metadata;
 
 import java.util.Arrays;
@@ -94,12 +93,15 @@
   
   protected void add(NutchDocument doc, String key, String value) {
     if (separator == null || value.indexOf(separator) == -1 || !mvFields.contains(key)) {
-      doc.add(key, value);
+      value = value.trim();
+      if (!value.isEmpty()) {
+        doc.add(key, value);
+      }
     } else {
       String[] parts = value.split(separator);
       for (String part : parts) {
         part = part.trim();
-        if (part.length() != 0) {
+        if (!part.isEmpty()) {
           doc.add(key, part);
         }
       }
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 05c215f..45b79b7 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
index 038229d..04c5765 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
index 4066ce0..4592de9 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java b/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
index ca90ca3..fcd8cb5 100644
--- a/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
+++ b/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.replace;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
index 1a81041..52d5c11 100644
--- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
+++ b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.staticfield;
 
 import java.util.HashMap;
diff --git a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
index 5783981..3ccf840 100644
--- a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
+++ b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.cloudsearch;
 
 import java.nio.charset.StandardCharsets;
diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 1564541..160d03d 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.csv;
 
 import java.io.IOException;
diff --git a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
index 9110cd9..761d042 100644
--- a/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/test/org/apache/nutch/indexwriter/csv/TestCSVIndexWriter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.csv;
 
 import static org.junit.Assert.assertEquals;
diff --git a/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyConstants.java b/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyConstants.java
index 46d6d45..cd84091 100644
--- a/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyConstants.java
+++ b/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyConstants.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.dummy;
 
 public interface DummyConstants {
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index d9a1b3e..a82beae 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.elastic;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/TestElasticIndexWriter.java b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/TestElasticIndexWriter.java
index 6fb1ab2..ea9552c 100644
--- a/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/TestElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/test/org/apache/nutch/indexwriter/elastic/TestElasticIndexWriter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexwriter.elastic;
 
 import java.io.IOException;
diff --git a/src/plugin/indexer-kafka/build-ivy.xml b/src/plugin/indexer-kafka/build-ivy.xml
new file mode 100644
index 0000000..0932dfc
--- /dev/null
+++ b/src/plugin/indexer-kafka/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0"/>
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+        <isset property="env.IVY_HOME"/>
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant"/>
+    <property name="ivy.checksums" value=""/>
+    <property name="ivy.jar.dir" value="${ivy.home}/lib"/>
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar"/>
+
+    <!-- Fetches the Ivy jar into ${ivy.jar.dir}; skipped when the "offline" property is set. -->
+    <target name="download-ivy" unless="offline">
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy over HTTPS: Maven Central no longer accepts plain-HTTP requests -->
+        <get src="https://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar"
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+        <!-- try to load ivy here from ivy home, in case the user has not already dropped
+                it into ant's lib dir (note that the latter copy will always take precedence).
+                We will not fail as long as local lib dir exists (it may be empty) and
+                ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+    <target name="deps-jar" depends="init-ivy">
+        <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+    </target>
+
+</project>
diff --git a/src/plugin/indexer-kafka/build.xml b/src/plugin/indexer-kafka/build.xml
new file mode 100644
index 0000000..c2f8078
--- /dev/null
+++ b/src/plugin/indexer-kafka/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="indexer-kafka" default="jar-core">
+
+    <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/indexer-kafka/ivy.xml b/src/plugin/indexer-kafka/ivy.xml
new file mode 100644
index 0000000..26f143e
--- /dev/null
+++ b/src/plugin/indexer-kafka/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+      <license name="Apache 2.0"/>
+      <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+      <description>
+          Apache Nutch
+      </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+      <dependency org="org.apache.kafka" name="kafka_2.12" rev="1.1.0"/>
+      <dependency org="org.apache.kafka" name="connect-json" rev="1.1.0"/>
+  </dependencies>
+
+</ivy-module>
diff --git a/src/plugin/indexer-kafka/plugin.xml b/src/plugin/indexer-kafka/plugin.xml
new file mode 100644
index 0000000..c5cc21c
--- /dev/null
+++ b/src/plugin/indexer-kafka/plugin.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="indexer-kafka" name="KafkaIndexWriter" version="1.0.0"
+        provider-name="nutch.apache.org">
+
+    <runtime>
+        <library name="indexer-kafka.jar">
+            <export name="*"/>
+        </library>
+        <library name="kafka_2.12-1.1.0.jar"/>
+        <library name="connect-json-1.1.0.jar"/>
+        <library name="connect-api-1.1.0.jar"/>
+        <library name="jackson-annotations-2.9.0.jar"/>
+        <library name="jackson-core-2.9.4.jar"/>
+        <library name="jackson-databind-2.9.4.jar"/>
+        <library name="jopt-simple-5.0.4.jar"/>
+        <library name="kafka-clients-1.1.0.jar"/>
+        <library name="lz4-java-1.4.jar"/>
+        <library name="metrics-core-2.2.0.jar"/>
+        <library name="scala-library-2.12.4.jar"/>
+        <library name="scala-logging_2.12-3.7.2.jar"/>
+        <library name="scala-reflect-2.12.4.jar"/>
+        <library name="snappy-java-1.1.7.1.jar"/>
+        <library name="zkclient-0.10.jar"/>
+        <library name="zookeeper-3.4.10.jar"/>
+    </runtime>
+
+    <requires>
+        <import plugin="nutch-extensionpoints"/>
+    </requires>
+
+    <extension id="org.apache.nutch.indexer.kafka"
+               name="Kafka Index Writer"
+               point="org.apache.nutch.indexer.IndexWriter">
+        <implementation id="KafkaIndexWriter"
+                        class="org.apache.nutch.indexwriter.kafka.KafkaIndexWriter"/>
+    </extension>
+
+</plugin>
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
new file mode 100644
index 0000000..f722382
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaConstants.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+/** Parameter names read by KafkaIndexWriter from the index writer configuration. */
+public interface KafkaConstants {
+  String HOST = "host";
+  String PORT = "port";
+
+  String KEY_SERIALIZER = "key.serializer";
+  String VALUE_SERIALIZER = "value.serializer";
+
+  String TOPIC = "topic";
+  String MAX_DOC_COUNT = "max.doc.count";
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
new file mode 100644
index 0000000..1702004
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/KafkaIndexWriter.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexwriter.kafka;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang3.exception.ExceptionUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.indexer.IndexWriter;
+import org.apache.nutch.indexer.IndexWriterParams;
+import org.apache.nutch.indexer.NutchDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.kafka.clients.producer.KafkaProducer;
+import org.apache.kafka.clients.producer.ProducerConfig;
+import org.apache.kafka.clients.producer.ProducerRecord;
+import com.fasterxml.jackson.databind.JsonNode;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+/**
+ * Sends Nutch documents to a configured Kafka Cluster
+ */
+public class KafkaIndexWriter implements IndexWriter {
+  public static Logger LOG = LoggerFactory.getLogger(KafkaIndexWriter.class);
+
+  /** Shared JSON mapper, reused for every document instead of one per call. */
+  private static final ObjectMapper MAPPER = new ObjectMapper();
+
+  private org.apache.kafka.clients.producer.Producer<String, JsonNode> producer;
+  private Configuration config;
+
+  private int port = -1;
+  private String host = null;
+  private String valueSerializer = null;
+  private String keySerializer = null;
+  private String topic = null;
+  private int maxDocCount = -1;
+
+  /** Records buffered by write() until the next commit(). */
+  private List<ProducerRecord<String, JsonNode>> inputDocs = null;
+
+  @Override
+  public void open(Configuration job, String name) throws IOException {
+    //Implementation not required
+  }
+
+  /**
+   * Reads the writer parameters and creates the Kafka producer.
+   * @throws RuntimeException if no host or no topic is configured
+   */
+  @Override
+  public void open(IndexWriterParams params) throws IOException {
+    host = params.get(KafkaConstants.HOST);
+    port = params.getInt(KafkaConstants.PORT, 9092);
+    keySerializer = params.get(KafkaConstants.KEY_SERIALIZER,
+        "org.apache.kafka.common.serialization.ByteArraySerializer");
+    valueSerializer = params.get(KafkaConstants.VALUE_SERIALIZER,
+        "org.apache.kafka.connect.json.JsonSerializer");
+    topic = params.get(KafkaConstants.TOPIC);
+    maxDocCount = params.getInt(KafkaConstants.MAX_DOC_COUNT, 100);
+
+    // A non-positive max.doc.count must not make the buffer allocation fail.
+    inputDocs = new ArrayList<ProducerRecord<String, JsonNode>>(Math.max(maxDocCount, 1));
+
+    requireParam(host, "host");
+    requireParam(topic, "topic");
+
+    Properties configProperties = new Properties();
+    configProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG,
+        host + ":" + port);
+    configProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG,
+        keySerializer);
+    configProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG,
+        valueSerializer);
+
+    // NOTE(review): presumably lets Kafka load the serializers via the plugin class loader.
+    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
+    producer = new KafkaProducer<String, JsonNode>(configProperties);
+  }
+
+  /** Logs and throws a RuntimeException if the mandatory parameter is blank. */
+  private void requireParam(String value, String name) {
+    if (StringUtils.isBlank(value)) {
+      String message = "Missing " + name + ". It should be set in index-writers.xml";
+      message += "\n" + describe();
+      LOG.error(message);
+      throw new RuntimeException(message);
+    }
+  }
+
+  /** Buffers the document as JSON; sends the buffer once max.doc.count is reached. */
+  @Override
+  public void write(NutchDocument doc) throws IOException {
+    if (inputDocs == null) {
+      LOG.warn("Kafka index writer is not open, document is not sent");
+      return;
+    }
+    Map<String, Object> source = new HashMap<String, Object>();
+    // Every field becomes an array of its distinct string values.
+    for (String fieldName : doc.getFieldNames()) {
+      Set<String> allFieldValues = new HashSet<String>();
+      for (Object value : doc.getField(fieldName).getValues()) {
+        allFieldValues.add(value.toString());
+      }
+      source.put(fieldName, allFieldValues.toArray(new String[allFieldValues.size()]));
+    }
+    JsonNode json = MAPPER.valueToTree(source);
+    inputDocs.add(new ProducerRecord<String, JsonNode>(topic, json));
+    // ">=" so that a limit of zero or less sends every document at once.
+    if (inputDocs.size() >= maxDocCount) {
+      commit();
+    }
+  }
+
+  @Override
+  public void delete(String key) throws IOException {
+    // Not applicable in Kafka
+  }
+
+  @Override
+  public void update(NutchDocument doc) throws IOException {
+    try {
+      write(doc);
+    } catch (IOException e) {
+      LOG.error(ExceptionUtils.getStackTrace(e));
+      throw e;
+    }
+  }
+
+  /** Sends all buffered records and blocks until the sends have completed. */
+  @Override
+  public void commit() throws IOException {
+    if (producer == null || inputDocs == null) {
+      LOG.warn("Kafka index writer is not open, nothing to commit");
+      return;
+    }
+    for (ProducerRecord<String, JsonNode> datum : inputDocs) {
+      producer.send(datum);
+    }
+    inputDocs.clear();
+    // send() only queues the records; flush() waits until they are sent.
+    producer.flush();
+  }
+
+  /** Commits what is still buffered and always releases the producer. */
+  @Override
+  public void close() throws IOException {
+    if (producer == null) {
+      return; // never opened, nothing to release
+    }
+    try {
+      commit();
+    } finally {
+      producer.close();
+    }
+  }
+
+  @Override
+  public Map<String, Map.Entry<String, Object>> describe() {
+    Map<String, Map.Entry<String, Object>> properties = new LinkedHashMap<>();
+    properties.put(KafkaConstants.HOST, new AbstractMap.SimpleEntry<>(
+        "Location of the host Kafka cluster to connect to using producerConfig",
+        this.host));
+    properties.put(KafkaConstants.PORT, new AbstractMap.SimpleEntry<>(
+        "The port to connect to using the producerConfig", this.port));
+    properties.put(KafkaConstants.TOPIC, new AbstractMap.SimpleEntry<>(
+        "Kafka topic to which the documents are sent", this.topic));
+    properties.put(KafkaConstants.KEY_SERIALIZER, new AbstractMap.SimpleEntry<>(
+        "instruct how to turn the key object the user provides with their ProducerRecord into bytes",
+        this.keySerializer));
+    properties.put(KafkaConstants.VALUE_SERIALIZER, new AbstractMap.SimpleEntry<>(
+        "instruct how to turn the value object the user provides with their ProducerRecord into bytes",
+        this.valueSerializer));
+    properties.put(KafkaConstants.MAX_DOC_COUNT, new AbstractMap.SimpleEntry<>(
+        "Maximum number of documents before a commit is forced",
+        this.maxDocCount));
+    return properties;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    config = conf;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return config;
+  }
+}
diff --git a/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
new file mode 100644
index 0000000..b720872
--- /dev/null
+++ b/src/plugin/indexer-kafka/src/java/org/apache/nutch/indexwriter/kafka/package-info.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index writer plugin to produce JSON messages to <a href="https://kafka.apache.org/">Kafka</a>.
+ */
+package org.apache.nutch.indexwriter.kafka;
diff --git a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
index 475d313..86ca3eb 100644
--- a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
+++ b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
@@ -18,8 +18,6 @@
 
 import java.lang.invoke.MethodHandles;
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.net.URLDecoder;
 import java.time.format.DateTimeFormatter;
 import java.util.AbstractMap;
 import java.util.ArrayList;
@@ -153,16 +151,6 @@
   }
 
   public void delete(String key) throws IOException {
-    try {
-      key = URLDecoder.decode(key, "UTF8");
-    } catch (UnsupportedEncodingException e) {
-      LOG.error("Error decoding: " + key);
-      throw new IOException("UnsupportedEncodingException for " + key);
-    } catch (IllegalArgumentException e) {
-      LOG.warn("Could not decode: " + key
-          + ", it probably wasn't encoded in the first place..");
-    }
-
     // escape solr hash separator
     key = key.replaceAll("!", "\\!");
 
@@ -269,9 +257,19 @@
         LOG.info(
             "SolrIndexer: deleting " + Integer.toString(deleteIds.size()) + "/"
                 + Integer.toString(totalDeletes) + " documents");
-        for (SolrClient solrClient : solrClients) {
-          solrClient.deleteById(deleteIds);
+        
+        UpdateRequest req = new UpdateRequest();
+        req.deleteById(deleteIds);
+        req.setAction(UpdateRequest.ACTION.OPTIMIZE, false, false);
+        req.setParams(params);
+        if (this.auth) {
+          req.setBasicAuthCredentials(this.username, this.password);
         }
+        
+        for (SolrClient solrClient : solrClients) {
+          solrClient.request(req);
+        }
+        
       } catch (final SolrServerException e) {
         LOG.error("Error deleting: " + deleteIds);
         throw makeIOException(e);
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index 68f1b6f..28878dc 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
index 6336afa..10289e5 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
index 8245151..4a381fa 100644
--- a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index 6430535..f54534c 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -37,7 +37,8 @@
 
   <dependencies>
     <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.59" />
+    <dependency org="org.seleniumhq.selenium" name="htmlunit-driver" rev="2.35.1" />
     
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml
index 290a137..bdfed92 100644
--- a/src/plugin/lib-htmlunit/plugin.xml
+++ b/src/plugin/lib-htmlunit/plugin.xml
@@ -29,76 +29,97 @@
         <export name="*"/>
      </library>
      <!-- all classes from dependent libraries are exported -->
-     <library name="cglib-nodep-2.1_3.jar">
+     <library name="animal-sniffer-annotations-1.14.jar">
        <export name="*"/>
      </library>
-     <library name="commons-codec-1.9.jar">
+     <library name="byte-buddy-1.8.15.jar">
        <export name="*"/>
      </library>
-     <library name="commons-collections-3.2.1.jar">
+     <library name="checker-compat-qual-2.0.0.jar">
        <export name="*"/>
      </library>
-     <library name="commons-exec-1.1.jar">
+     <library name="commons-codec-1.11.jar">
        <export name="*"/>
      </library>
-     <library name="commons-io-2.4.jar">
+     <library name="commons-exec-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.6.jar">
        <export name="*"/>
      </library>
      <library name="commons-jxpath-1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-lang3-3.3.2.jar">
+     <library name="commons-lang3-3.9.jar">
        <export name="*"/>
      </library>
-     <library name="commons-logging-1.1.3.jar">
+     <library name="commons-logging-1.2.jar">
        <export name="*"/>
      </library>
-     <library name="cssparser-0.9.14.jar">
+     <library name="commons-net-3.6.jar">
        <export name="*"/>
      </library>
-     <library name="gson-2.3.jar">
+     <library name="commons-text-1.6.jar">
        <export name="*"/>
      </library>
-     <library name="guava-18.0.jar">
+     <library name="error_prone_annotations-2.1.3.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-2.15.jar">
+     <library name="guava-25.0-jre.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-core-js-2.15.jar">
+     <library name="htmlunit-2.35.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpclient-4.3.4.jar">
+     <library name="htmlunit-core-js-2.35.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpcore-4.3.2.jar">
+     <library name="htmlunit-cssparser-1.4.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpmime-4.3.3.jar">
+     <library name="htmlunit-driver-2.35.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.5.8.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.4.11.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.5.8.jar">
        <export name="*"/>
      </library>
      <library name="ini4j-0.5.2.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-http-8.1.15.v20140411.jar">
+     <library name="j2objc-annotations-1.1.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-io-8.1.15.v20140411.jar">
+     <library name="jetty-client-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-util-8.1.15.v20140411.jar">
+     <library name="jetty-http-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-websocket-8.1.15.v20140411.jar">
+     <library name="jetty-io-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jna-3.4.0.jar">
+     <library name="jetty-util-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="nekohtml-1.9.21.jar">
+     <library name="jetty-xml-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="netty-3.5.2.Final.jar">
+     <library name="jsr305-1.3.9.jar">
+       <export name="*"/>
+     </library>
+     <library name="neko-htmlunit-2.35.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="okhttp-3.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="okio-1.14.0.jar">
        <export name="*"/>
      </library>
      <library name="operadriver-1.5.jar">
@@ -110,52 +131,55 @@
      <library name="phantomjsdriver-1.2.1.jar">
        <export name="*"/>
      </library>
-     <library name="platform-3.4.0.jar">
-       <export name="*"/>
-     </library>
      <library name="protobuf-java-2.4.1.jar">
        <export name="*"/>
      </library>
-     <library name="sac-1.3.jar">
+     <library name="selenium-api-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-api-2.44.0.jar">
+     <library name="selenium-chrome-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-chrome-driver-2.44.0.jar">
+     <library name="selenium-edge-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-firefox-driver-2.44.0.jar">
+     <library name="selenium-firefox-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-htmlunit-driver-2.44.0.jar">
+     <library name="selenium-ie-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-ie-driver-2.44.0.jar">
+     <library name="selenium-java-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-java-2.44.0.jar">
+     <library name="selenium-opera-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-remote-driver-2.44.0.jar">
+     <library name="selenium-remote-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-safari-driver-2.44.0.jar">
+     <library name="selenium-safari-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-support-2.44.0.jar">
+     <library name="selenium-support-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="serializer-2.7.1.jar">
+     <library name="serializer-2.7.2.jar">
        <export name="*"/>
      </library>
-     <library name="webbit-0.4.14.jar">
+     <library name="websocket-api-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="xalan-2.7.1.jar">
+     <library name="websocket-client-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="xercesImpl-2.11.0.jar">
+     <library name="websocket-common-9.4.16.v20190411.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.12.0.jar">
        <export name="*"/>
      </library>
      <library name="xml-apis-1.4.01.jar">
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
index b1103f8..28de010 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/BlockedException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.http.api;
 
 public class BlockedException extends HttpException {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index a5c0a90..d5bc0b7 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -28,6 +28,7 @@
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ThreadLocalRandom;
 
@@ -36,6 +37,7 @@
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.protocols.ProtocolLogUtil;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
@@ -45,6 +47,7 @@
 import org.apache.nutch.util.GZIPUtils;
 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.DeflateUtils;
+import org.apache.nutch.util.URLUtil;
 import org.apache.hadoop.util.StringUtils;
 
 import org.apache.hadoop.conf.Configuration;
@@ -66,6 +69,9 @@
   private HttpRobotRulesParser robots = null;
 
   private ArrayList<String> userAgentNames = null;
+  
+  /** Mapping hostnames to cookies */
+  private Map<String, String> hostCookies = null;
 
   /** The proxy hostname. */
   protected String proxyHost = null;
@@ -86,7 +92,7 @@
   protected int timeout = 10000;
 
   /** The length limit for downloaded content, in bytes. */
-  protected int maxContent = 64 * 1024;
+  protected int maxContent = 1024 * 1024;
 
   /** The time limit to download the entire content, in seconds. */
   protected int maxDuration = 300;
@@ -118,6 +124,12 @@
   private Configuration conf = null;
 
   /**
+   * Logging utility, used to suppress stack traces for common exceptions in a
+   * configurable way.
+   */
+  private ProtocolLogUtil logUtil = new ProtocolLogUtil();
+
+  /**
    * MimeUtil for MIME type detection. Note (see NUTCH-2578): MimeUtil object is
    * used concurrently by parallel fetcher threads, methods to detect MIME type
    * must be thread-safe.
@@ -194,7 +206,7 @@
     this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
-    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    this.maxContent = conf.getInt("http.content.limit", 1024 * 1024);
     this.maxDuration = conf.getInt("http.time.limit", -1);
     this.partialAsTruncated = conf
         .getBoolean("http.partial.truncated", false);
@@ -219,6 +231,8 @@
     this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
     this.robots.setConf(conf);
 
+    this.logUtil.setConf(conf);
+
     // NUTCH-1941: read list of alternating agent names
     if (conf.getBoolean("http.agent.rotate", false)) {
       String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
@@ -257,6 +271,42 @@
             .warn("Falling back to fixed user agent set via property http.agent.name");
       }
     }
+    
+    // If cookies are enabled, try to load a per-host cookie file
+    if (enableCookieHeader) {
+      String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt");
+      BufferedReader br = null;
+      try {
+        Reader reader = conf.getConfResourceAsReader(cookieFile);
+        br = new BufferedReader(reader);
+        hostCookies = new HashMap<String,String>();
+        String word = "";
+        while ((word = br.readLine()) != null) {
+          if (!word.trim().isEmpty()) {
+            if (word.indexOf("#") == -1) { // skip comment
+              String[] parts = word.split("\t");
+              if (parts.length == 2) {
+                hostCookies.put(parts[0], parts[1]);
+              } else {
+                LOG.warn("Unable to parse cookie file correctly at: " + word);
+              }
+            }
+          }
+        }
+      } catch (Exception e) {
+        logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile,
+            StringUtils.stringifyException(e));
+        hostCookies = null;
+      } finally {
+        if (br != null) {
+          try {
+            br.close();
+          } catch (IOException e) {
+            // ignore
+          }
+        }
+      }
+    }
 
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
         "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
@@ -395,7 +445,12 @@
             ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
       }
     } catch (Throwable e) {
-      logger.error("Failed to get protocol output", e);
+      if (logger.isDebugEnabled() || !logUtil.logShort(e)) {
+        logger.error("Failed to get protocol output", e);
+      } else {
+        logger.error("Failed to get protocol output: {}",
+            e.getClass().getName());
+      }
       return new ProtocolOutput(null, new ProtocolStatus(e));
     }
   }
@@ -479,6 +534,21 @@
     }
     return userAgent;
   }
+  
+  /**
+   * Looks up the cookie configured for the host of the given URL in the
+   * per-host cookie mapping, if such a mapping has been loaded.
+   *
+   * @param url the URL whose host is used as the lookup key
+   * @return the configured cookie, or null if there is none for that host
+   */
+  public String getCookie(URL url) {
+    if (hostCookies == null) {
+      // no per-host cookie mapping available
+      return null;
+    }
+    return hostCookies.get(url.getHost());
+  }
 
   /**
    * Value of "Accept-Language" request header sent by Nutch.
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
index d7ee51a..0e0a963 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index 6b9de08..f761bd0 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.http.api;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
index 23e4ef6..93bb51b 100644
--- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
+++ b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.http.api;
 
 import org.junit.Assert;
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
index e408586..6eebb73 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexRule.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 993b37d..2cf6dc1 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
index fec01ad..c77c67e 100644
--- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
+++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -43,8 +43,8 @@
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private final static String SEPARATOR = System.getProperty("file.separator");
-  private final static String SAMPLES = System.getProperty("test.data", ".");
+  protected final static String SEPARATOR = System.getProperty("file.separator");
+  protected final static String SAMPLES = System.getProperty("test.data", ".");
 
   protected abstract URLFilter getURLFilter(Reader rules);
 
@@ -72,6 +72,24 @@
         + (System.currentTimeMillis() - start) + "ms");
   }
 
+  protected void bench(int loops, String rulesFile, String urlsFile) {
+    try (FileReader rules = new FileReader(SAMPLES + SEPARATOR + rulesFile);
+        FileReader urls = new FileReader(SAMPLES + SEPARATOR + urlsFile)) {
+      bench(loops, rules, urls); // readers are closed by try-with-resources
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
+  protected void test(String rulesFile, String urlsFile) {
+    try (FileReader rules = new FileReader(SAMPLES + SEPARATOR + rulesFile);
+        FileReader urls = new FileReader(SAMPLES + SEPARATOR + urlsFile)) {
+      test(rules, urls); // readers are closed by try-with-resources
+    } catch (Exception e) {
+      Assert.fail(e.toString());
+    }
+  }
+
   protected void test(String file) {
     try {
       test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"),
diff --git a/src/plugin/lib-selenium/README.md b/src/plugin/lib-selenium/README.md
new file mode 100644
index 0000000..1c6b37c
--- /dev/null
+++ b/src/plugin/lib-selenium/README.md
@@ -0,0 +1,13 @@
+# Updates
+* The use of phantomjs has been deprecated. Check [Wikipedia](https://en.wikipedia.org/wiki/PhantomJS) for more info.
+* The updated code for the Safari WebDriver is under development: starting with Safari 10 on OS X El Capitan and macOS Sierra, Safari comes bundled with a new driver implementation.
+* The Opera driver is now based on ChromeDriver, adapted by Opera to enable programmatic automation of Chromium-based Opera products, but it hasn't been updated since April 5, 2017. We have suspended its support and removed it from the code ([link](https://github.com/operasoftware/operachromiumdriver)).
+* Headless mode has been added for Chrome and Firefox. Set `selenium.enable.headless` to `true` in nutch-default.xml or nutch-site.xml to use it.
+
+
+You can run Nutch in Docker. Check some examples at https://github.com/sbatururimi/nutch-test.
+Don't forget to update the Dockerfile to point to the original Nutch repository when updated.
+
+# Contributors
+Stas Batururimi [s.batururimi@gmail.com]
+
diff --git a/src/plugin/lib-selenium/build-ivy.xml b/src/plugin/lib-selenium/build-ivy.xml
index 3abcf6d..fe919e5 100644
--- a/src/plugin/lib-selenium/build-ivy.xml
+++ b/src/plugin/lib-selenium/build-ivy.xml
@@ -17,7 +17,7 @@
 -->
 <project name="lib-selenium" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
 
-    <property name="ivy.install.version" value="2.1.0" />
+    <property name="ivy.install.version" value="2.4.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">
       <isset property="env.IVY_HOME" />
     </condition>
diff --git a/src/plugin/lib-selenium/ivy.xml b/src/plugin/lib-selenium/ivy.xml
index 701b725..d70dfaf 100644
--- a/src/plugin/lib-selenium/ivy.xml
+++ b/src/plugin/lib-selenium/ivy.xml
@@ -37,16 +37,13 @@
 
   <dependencies>
     <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.48.2" />
-    
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.5" />
+    <!-- 
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
     </dependency>
-    <dependency org="com.codeborne" name="phantomjsdriver" rev="1.2.1" >
-      <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
-      <exclude org="org.seleniumhq.selenium" name="selenium-java" />
-    </dependency>
+    -->
     <!-- end selenium dependencies -->
   </dependencies>
-  
+
 </ivy-module>
diff --git a/src/plugin/lib-selenium/plugin.xml b/src/plugin/lib-selenium/plugin.xml
index a86d665..bf50ca0 100644
--- a/src/plugin/lib-selenium/plugin.xml
+++ b/src/plugin/lib-selenium/plugin.xml
@@ -29,147 +29,65 @@
         <export name="*"/>
      </library>
      <!-- all classes from dependent libraries are exported -->
-     <library name="cglib-nodep-2.1_3.jar">
+     <library name="animal-sniffer-annotations-1.14.jar">
        <export name="*"/>
      </library>
-     <library name="commons-codec-1.10.jar">
+     <library name="byte-buddy-1.8.15.jar">
        <export name="*"/>
      </library>
-     <library name="commons-collections-3.2.1.jar">
+     <library name="checker-compat-qual-2.0.0.jar">
        <export name="*"/>
      </library>
      <library name="commons-exec-1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-io-2.4.jar">
+     <library name="error_prone_annotations-2.1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-jxpath-1.3.jar">
+     <library name="guava-25.0-jre.jar">
        <export name="*"/>
      </library>
-     <library name="commons-lang3-3.4.jar">
+     <library name="j2objc-annotations-1.1.jar">
        <export name="*"/>
      </library>
-     <library name="commons-logging-1.2.jar">
+     <library name="jsr305-1.3.9.jar">
        <export name="*"/>
      </library>
-     <library name="cssparser-0.9.16.jar">
+     <library name="okhttp-3.11.0.jar">
        <export name="*"/>
      </library>
-     <library name="gson-2.3.1.jar">
+     <library name="okio-1.14.0.jar">
        <export name="*"/>
      </library>
-     <library name="guava-18.0.jar">
+     <library name="selenium-api-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-2.18.jar">
+     <library name="selenium-chrome-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-core-js-2.17.jar">
+     <library name="selenium-edge-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpclient-4.5.1.jar">
+     <library name="selenium-firefox-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpcore-4.4.3.jar">
+     <library name="selenium-ie-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="httpmime-4.5.jar">
+     <library name="selenium-java-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="ini4j-0.5.2.jar">
+     <library name="selenium-opera-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-io-9.2.12.v20150709.jar">
+     <library name="selenium-remote-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-util-9.2.12.v20150709.jar">
+     <library name="selenium-safari-driver-3.141.5.jar">
        <export name="*"/>
      </library>
-     <library name="jna-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="jna-platform-4.1.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="nekohtml-1.9.22.jar">
-       <export name="*"/>
-     </library>
-     <library name="netty-3.5.2.Final.jar">
-       <export name="*"/>
-     </library>
-     <library name="operadriver-1.5.jar">
-       <export name="*"/>
-     </library>
-     <library name="operalaunchers-1.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="phantomjsdriver-1.2.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="protobuf-java-2.4.1.jar">
-       <export name="*"/>
-     </library>
-     <library name="sac-1.3.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-api-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-chrome-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-edge-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-firefox-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-htmlunit-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-ie-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-java-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-leg-rc-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-remote-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-safari-driver-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="selenium-support-2.48.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="serializer-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="webbit-0.4.14.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-api-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-client-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="websocket-common-9.2.12.v20150709.jar">
-       <export name="*"/>
-     </library>
-     <library name="xalan-2.7.2.jar">
-       <export name="*"/>
-     </library>
-     <library name="xercesImpl-2.11.0.jar">
-       <export name="*"/>
-     </library>
-     <library name="xml-apis-1.4.01.jar">
+     <library name="selenium-support-3.141.5.jar">
        <export name="*"/>
      </library>
    </runtime>
-
 </plugin>
diff --git a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
index 6e137f9..6af20b0 100644
--- a/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
+++ b/src/plugin/lib-selenium/src/java/org/apache/nutch/protocol/selenium/HttpWebClient.java
@@ -24,182 +24,274 @@
 import java.io.OutputStream;
 import java.net.URL;
 import java.util.concurrent.TimeUnit;
+import java.util.Random;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IOUtils;
+
 import org.openqa.selenium.By;
+import org.openqa.selenium.Capabilities;
 import org.openqa.selenium.OutputType;
 import org.openqa.selenium.TakesScreenshot;
 import org.openqa.selenium.TimeoutException;
 import org.openqa.selenium.WebDriver;
+
 import org.openqa.selenium.chrome.ChromeDriver;
-import org.openqa.selenium.firefox.FirefoxBinary;
+import org.openqa.selenium.chrome.ChromeOptions;
+
+//import org.openqa.selenium.firefox.FirefoxBinary;
 import org.openqa.selenium.firefox.FirefoxDriver;
-import org.openqa.selenium.firefox.FirefoxProfile;
+//import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.firefox.FirefoxOptions;
+
 import org.openqa.selenium.io.TemporaryFilesystem;
+
 import org.openqa.selenium.remote.DesiredCapabilities;
 import org.openqa.selenium.remote.RemoteWebDriver;
-import org.openqa.selenium.safari.SafariDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriver;
-import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+
+//import org.openqa.selenium.safari.SafariDriver;
+
+//import org.openqa.selenium.phantomjs.PhantomJSDriver;
+//import org.openqa.selenium.phantomjs.PhantomJSDriverService;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.opera.core.systems.OperaDriver;
+import org.openqa.selenium.opera.OperaOptions;
+import org.openqa.selenium.opera.OperaDriver;
+//import com.opera.core.systems.OperaDriver;
 
 public class HttpWebClient {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  public static ThreadLocal<WebDriver> threadWebDriver = new ThreadLocal<WebDriver>() {
-
-    @Override
-    protected WebDriver initialValue()
-    {
-      FirefoxProfile profile = new FirefoxProfile();
-      profile.setPreference("permissions.default.stylesheet", 2);
-      profile.setPreference("permissions.default.image", 2);
-      profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", "false");
-      profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, "localhost");
-      WebDriver driver = new FirefoxDriver(profile);
-      return driver;          
-    };
-  };
-
   public static WebDriver getDriverForPage(String url, Configuration conf) {
-      WebDriver driver = null;
-      DesiredCapabilities capabilities = null;
-      long pageLoadWait = conf.getLong("page.load.delay", 3);
+    WebDriver driver = null;
+    long pageLoadWait = conf.getLong("page.load.delay", 3);
 
-      try {
-        String driverType  = conf.get("selenium.driver", "firefox");
-        switch (driverType) {
-          case "firefox":
-          	String allowedHost = conf.get("selenium.firefox.allowed.hosts", "localhost");
-          	long firefoxBinaryTimeout = conf.getLong("selenium.firefox.binary.timeout", 45);
-          	boolean enableFlashPlayer = conf.getBoolean("selenium.firefox.enable.flash", false);
-          	int loadImage = conf.getInt("selenium.firefox.load.image", 1);
-          	int loadStylesheet = conf.getInt("selenium.firefox.load.stylesheet", 1);
-    		    FirefoxProfile profile = new FirefoxProfile();
-    		    FirefoxBinary binary = new FirefoxBinary();
-    		    profile.setPreference(FirefoxProfile.ALLOWED_HOSTS_PREFERENCE, allowedHost);
-    		    profile.setPreference("dom.ipc.plugins.enabled.libflashplayer.so", enableFlashPlayer);
-    		    profile.setPreference("permissions.default.stylesheet", loadStylesheet);
-  	      	profile.setPreference("permissions.default.image", loadImage);
-    		    binary.setTimeout(TimeUnit.SECONDS.toMillis(firefoxBinaryTimeout));
-            driver = new FirefoxDriver(binary, profile);
-            break;
-          case "chrome":
-            driver = new ChromeDriver();
-            break;
-          case "safari":
-            driver = new SafariDriver();
-            break;
-          case "opera":
-            driver = new OperaDriver();
-            break;
-          case "phantomjs":
-            driver = new PhantomJSDriver();
-            break;
-          case "remote":
-            String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
-            int seleniumHubPort = Integer.parseInt(conf.get("selenium.hub.port", "4444"));
-            String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
-            String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
-            String seleniumGridDriver = conf.get("selenium.grid.driver","firefox");
-            String seleniumGridBinary = conf.get("selenium.grid.binary");
+    try {
+      String driverType = conf.get("selenium.driver", "firefox");
+      boolean enableHeadlessMode = conf.getBoolean("selenium.enable.headless",
+          false);
 
-            switch (seleniumGridDriver){
-              case "firefox":
-                capabilities = DesiredCapabilities.firefox();
-                capabilities.setBrowserName("firefox");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability("firefox_binary",seleniumGridBinary);
-                System.setProperty("webdriver.reap_profile", "false");
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              case "phantomjs":
-                capabilities = DesiredCapabilities.phantomjs();
-                capabilities.setBrowserName("phantomjs");
-                capabilities.setJavascriptEnabled(true);
-                capabilities.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY,seleniumGridBinary);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), capabilities);
-                break;
-              default:
-                LOG.error("The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-                driver = new RemoteWebDriver(new URL(seleniumHubProtocol, seleniumHubHost, seleniumHubPort, seleniumHubPath), DesiredCapabilities.firefox());
-                break;
-            }
-            break;
-          default:
-            LOG.error("The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().", driverType);
-            driver = new FirefoxDriver();
-            break;
+      switch (driverType) {
+      case "firefox":
+        String geckoDriverPath = conf.get("selenium.grid.binary",
+            "/root/geckodriver");
+        driver = createFirefoxWebDriver(geckoDriverPath, enableHeadlessMode);
+        break;
+      case "chrome":
+        String chromeDriverPath = conf.get("selenium.grid.binary",
+            "/root/chromedriver");
+        driver = createChromeWebDriver(chromeDriverPath, enableHeadlessMode);
+        break;
+      // case "opera":
+      // // This class is provided as a convenience for easily testing the
+      // Chrome browser.
+      // String operaDriverPath = conf.get("selenium.grid.binary",
+      // "/root/operadriver");
+      // driver = createOperaWebDriver(operaDriverPath, enableHeadlessMode);
+      // break;
+      case "remote":
+        String seleniumHubHost = conf.get("selenium.hub.host", "localhost");
+        int seleniumHubPort = Integer
+            .parseInt(conf.get("selenium.hub.port", "4444"));
+        String seleniumHubPath = conf.get("selenium.hub.path", "/wd/hub");
+        String seleniumHubProtocol = conf.get("selenium.hub.protocol", "http");
+        URL seleniumHubUrl = new URL(seleniumHubProtocol, seleniumHubHost,
+            seleniumHubPort, seleniumHubPath);
+
+        String seleniumGridDriver = conf.get("selenium.grid.driver", "firefox");
+
+        switch (seleniumGridDriver) {
+        case "firefox":
+          driver = createFirefoxRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        case "chrome":
+          driver = createChromeRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        case "random":
+          driver = createRandomRemoteWebDriver(seleniumHubUrl,
+              enableHeadlessMode);
+          break;
+        default:
+          // log the unsupported grid driver, not the outer "remote" type
+          LOG.error(
+              "The Selenium Grid WebDriver choice {} is not available... defaulting to FirefoxDriver().",
+              seleniumGridDriver);
+          driver = createDefaultRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
+          break;
         }
-        LOG.debug("Selenium {} WebDriver selected.", driverType);
-  
-        driver.manage().timeouts().pageLoadTimeout(pageLoadWait, TimeUnit.SECONDS);
-        driver.get(url);
-      } catch (Exception e) {
-			  if(e instanceof TimeoutException) {
-          LOG.debug("Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
-          return driver;
-			  }
-			  cleanUpDriver(driver);
-		    throw new RuntimeException(e);
-	    } 
+        break;
+      default:
+        LOG.error(
+            "The Selenium WebDriver choice {} is not available... defaulting to FirefoxDriver().",
+            driverType);
+        FirefoxOptions options = new FirefoxOptions();
+        driver = new FirefoxDriver(options);
+        break;
+      }
+      LOG.debug("Selenium {} WebDriver selected.", driverType);
 
-      return driver;
+      driver.manage().timeouts().pageLoadTimeout(pageLoadWait,
+          TimeUnit.SECONDS);
+      driver.get(url);
+    } catch (Exception e) {
+      if (e instanceof TimeoutException) {
+        // page load timed out: hand back the driver with the partial page
+        LOG.error(
+            "Selenium WebDriver: Timeout Exception: Capturing whatever loaded so far...");
+        return driver;
+      }
+      LOG.error("Selenium WebDriver failed to load {}", url, e);
+      cleanUpDriver(driver);
+      throw new RuntimeException(e);
+    }
+
+    return driver;
   }
 
-  public static String getHTMLContent(WebDriver driver, Configuration conf) {
-      if (conf.getBoolean("take.screenshot", false)) {
-        takeScreenshot(driver, conf);
-      }
+  /** Creates a local Firefox driver using the geckodriver at the given path. */
+  public static WebDriver createFirefoxWebDriver(String firefoxDriverPath,
+      boolean enableHeadlessMode) {
+    System.setProperty("webdriver.gecko.driver", firefoxDriverPath);
+    FirefoxOptions options = new FirefoxOptions();
+    if (enableHeadlessMode) {
+      options.addArguments("--headless");
+    }
+    return new FirefoxDriver(options);
+  }
 
-      return driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+  /**
+   * Creates a local Chrome driver. Set selenium.enable.headless to true when
+   * no monitor is attached to the server.
+   */
+  public static WebDriver createChromeWebDriver(String chromeDriverPath,
+      boolean enableHeadlessMode) {
+    // without this property WebDriver searches the PATH for chromedriver
+    System.setProperty("webdriver.chrome.driver", chromeDriverPath);
+    ChromeOptions options = new ChromeOptions();
+    options.addArguments("--no-sandbox", "--disable-extensions");
+    if (enableHeadlessMode) {
+      options.addArguments("--headless");
+    }
+    return new ChromeDriver(options);
+  }
+
+  /**
+   * Creates a local Opera driver. Set selenium.enable.headless to true when
+   * no monitor is attached to the server.
+   */
+  public static WebDriver createOperaWebDriver(String operaDriverPath,
+      boolean enableHeadlessMode) {
+    // without this property WebDriver searches the PATH for operadriver
+    System.setProperty("webdriver.opera.driver", operaDriverPath);
+    OperaOptions options = new OperaOptions();
+    // options.setBinary("/usr/bin/opera");
+    options.addArguments("--no-sandbox", "--disable-extensions");
+    if (enableHeadlessMode) {
+      options.addArguments("--headless");
+    }
+    return new OperaDriver(options);
+  }
+
+  /** Creates a Firefox driver served by the Selenium hub at the given URL. */
+  public static RemoteWebDriver createFirefoxRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    FirefoxOptions options = new FirefoxOptions();
+    if (enableHeadlessMode) {
+      // request a browser without a visible window
+      options.setHeadless(true);
+    }
+    return new RemoteWebDriver(seleniumHubUrl, options);
+  }
+
+  /** Creates a Chrome driver served by the Selenium hub at the given URL. */
+  public static RemoteWebDriver createChromeRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    ChromeOptions options = new ChromeOptions();
+    if (enableHeadlessMode) {
+      options.setHeadless(true);
+    }
+    return new RemoteWebDriver(seleniumHubUrl, options);
+  }
+
+  /**
+   * Creates a remote web driver for a browser picked at random on each call.
+   * Only Firefox and Chrome are hardcoded as candidates for now; the candidate
+   * list should later move to the configuration (ex: Edge, Opera, Safari).
+   *
+   * @param seleniumHubUrl
+   *          URL of the Selenium hub the driver connects to
+   * @param enableHeadlessMode
+   *          whether the chosen browser is started headless
+   * @return a Firefox or a Chrome {@link RemoteWebDriver}
+   */
+  public static RemoteWebDriver createRandomRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) {
+    // number of candidate browsers; extend the dispatch below when raising it
+    final int browserTypes = 2;
+    int choice = new Random().nextInt(browserTypes);
+    if (choice != 0) {
+      return createChromeRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
+    }
+    return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
+  }
+
+  public static RemoteWebDriver createDefaultRemoteWebDriver(URL seleniumHubUrl,
+      boolean enableHeadlessMode) { // fallback grid browser is Firefox
+    return createFirefoxRemoteWebDriver(seleniumHubUrl, enableHeadlessMode);
   }
 
   public static void cleanUpDriver(WebDriver driver) {
     if (driver != null) {
       try {
-	      driver.close();
+        // driver.close();
         driver.quit();
         TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
       } catch (Exception e) {
-        throw new RuntimeException(e);
+        // best effort: log with stack trace instead of rethrowing
+        LOG.error("Failed to clean up Selenium WebDriver", e);
       }
     }
   }
 
   /**
-   * Function for obtaining the HTML BODY using the selected
-   * <a href='https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium webdriver</a>
-   * There are a number of configuration properties within
-   * <code>nutch-site.xml</code> which determine whether to
-   * take screenshots of the rendered pages and persist them
-   * as timestamped .png's into HDFS.
-   * @param url the URL to fetch and render
-   * @param conf the {@link org.apache.hadoop.conf.Configuration}
+   * Function for obtaining the HTML BODY using the selected <a href=
+   * 'https://seleniumhq.github.io/selenium/docs/api/java/org/openqa/selenium/WebDriver.html'>selenium
+   * webdriver</a> There are a number of configuration properties within
+   * <code>nutch-site.xml</code> which determine whether to take screenshots of
+   * the rendered pages and persist them as timestamped .png's into HDFS.
+   * 
+   * @param url
+   *          the URL to fetch and render
+   * @param conf
+   *          the {@link org.apache.hadoop.conf.Configuration}
    * @return the rendered inner HTML page
    */
   public static String getHtmlPage(String url, Configuration conf) {
     WebDriver driver = getDriverForPage(url, conf);
-    
+
     try {
       if (conf.getBoolean("take.screenshot", false)) {
         takeScreenshot(driver, conf);
       }
 
-      String innerHtml = driver.findElement(By.tagName("body")).getAttribute("innerHTML");
+      String innerHtml = driver.findElement(By.tagName("body"))
+          .getAttribute("innerHTML");
       return innerHtml;
 
-      // I'm sure this catch statement is a code smell ; borrowing it from lib-htmlunit
+      // I'm sure this catch statement is a code smell ; borrowing it from
+      // lib-htmlunit
     } catch (Exception e) {
       TemporaryFilesystem.getDefaultTmpFS().deleteTemporaryFiles();
+      // throw new RuntimeException(e);
+      LOG.error("getHtmlPage(url, conf): " + e.toString());
       throw new RuntimeException(e);
     } finally {
       cleanUpDriver(driver);
@@ -213,24 +305,32 @@
   private static void takeScreenshot(WebDriver driver, Configuration conf) {
     try {
       String url = driver.getCurrentUrl();
-      File srcFile = ((TakesScreenshot)driver).getScreenshotAs(OutputType.FILE);
+      File srcFile = ((TakesScreenshot) driver)
+          .getScreenshotAs(OutputType.FILE);
       LOG.debug("In-memory screenshot taken of: {}", url);
       FileSystem fs = FileSystem.get(conf);
       if (conf.get("screenshot.location") != null) {
-        Path screenshotPath = new Path(conf.get("screenshot.location") + "/" + srcFile.getName());
+        Path screenshotPath = new Path(
+            conf.get("screenshot.location") + "/" + srcFile.getName());
         OutputStream os = null;
         if (!fs.exists(screenshotPath)) {
-          LOG.debug("No existing screenshot already exists... creating new file at {} {}.", screenshotPath, srcFile.getName());
+          LOG.debug(
+              "No existing screenshot already exists... creating new file at {} {}.",
+              screenshotPath, srcFile.getName());
           os = fs.create(screenshotPath);
         }
         InputStream is = new BufferedInputStream(new FileInputStream(srcFile));
         IOUtils.copyBytes(is, os, conf);
-        LOG.debug("Screenshot for {} successfully saved to: {} {}", url, screenshotPath, srcFile.getName()); 
+        LOG.debug("Screenshot for {} successfully saved to: {} {}", url,
+            screenshotPath, srcFile.getName());
       } else {
-        LOG.warn("Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
-            + "'screenshot.location' is absent from nutch-site.xml.", url);
+        LOG.warn(
+            "Screenshot for {} not saved to HDFS (subsequently disgarded) as value for "
+                + "'screenshot.location' is absent from nutch-site.xml.",
+            url);
       }
     } catch (Exception e) {
+      LOG.error("Error taking screenshot: ", e);
       cleanUpDriver(driver);
       throw new RuntimeException(e);
     }
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
index 8fbfb58..e0fcfa7 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
index cc5fc45..0efcbb3 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
index cf92a7d..aa370ea 100644
--- a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
+++ b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.filter;
 
 import org.slf4j.Logger;
diff --git a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java b/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
index 4522f99..28b9f40 100644
--- a/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
+++ b/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.filter;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
index a4c14a9..dfebb53 100644
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
+++ b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.Content;
diff --git a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java b/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
index a399273..782a152 100644
--- a/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
+++ b/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.ext;
 
 import org.apache.nutch.protocol.ProtocolFactory;
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
index 6a1038b..62b7b6d 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
@@ -1,10 +1,4 @@
 /*
- * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
- * avoid dependency on Xalan.
- */
-
-/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -20,6 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.DOMBuilder, in order to
+ * avoid dependency on Xalan.
+ */
 /*
  * $Id$
  */
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index 95a419a..a9aa0e4 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import java.net.URL;
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index b454ccb..4e7ef14 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import java.net.URL;
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 0d1d17e..5852b14 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import java.io.ByteArrayInputStream;
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
index eb382e8..7361ac7 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/XMLCharacterRecognizer.java
@@ -1,10 +1,4 @@
 /*
- * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
- * XXX in order to avoid dependency on Xalan.
- */
-
-/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -20,6 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
 /*
  * $Id$
  */
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
index 0faa013..0c1212a 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import org.apache.nutch.parse.Outlink;
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
index a4c8206..397a310 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
index 5089a10..e756350 100644
--- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
+++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.html;
 
 import org.apache.nutch.parse.HTMLMetaTags;
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index f9b9722..96c56fc 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 024aadf..3f03af9 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.metatags;
 
 import java.util.Set;
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
index 81d4485..bcb8c36 100644
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
+++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.swf;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java b/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
index 129b85f..688e9b9 100644
--- a/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
+++ b/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.swf;
 
 import java.io.FileInputStream;
diff --git a/src/plugin/parse-tika/build-ivy.xml b/src/plugin/parse-tika/build-ivy.xml
index e4984d8..738f041 100644
--- a/src/plugin/parse-tika/build-ivy.xml
+++ b/src/plugin/parse-tika/build-ivy.xml
@@ -17,14 +17,14 @@
 -->
 <project name="parse-tika" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
 
-    <property name="ivy.install.version" value="2.1.0" />
+    <property name="ivy.install.version" value="2.4.0" />
     <condition property="ivy.home" value="${env.IVY_HOME}">
       <isset property="env.IVY_HOME" />
     </condition>
     <property name="ivy.home" value="${user.home}/.ant" />
     <property name="ivy.checksums" value="" />
     <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy-${ivy.install.version}.jar" />
 
     <target name="download-ivy" unless="offline">
 
diff --git a/src/plugin/parse-tika/howto_upgrade_tika.txt b/src/plugin/parse-tika/howto_upgrade_tika.txt
index fbf7207..aa4147c 100644
--- a/src/plugin/parse-tika/howto_upgrade_tika.txt
+++ b/src/plugin/parse-tika/howto_upgrade_tika.txt
@@ -1,4 +1,4 @@
-1. Upgrade Tika depencency (tika-core) in ivy/ivy.xml
+1. Upgrade Tika dependency (tika-core) in ivy/ivy.xml
 
 2. Upgrade Tika dependency in src/plugin/parse-tika/ivy.xml
 
diff --git a/src/plugin/parse-tika/ivy.xml b/src/plugin/parse-tika/ivy.xml
index df06f14..08d0f12 100644
--- a/src/plugin/parse-tika/ivy.xml
+++ b/src/plugin/parse-tika/ivy.xml
@@ -36,8 +36,8 @@
   </publications>
 
   <dependencies>
-    <dependency org="org.apache.tika" name="tika-parsers" rev="1.20" conf="*->default">
-      <!-- exclusions of dependencies in Nutch core (ivy/ivy.xml) -->
+    <dependency org="org.apache.tika" name="tika-parsers" rev="1.22" conf="*->default">
+      <!-- exclusions of dependencies provided in Nutch core (ivy/ivy.xml) -->
       <exclude org="org.apache.tika" name="tika-core" />
       <exclude org="org.apache.httpcomponents" name="httpclient" />
       <exclude org="org.apache.httpcomponents" name="httpcore" />
@@ -50,10 +50,13 @@
       <exclude org="org.apache.cxf" name="cxf-core" />
       <exclude org="org.apache.cxf" name="cxf-rt-transports-http" />
       <exclude org="org.apache.cxf" name="cxf-rt-frontend-jaxrs" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-core" />
       <exclude org="com.fasterxml.jackson.core" name="jackson-databind" />
+      <exclude org="com.fasterxml.jackson.core" name="jackson-annotations" />
       <exclude org="com.google.protobuf" name="protobuf-java" />
       <exclude org="org.slf4j" name="slf4j-log4j12" />
       <exclude org="org.slf4j" name="slf4j-api" />
+      <exclude org="xml-apis" name="xml-apis" /><!-- must be provided in core as it is used also by tika-core -->
     </dependency>
   </dependencies>
   
diff --git a/src/plugin/parse-tika/plugin.xml b/src/plugin/parse-tika/plugin.xml
index b89f41e..18dad6c 100644
--- a/src/plugin/parse-tika/plugin.xml
+++ b/src/plugin/parse-tika/plugin.xml
@@ -26,83 +26,91 @@
          <export name="*"/>
       </library>
       <!-- dependencies of Tika (tika-parsers) -->
-      <library name="apache-mime4j-core-0.8.2.jar"/>
-      <library name="apache-mime4j-dom-0.8.2.jar"/>
-      <library name="asm-7.0.jar"/>
-      <library name="bcmail-jdk15on-1.60.jar"/>
-      <library name="bcpkix-jdk15on-1.60.jar"/>
-      <library name="bcprov-jdk15on-1.60.jar"/>
+      <library name="animal-sniffer-annotations-1.17.jar"/>
+      <library name="ant-1.10.5.jar"/>
+      <library name="ant-launcher-1.10.5.jar"/>
+      <library name="apache-mime4j-core-0.8.3.jar"/>
+      <library name="apache-mime4j-dom-0.8.3.jar"/>
+      <library name="asm-7.2-beta.jar"/>
+      <library name="bcmail-jdk15on-1.62.jar"/>
+      <library name="bcpkix-jdk15on-1.62.jar"/>
+      <library name="bcprov-jdk15on-1.62.jar"/>
       <library name="boilerpipe-1.1.0.jar"/>
       <library name="bzip2-0.9.1.jar"/>
-      <library name="c3p0-0.9.1.1.jar"/>
+      <library name="c3p0-0.9.5.4.jar"/>
       <library name="cdm-4.5.5.jar"/>
-      <library name="commons-collections4-4.2.jar"/>
-      <library name="commons-compress-1.18.jar"/>
-      <library name="commons-csv-1.6.jar"/>
+      <library name="checker-qual-2.8.1.jar"/>
+      <library name="codemodel-2.3.2.jar"/>
+      <library name="commons-csv-1.7.jar"/>
       <library name="commons-exec-1.3.jar"/>
       <library name="commons-io-2.6.jar"/>
-      <library name="commons-lang3-3.8.1.jar"/>
+      <library name="commons-logging-1.2.jar"/>
       <library name="commons-math3-3.6.1.jar"/>
       <library name="curvesapi-1.05.jar"/>
-      <library name="cxf-core-3.2.7.jar"/>
-      <library name="cxf-rt-frontend-jaxrs-3.2.7.jar"/>
-      <library name="cxf-rt-rs-client-3.2.7.jar"/>
-      <library name="cxf-rt-transports-http-3.2.7.jar"/>
+      <library name="cxf-rt-rs-client-3.3.2.jar"/>
+      <library name="cxf-rt-security-3.3.2.jar"/>
       <library name="dec-0.1.2.jar"/>
+      <library name="dtd-parser-1.4.1.jar"/>
       <library name="ehcache-core-2.6.2.jar"/>
-      <library name="FastInfoset-1.2.15.jar"/>
-      <library name="fontbox-2.0.13.jar"/>
+      <library name="error_prone_annotations-2.3.2.jar"/>
+      <library name="failureaccess-1.0.1.jar"/>
+      <library name="FastInfoset-1.2.16.jar"/>
+      <library name="fontbox-2.0.16.jar"/>
       <library name="geoapi-3.0.1.jar"/>
       <library name="grib-4.5.5.jar"/>
       <library name="gson-2.8.5.jar"/>
-      <library name="guava-17.0.jar"/>
-      <library name="httpmime-4.5.6.jar"/>
+      <library name="guava-28.0-jre.jar"/>
+      <library name="httpmime-4.5.9.jar"/>
       <library name="httpservices-4.5.5.jar"/>
       <library name="isoparser-1.1.22.jar"/>
-      <library name="istack-commons-runtime-3.0.7.jar"/>
-      <library name="jackcess-2.1.12.jar"/>
-      <library name="jackcess-encrypt-2.1.4.jar"/>
-      <library name="jackson-annotations-2.9.7.jar"/>
-      <library name="jackson-core-2.9.7.jar"/>
-      <library name="jackson-databind-2.9.7.jar"/>
+      <library name="istack-commons-runtime-3.0.8.jar"/>
+      <library name="istack-commons-tools-3.0.8.jar"/>
+      <library name="j2objc-annotations-1.3.jar"/>
+      <library name="jackcess-3.0.1.jar"/>
+      <library name="jackcess-encrypt-3.0.0.jar"/>
       <library name="jai-imageio-core-1.4.0.jar"/>
+      <library name="jakarta.activation-1.2.1.jar"/>
+      <library name="jakarta.activation-api-1.2.1.jar"/>
+      <library name="jakarta.ws.rs-api-2.1.5.jar"/>
+      <library name="jakarta.xml.bind-api-2.3.2.jar"/>
       <library name="java-libpst-0.8.1.jar"/>
-      <library name="javax.activation-1.2.0.jar"/>
       <library name="javax.annotation-api-1.3.2.jar"/>
-      <library name="javax.ws.rs-api-2.1.1.jar"/>
-      <library name="jaxb-api-2.3.1.jar"/>
-      <library name="jaxb-runtime-2.3.1.jar"/>
+      <library name="jaxb-runtime-2.3.2.jar"/>
+      <library name="jaxb-xjc-2.3.2.jar"/>
       <library name="jbig2-imageio-3.0.2.jar"/>
       <library name="jcip-annotations-1.0.jar"/>
-      <library name="jcl-over-slf4j-1.7.25.jar"/>
+      <library name="jcl-over-slf4j-1.7.26.jar"/>
       <library name="jcommander-1.35.jar"/>
       <library name="jdom2-2.0.6.jar"/>
       <library name="jempbox-1.8.16.jar"/>
       <library name="jhighlight-1.0.3.jar"/>
       <library name="jmatio-1.5.jar"/>
-      <library name="jna-5.1.0.jar"/>
+      <library name="jna-5.3.1.jar"/>
       <library name="joda-time-2.2.jar"/>
       <library name="json-simple-1.1.1.jar"/>
-      <library name="jsoup-1.11.3.jar"/>
-      <library name="jul-to-slf4j-1.7.25.jar"/>
+      <library name="jsoup-1.12.1.jar"/>
+      <library name="jsr305-3.0.2.jar"/>
+      <library name="jul-to-slf4j-1.7.26.jar"/>
       <library name="juniversalchardet-1.0.3.jar"/>
-      <library name="junrar-2.0.0.jar"/>
+      <library name="junrar-4.0.0.jar"/>
+      <library name="listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar"/>
+      <library name="mchange-commons-java-0.2.15.jar"/>
       <library name="metadata-extractor-2.11.0.jar"/>
       <library name="netcdf4-4.5.5.jar"/>
-      <library name="openjson-1.0.10.jar"/>
-      <library name="opennlp-tools-1.9.0.jar"/>
-      <library name="parso-2.0.10.jar"/>
-      <library name="pdfbox-2.0.13.jar"/>
-      <library name="pdfbox-tools-2.0.13.jar"/>
+      <library name="openjson-1.0.11.jar"/>
+      <library name="opennlp-tools-1.9.1.jar"/>
+      <library name="parso-2.0.11.jar"/>
+      <library name="pdfbox-2.0.16.jar"/>
+      <library name="pdfbox-tools-2.0.16.jar"/>
       <library name="poi-4.0.1.jar"/>
       <library name="poi-ooxml-4.0.1.jar"/>
       <library name="poi-ooxml-schemas-4.0.1.jar"/>
       <library name="poi-scratchpad-4.0.1.jar"/>
-      <library name="procyon-compilertools-0.5.32.jar"/>
-      <library name="procyon-core-0.5.32.jar"/>
       <library name="quartz-2.2.0.jar"/>
-      <library name="rome-1.12.0.jar"/>
-      <library name="rome-utils-1.12.0.jar"/>
+      <library name="relaxng-datatype-2.3.2.jar"/>
+      <library name="rngom-2.3.2.jar"/>
+      <library name="rome-1.12.1.jar"/>
+      <library name="rome-utils-1.12.1.jar"/>
       <library name="sentiment-analysis-parser-0.1.jar"/>
       <library name="sis-feature-0.8.jar"/>
       <library name="sis-metadata-0.8.jar"/>
@@ -111,20 +119,20 @@
       <library name="sis-storage-0.8.jar"/>
       <library name="sis-utility-0.8.jar"/>
       <library name="stax2-api-3.1.4.jar"/>
-      <library name="stax-ex-1.8.jar"/>
+      <library name="stax-ex-1.8.1.jar"/>
       <library name="tagsoup-1.2.1.jar"/>
-      <library name="tika-parsers-1.20.jar"/>
-      <library name="txw2-2.3.1.jar"/>
+      <library name="tika-parsers-1.22.jar"/>
+      <library name="txw2-2.3.2.jar"/>
       <library name="udunits-4.5.5.jar"/>
-      <library name="uimafit-core-2.4.0.jar"/>
-      <library name="uimaj-core-3.0.1.jar"/>
       <library name="unit-api-1.0.jar"/>
       <library name="vorbis-java-core-0.8.jar"/>
       <library name="vorbis-java-tika-0.8.jar"/>
       <library name="woodstox-core-5.0.3.jar"/>
+      <library name="xercesImpl-2.12.0.jar"/>
       <library name="xmlbeans-3.0.2.jar"/>
-      <library name="xmlschema-core-2.2.3.jar"/>
+      <library name="xmlschema-core-2.2.4.jar"/>
       <library name="xmpcore-5.1.3.jar"/>
+      <library name="xsom-2.3.2.jar"/>
       <library name="xz-1.8.jar"/>
       <!-- end of dependencies of Tika (tika-parsers) -->
    </runtime>
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index 34da6a0..9948136 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.net.MalformedURLException;
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 09762e0..58f93ac 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.net.MalformedURLException;
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 7440333..3a48c98 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -21,8 +21,11 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -73,6 +76,7 @@
   private boolean upperCaseElementNames = true;
   private String boilerpipeExtractorName;
   private boolean useBoilerpipe;
+  private Set<String> boilerpipeMimeTypes;
 
   public ParseResult getParse(Content content) {
     HTMLDocumentImpl doc = new HTMLDocumentImpl();
@@ -114,7 +118,7 @@
     ContentHandler domHandler;
 
     // Check whether to use Tika's BoilerplateContentHandler
-    if (useBoilerpipe) {
+    if (useBoilerpipe && boilerpipeMimeTypes.contains(mimeType)) {
       BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(
           (ContentHandler) new DOMBuilder(doc, root),
           BoilerpipeExtractorRepository.getExtractor(boilerpipeExtractorName));
@@ -253,15 +257,17 @@
       try {
         // see if a Tika config file can be found in the job file
         URL customTikaConfig = conf.getResource(customConfFile);
-        if (customTikaConfig != null)
+        if (customTikaConfig != null) {
           tikaConfig = new TikaConfig(customTikaConfig,
               this.getClass().getClassLoader());
+        }
       } catch (Exception e1) {
         String message = "Problem loading custom Tika configuration from "
             + customConfFile;
         LOG.error(message, e1);
       }
-    } else {
+    }
+    if (tikaConfig == null) {
       try {
         tikaConfig = new TikaConfig(this.getClass().getClassLoader());
       } catch (Exception e2) {
@@ -291,16 +297,18 @@
       }
     }
 
-    htmlParseFilters = new HtmlParseFilters(getConf());
+    htmlParseFilters = new HtmlParseFilters(conf);
     utils = new DOMContentUtils(conf);
-    cachingPolicy = getConf().get("parser.caching.forbidden.policy",
+    cachingPolicy = conf.get("parser.caching.forbidden.policy",
         Nutch.CACHING_FORBIDDEN_CONTENT);
-    upperCaseElementNames = getConf().getBoolean("tika.uppercase.element.names",
+    upperCaseElementNames = conf.getBoolean("tika.uppercase.element.names",
         true);
-    useBoilerpipe = getConf().get("tika.extractor", "none")
-        .equals("boilerpipe");
-    boilerpipeExtractorName = getConf()
-        .get("tika.extractor.boilerpipe.algorithm", "ArticleExtractor");
+    useBoilerpipe = conf.get("tika.extractor", "none").equals("boilerpipe");
+    boilerpipeExtractorName = conf.get("tika.extractor.boilerpipe.algorithm",
+        "ArticleExtractor");
+    boilerpipeMimeTypes = new HashSet<>(Arrays
+        .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types",
+            "text/html", "application/xhtml+xml")));
   }
 
   public Configuration getConf() {
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
index d625c33..06cb10f 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/XMLCharacterRecognizer.java
@@ -1,10 +1,4 @@
 /*
- * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
- * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
- * XXX in order to avoid dependency on Xalan.
- */
-
-/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -20,6 +14,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+/*
+ * XXX ab@apache.org: This class is copied verbatim from Xalan-J 2.6.0
+ * XXX distribution, org.apache.xml.utils.XMLCharacterRecognizer,
+ * XXX in order to avoid dependency on Xalan.
+ */
 /*
  * $Id: XMLCharacterRecognizer.java 823614 2009-10-09 17:02:32Z ab $
  */
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
index 06828cf..2f04d7f 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestDOMContentUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.net.URL;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index 3a4d70a..87b452c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import org.junit.Assert;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index f7d01f6..4924511 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index c688ee4..779278c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index a3c04ca..7183ceb 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index f9ad710..b0226d9 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.io.FileInputStream;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index c7d7d0a..36b2ecf 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import org.apache.nutch.protocol.ProtocolFactory;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 6585d98..b45a20f 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
index 8a949a6..7591cef 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRobotsMetaProcessor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.tika;
 
 import java.net.URL;
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
index 25a52a1..c4b953e 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.zip;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
index 966281d..019c2e3 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.zip;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
index bbb0866..767099e 100644
--- a/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
+++ b/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse.zip;
 
 import org.apache.nutch.protocol.ProtocolFactory;
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
index 697b833..c98a843 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Classify.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parsefilter.naivebayes;
 
 import java.io.BufferedReader;
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
index 05ea5c9..25354bd 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
index 19a6911..eb6864e 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/Train.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parsefilter.naivebayes;
 
 import java.io.BufferedReader;
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
index 6a892be..3543969 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index c2661a5..3c43cf5 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parsefilter.regex;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index 8a415b6..8b613e1 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 import java.lang.invoke.MethodHandles;
@@ -73,7 +72,7 @@
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);
+    this.maxContentLength = conf.getInt("file.content.limit", 1024 * 1024);
     this.crawlParents = conf.getBoolean("file.crawl.parent", true);
     this.symlinksAsRedirects = conf.getBoolean(
         "file.crawl.redirect_noncanonical", true);
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
index 2019de0..38fd303 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileError.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 /**
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
index f0467de..468b79e 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 import org.apache.nutch.protocol.ProtocolException;
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index ce98270..b2db228 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 import java.net.URL;
diff --git a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
index 5f95377..ffee2ba 100644
--- a/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
+++ b/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.file;
 
 // Hadoop imports
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
index 71059f2..1c48ab3 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import java.io.BufferedReader;
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 6d21b50..3da83bd 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import org.slf4j.Logger;
@@ -251,7 +250,7 @@
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
-    this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+    this.maxContentLength = conf.getInt("ftp.content.limit", 1024 * 1024);
     this.timeout = conf.getInt("ftp.timeout", 10000);
     this.userName = conf.get("ftp.username", "anonymous");
     this.passWord = conf.get("ftp.password", "anonymous@example.com");
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
index 558747a..8ff19f8 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpError.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 /**
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
index 5a29668..3ea0208 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import org.apache.nutch.protocol.ProtocolException;
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
index 689ac8e..70311ac 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 /**
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
index 9f35b74..d7e2480 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 /**
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
index c058fcb..91860bf 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 /**
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
index 9083d7c..f0acecd 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 /**
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
index 07adb4c..0451201 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import org.apache.commons.net.ftp.FTP;
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
index 603514b..b28d021 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
index c68eac8..d41c35a 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.ftp;
 
 import java.io.BufferedReader;
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
index cd6d742..b82880d 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 361b41e..e76bc04 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -1,13 +1,13 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -262,16 +262,14 @@
           } else if ("deflate".equals(contentEncoding)) {
             content = http.processDeflateEncoded(content, url);
           } else {
-            // store the headers verbatim only if the response was not compressed
-            // as the content length reported with not match otherwise
-            if (httpHeaders != null) {
-              headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-            }
             if (Http.LOG.isTraceEnabled()) {
               Http.LOG.trace("fetched " + content.length + " bytes from " + url);
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       }
 
     } finally {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
index 879f703..c92f7d2 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
@@ -17,7 +17,6 @@
 /*
  * Based on EasyX509TrustManager from commons-httpclient.
  */
-
 package org.apache.nutch.protocol.http;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
index 772a6c0..6c7a7be 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -50,11 +50,6 @@
    */
   public void setConf(Configuration conf) {
     super.setConf(conf);
-    // Level logLevel = Level.WARNING;
-    // if (conf.getBoolean("http.verbose", false)) {
-    // logLevel = Level.FINE;
-    // }
-    // LOG.setLevel(logLevel);
   }
 
   public static void main(String[] args) throws Exception {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 0f9c00d..2d75b1c 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -1,13 +1,13 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -213,13 +213,22 @@
         reqStr.append("\r\n");
       }
 
-      if (http.isCookieEnabled()
-          && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
-        String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
-            .toString();
-        reqStr.append("Cookie: ");
-        reqStr.append(cookie);
-        reqStr.append("\r\n");
+      if (http.isCookieEnabled()) {
+        String cookie = null;
+        
+        if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+          cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString();
+        }
+        
+        if (cookie == null) {
+          cookie = http.getCookie(url);
+        }
+        
+        if (cookie != null) {
+          reqStr.append("Cookie: ");
+          reqStr.append(cookie);
+          reqStr.append("\r\n");
+        }
       }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
@@ -294,16 +303,14 @@
         } else if ("deflate".equals(contentEncoding)) {
           content = http.processDeflateEncoded(content, url);
         } else {
-          // store the headers verbatim only if the response was not compressed
-          // as the content length reported does not match otherwise
-          if (httpHeaders != null) {
-            httpHeaders.append("\r\n");
-            headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-          }
           if (Http.LOG.isTraceEnabled()) {
             Http.LOG.trace("fetched " + content.length + " bytes from " + url);
           }
         }
+        if (httpHeaders != null) {
+          httpHeaders.append("\r\n");
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } catch (IOException | HttpException e) {
         // Headers parsing went fine, but an error occurred while trying to read
         // the body of the request (the body may be malformed)
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
index 51c7930..03ddfe9 100644
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestBadServerResponses.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.http;
 
 import static org.junit.Assert.assertEquals;
diff --git a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
index 7dd9e9b..18db917 100644
--- a/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
+++ b/src/plugin/protocol-http/src/test/org/apache/nutch/protocol/http/TestProtocolHttp.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.http;
 
 import static org.junit.Assert.assertEquals;
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
index a49b3e3..f867a56 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummySSLProtocolSocketFactory.java
@@ -14,15 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/*
- * Based on EasySSLProtocolSocketFactory from commons-httpclient:
- * 
- * $Header:
- * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
- * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
- * -0800 (Sat, 26 Feb 2005) $
- */
-
 package org.apache.nutch.protocol.httpclient;
 
 import java.lang.invoke.MethodHandles;
@@ -42,6 +33,14 @@
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.TrustManager;
 
+/*
+ * Based on EasySSLProtocolSocketFactory from commons-httpclient:
+ *
+ * $Header:
+ * /home/jerenkrantz/tmp/commons/commons-convert/cvs/home/cvs/jakarta-commons//httpclient/src/contrib/org/apache/commons/httpclient/contrib/ssl/DummySSLProtocolSocketFactory.java,v
+ * 1.7 2004/06/11 19:26:27 olegk Exp $ $Revision$ $Date: 2005-02-26 05:01:52
+ * -0800 (Sat, 26 Feb 2005) $
+ */
 public class DummySSLProtocolSocketFactory implements
     SecureProtocolSocketFactory {
 
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
index 44683cc..3188092 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
@@ -14,10 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-/*
- * Based on EasyX509TrustManager from commons-httpclient.
- */
-
 package org.apache.nutch.protocol.httpclient;
 
 import java.security.KeyStore;
@@ -30,6 +26,9 @@
 import javax.net.ssl.TrustManager;
 import javax.net.ssl.X509TrustManager;
 
+/*
+ * Based on EasyX509TrustManager from commons-httpclient.
+ */
 public class DummyX509TrustManager implements X509TrustManager {
   private X509TrustManager standardTrustManager = null;
 
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index 2cd29d3..6ca8f42 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
index 54dc905..46fc2cc 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthentication.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
index daff5ec..62125fa 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationException.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
index c4d0345..f33be6d 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpAuthenticationFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
index 506902d..277313c 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
index f9cff36..60b4b27 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthConfigurer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
index e7e96d3..868e738 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpFormAuthentication.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index 05b9e2a..010f5ca 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -100,9 +100,20 @@
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
     
-    if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
-      String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
-      get.addRequestHeader("Cookie", cookie);
+    if (http.isCookieEnabled()) {
+      String cookie = null;
+      
+      if (datum.getMetaData().containsKey(http.COOKIE)) {
+        cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+      }
+      
+      if (cookie == null) {
+        cookie = http.getCookie(url);
+      }
+      
+      if (cookie != null) {
+        get.addRequestHeader("Cookie", cookie);
+      }
     }
     
     try {
diff --git a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
index 783e5af..a897cd4 100644
--- a/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
+++ b/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.httpclient;
 
 import java.net.URL;
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
index 90d2be7..e15ae11 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/Http.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
index 6d91b33..8ebd898 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/HttpResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
index bf93996..126d9e9 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefalultMultiInteractionHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
index 59dfc20..f670d5f 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
index e0d2861..7f97ba0 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.interactiveselenium.handlers;
 
 import org.openqa.selenium.WebDriver;
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
index 7213c6e..4d397fa 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/InteractiveSeleniumHandler.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.interactiveselenium.handlers;
 
 import org.openqa.selenium.WebDriver;
diff --git a/src/plugin/protocol-okhttp/ivy.xml b/src/plugin/protocol-okhttp/ivy.xml
index 4c90351..14b38d5 100644
--- a/src/plugin/protocol-okhttp/ivy.xml
+++ b/src/plugin/protocol-okhttp/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="com.squareup.okhttp3" name="okhttp" rev="3.10.0"/>
+    <dependency org="com.squareup.okhttp3" name="okhttp" rev="3.14.2"/>
   </dependencies>
   
 </ivy-module>
diff --git a/src/plugin/protocol-okhttp/plugin.xml b/src/plugin/protocol-okhttp/plugin.xml
index 0152fb0..b843736 100755
--- a/src/plugin/protocol-okhttp/plugin.xml
+++ b/src/plugin/protocol-okhttp/plugin.xml
@@ -25,8 +25,8 @@
       <library name="protocol-okhttp.jar">
          <export name="*"/>
       </library>
-      <library name="okhttp-3.10.0.jar"/>
-      <library name="okio-1.14.0.jar"/>
+      <library name="okhttp-3.14.2.jar"/>
+      <library name="okio-1.17.2.jar"/>
    </runtime>
 
    <requires>
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index c52e225..22183c0 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
index 5e861c7..b84fdc0 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttpResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -84,11 +84,20 @@
           HttpDateFormat.toString(datum.getModifiedTime()));
     }
 
-    if (okhttp.isCookieEnabled()
-        && datum.getMetaData().containsKey(HttpBase.COOKIE)) {
-      String cookie = ((Text) datum.getMetaData().get(HttpBase.COOKIE))
-          .toString();
-      rb.header("Cookie", cookie);
+    if (okhttp.isCookieEnabled()) {
+      String cookie = null;
+      
+      if (datum.getMetaData().containsKey(HttpBase.COOKIE)) {
+        cookie = ((Text)datum.getMetaData().get(HttpBase.COOKIE)).toString();
+      }
+      
+      if (cookie == null) {
+        cookie = okhttp.getCookie(url);
+      }
+      
+      if (cookie != null) {
+        rb.header("Cookie", cookie);
+      }
     }
 
     Request request = rb.build();
@@ -148,32 +157,37 @@
     }
 
     int maxContentBytes = Integer.MAX_VALUE;
-    if (maxContent != -1) {
+    if (maxContent >= 0) {
       maxContentBytes = Math.min(maxContentBytes, maxContent);
     }
 
     BufferedSource source = responseBody.source();
-    int contentBytesBuffered = 0;
-    int contentBytesRequested = 0;
+    int bytesRequested = 0;
     int bufferGrowStepBytes = 8192;
-    while (contentBytesBuffered < maxContentBytes) {
-      contentBytesRequested += Math.min(bufferGrowStepBytes,
-          (maxContentBytes - contentBytesBuffered));
+    while (source.buffer().size() <= maxContentBytes) {
+      bytesRequested += Math.min(bufferGrowStepBytes,
+          /*
+           * request one byte more than required to reliably detect truncated
+           * content, but beware of integer overflows
+           */
+          (maxContentBytes == Integer.MAX_VALUE ? maxContentBytes
+              : (1 + maxContentBytes)) - bytesRequested);
       boolean success = false;
       try {
-        success = source.request(contentBytesRequested);
+        success = source.request(bytesRequested);
       } catch (IOException e) {
-        if (partialAsTruncated && contentBytesBuffered > 0) {
+        if (partialAsTruncated && source.buffer().size() > 0) {
           // treat already fetched content as truncated
           truncated.setReason(TruncatedContentReason.DISCONNECT);
+          LOG.info("Truncated content for {}, partial fetch caused by:", url,
+              e);
         } else {
           throw e;
         }
       }
-      contentBytesBuffered = (int) source.buffer().size();
       if (LOG.isDebugEnabled()) {
-        LOG.debug("total bytes requested = {}, buffered = {}",
-            contentBytesRequested, contentBytesBuffered);
+        LOG.debug("total bytes requested = {}, buffered = {}", bytesRequested,
+            source.buffer().size());
       }
       if (!success) {
         LOG.debug("source exhausted, no more data to read");
@@ -184,13 +198,15 @@
         truncated.setReason(TruncatedContentReason.TIME);
         break;
       }
-      if (contentBytesBuffered > maxContentBytes) {
+      if (source.buffer().size() >= maxContentBytes) {
         LOG.debug("content limit reached");
-        truncated.setReason(TruncatedContentReason.LENGTH);
       }
+      // okhttp may fetch more content than requested, forward requested bytes
+      bytesRequested = (int) source.buffer().size();
     }
-    int bytesToCopy = contentBytesBuffered;
-    if (maxContent != -1 && contentBytesBuffered > maxContent) {
+    int bytesBuffered = (int) source.buffer().size();
+    int bytesToCopy = bytesBuffered;
+    if (maxContent >= 0 && bytesToCopy > maxContent) {
       // okhttp's internal buffer is larger than maxContent
       truncated.setReason(TruncatedContentReason.LENGTH);
       bytesToCopy = maxContentBytes;
@@ -199,8 +215,8 @@
     source.buffer().readFully(arr);
     if (LOG.isDebugEnabled()) {
       LOG.debug(
-          "copied {} bytes out of {} buffered, remaining buffer contains {} bytes",
-          bytesToCopy, contentBytesBuffered, source.buffer().size());
+          "copied {} bytes out of {} buffered, remaining {} bytes in buffer",
+          bytesToCopy, bytesBuffered, source.buffer().size());
     }
     return arr;
   }
diff --git a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
index 72776c3..1e9e4a6 100644
--- a/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
+++ b/src/plugin/protocol-okhttp/src/test/conf/nutch-site-test.xml
@@ -20,6 +20,11 @@
 <configuration>
 
 <property>
+  <name>plugin.includes</name>
+  <value>protocol-okhttp</value>
+</property>
+
+<property>
   <name>http.agent.name</name>
   <value>Nutch-Test</value>
 </property>
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
index d8d2654..34c5f6f 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestBadServerResponses.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.okhttp;
 
 import static org.junit.Assert.assertEquals;
@@ -22,21 +21,28 @@
 import static org.junit.Assert.assertTrue;
 
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
 import java.lang.invoke.MethodHandles;
 import java.net.InetSocketAddress;
+import java.net.MalformedURLException;
 import java.net.ServerSocket;
 import java.net.Socket;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.zip.GZIPOutputStream;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.util.NutchConfiguration;
 import org.junit.After;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -52,7 +58,7 @@
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private OkHttp http;
+  private Protocol http;
   private ServerSocket server;
   private Configuration conf;
   private int port = 47506;
@@ -61,13 +67,15 @@
   private static final String simpleContent = "Content-Type: text/html\r\n\r\nThis is a text.";
 
   public void setUp() throws Exception {
-    conf = new Configuration();
+    conf = NutchConfiguration.create();
     conf.addResource("nutch-default.xml");
+    // plugin tests specific config file - adds protocol-okhttp to
+    // plugin.includes
     conf.addResource("nutch-site-test.xml");
     conf.setBoolean("store.http.headers", true);
 
-    http = new OkHttp();
-    http.setConf(conf);
+    http = new ProtocolFactory(conf)
+        .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
   }
 
   @After
@@ -75,15 +83,29 @@
     server.close();
   }
 
+  public static String getHeaders(ProtocolOutput response) {
+    return response.getContent().getMetadata().get(Response.RESPONSE_HEADERS);
+  }
+
+  public static String getHeader(ProtocolOutput response, String header) {
+    for (String line : getHeaders(response).split("\r\n")) {
+      String[] parts = line.split(": ", 2); // limit 2: header name, value
+      if (parts.length == 2 && parts[0].equals(header)) {
+        return parts[1];
+      }
+    }
+    return null;
+  }
+
   /**
-   * Starts the test server at a specified port and constant response.
-   * 
-   * @param portno
-   *          Port number.
-   * @param response
-   *          response sent on every request
-   */
-  private void runServer(int port, String response) throws Exception {
+   * Starts the test server at a specified port and constant response.
+   *
+   * @param port
+   *          Port number.
+   * @param response
+   *          response sent on every request
+   */
+  private void runServer(int port, byte[] response) throws Exception {
     server = new ServerSocket();
     server.bind(new InetSocketAddress("127.0.0.1", port));
     Pattern requestPattern = Pattern.compile("(?i)^GET\\s+(\\S+)");
@@ -93,9 +115,7 @@
       LOG.info("Connection received");
       try (
           BufferedReader in = new BufferedReader(new InputStreamReader(
-              socket.getInputStream(), StandardCharsets.UTF_8));
-          PrintWriter out = new PrintWriter(new OutputStreamWriter(
-              socket.getOutputStream(), StandardCharsets.UTF_8), true)) {
+              socket.getInputStream(), StandardCharsets.UTF_8))) {
 
         String line;
         while ((line = in.readLine()) != null) {
@@ -107,13 +127,11 @@
           if (m.find()) {
             LOG.info("Requested {}", m.group(1));
             if (!m.group(1).startsWith("/")) {
-              response = "HTTP/1.1 400 Bad request\r\n\r\n";
+              response = "HTTP/1.1 400 Bad request\r\n\r\n".getBytes(StandardCharsets.UTF_8);
             }
           }
         }
-        LOG.info("Response: {}",
-            response.substring(0, Math.min(1024, response.length())));
-        out.print(response);
+        socket.getOutputStream().write(response);
       } catch (Exception e) {
         LOG.warn("Exception in test server:", e);
       }
@@ -121,6 +139,10 @@
   }
 
   private void launchServer(String response) throws InterruptedException {
+    launchServer(response.getBytes(StandardCharsets.UTF_8));
+  }
+
+  private void launchServer(byte[] response) throws InterruptedException {
     Thread serverThread = new Thread(() -> {
       try {
         runServer(port, response);
@@ -142,14 +164,22 @@
    * @param expectedCode
    *          HTTP response status code expected while fetching the page.
    */
-  private Response fetchPage(String page, int expectedCode) throws Exception {
+  private ProtocolOutput fetchPage(String page, int expectedCode)
+      throws MalformedURLException {
     URL url = new URL("http", "127.0.0.1", port, page);
     LOG.info("Fetching {}", url);
     CrawlDatum crawlDatum = new CrawlDatum();
-    Response response = http.getResponse(url, crawlDatum, true);
-    assertEquals("HTTP Status Code for " + url, expectedCode,
-        response.getCode());
-    return response;
+    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
+        crawlDatum);
+    int httpStatusCode = -1;
+    if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+      httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
+          .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+    }
+
+    assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
+
+    return out;
   }
 
   @Test
@@ -215,10 +245,10 @@
     setUp();
     launchServer("HTTP/1.1 302 Found\r\nLocation: http://example.com/\r\n"
         + "Transfer-Encoding: chunked\r\n\r\nNot a valid chunk.");
-    Response fetched = fetchPage("/", 302);
-    assertNotNull("No redirect Location.", fetched.getHeader("Location"));
+    ProtocolOutput fetched = fetchPage("/", 302);
+    assertNotNull("No redirect Location.", getHeader(fetched, "Location"));
     assertEquals("Wrong redirect Location.", "http://example.com/",
-        fetched.getHeader("Location"));
+        getHeader(fetched, "Location"));
   }
 
   /**
@@ -230,9 +260,9 @@
     setUp();
     String text = "This is a text containing non-ASCII characters: \u00e4\u00f6\u00fc\u00df";
     launchServer(text);
-    Response fetched = fetchPage("/", 200);
+    ProtocolOutput fetched = fetchPage("/", 200);
     assertEquals("Wrong text returned for response with no status line.", text,
-        new String(fetched.getContent(), StandardCharsets.UTF_8));
+        new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
     server.close();
     text = "<!DOCTYPE html>\n<html>\n<head>\n"
         + "<title>Testing no HTTP header èéâ</title>\n"
@@ -242,7 +272,7 @@
     launchServer(text);
     fetched = fetchPage("/", 200);
     assertEquals("Wrong text returned for response with no status line.", text,
-        new String(fetched.getContent(), StandardCharsets.UTF_8));
+        new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
   }
 
   /**
@@ -256,18 +286,18 @@
     launchServer(responseHeader
         + "Set-Cookie: UserID=JohnDoe;\r\n  Max-Age=3600;\r\n  Version=1\r\n"
         + simpleContent);
-    Response fetched = fetchPage("/", 200);
-    LOG.info("Headers: {}", fetched.getHeaders());
-    assertNotNull("Failed to set multi-line \"Set-Cookie\" header.", fetched.getHeader("Set-Cookie"));
+    ProtocolOutput fetched = fetchPage("/", 200);
+    LOG.info("Headers: {}", getHeaders(fetched));
+    assertNotNull("Failed to set multi-line \"Set-Cookie\" header.",
+        getHeader(fetched, "Set-Cookie"));
     assertTrue("Failed to set multi-line \"Set-Cookie\" header.",
-        fetched.getHeader("Set-Cookie").contains("Version=1"));
+        getHeader(fetched, "Set-Cookie").contains("Version=1"));
   }
 
   /**
    * NUTCH-2561 protocol-http can be made to read arbitrarily large HTTP
    * responses
    */
-  @Test(expected = Exception.class)
   public void testOverlongHeader() throws Exception {
     setUp();
     StringBuilder response = new StringBuilder();
@@ -282,13 +312,14 @@
     response.append("\r\n" + simpleContent);
     launchServer(response.toString());
     // should throw exception because of overlong header
-    fetchPage("/", 200);
+    fetchPage("/", -1);
   }
 
   /**
    * NUTCH-2562 protocol-http fails to read large chunked HTTP responses,
    * NUTCH-2575 protocol-http does not respect the maximum content-size for
-   * chunked responses
+   * chunked responses. Also test whether truncations of chunked content are
+   * properly marked.
    */
   @Test
   public void testChunkedContent() throws Exception {
@@ -309,10 +340,141 @@
     }
     response.append("\r\n0\r\n\r\n");
     launchServer(response.toString());
-    Response fetched = fetchPage("/", 200);
+    ProtocolOutput fetched = fetchPage("/", 200);
     assertEquals(
         "Chunked content not truncated according to http.content.limit", 65536,
-        fetched.getContent().length);
+        fetched.getContent().getContent().length);
+    assertNotNull("Content truncation not marked",
+        fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT));
+    assertEquals("Content truncation not marked",
+        Response.TruncatedContentReason.LENGTH.toString().toLowerCase(),
+        fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT_REASON));
+  }
+
+  /**
+   * NUTCH-2729 Check for http.content.limit defined in nutch-site-test.xml:
+   * whether content is truncated to the configured 64 kB and whether it is
+   * properly marked as truncated.
+   */
+  @Test
+  public void testTruncationMarking() throws Exception {
+    setUp();
+    int[] kBs = { 63, 64, 65 };
+    for (int kB : kBs) {
+      StringBuilder response = new StringBuilder();
+      response.append(responseHeader);
+      response.append("Content-Type: text/plain\r\nContent-Length: "
+          + (kB * 1024) + "\r\n\r\n");
+      for (int i = 0; i < kB; i++) {
+        for (int j = 0; j < 16; j++) {
+          // 16 lines of 64 bytes each = 1 kB
+          response.append(
+              "abcdefghijklmnopqurstuvxyz0123456789-ABCDEFGHIJKLMNOPQURSTUVXYZ\n");
+        }
+      }
+      launchServer(response.toString());
+      ProtocolOutput fetched = fetchPage("/", 200);
+      assertEquals("Content not truncated according to http.content.limit",
+          Math.min(kB * 1024, 65536), fetched.getContent().getContent().length);
+      if (kB * 1024 > 65536) {
+        assertNotNull("Content truncation not marked",
+            fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT));
+        assertEquals("Content truncation not marked",
+            Response.TruncatedContentReason.LENGTH.toString().toLowerCase(),
+            fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT_REASON));
+      }
+      server.close(); // need to close server before next loop iteration
+    }
+  }
+
+  /**
+   * NUTCH-2729 Check for http.content.limit defined in nutch-site-test.xml:
+   * whether gzip-encoded content is truncated to the configured 64 kB (after
+   * decompression) and whether it is properly marked as truncated.
+   */
+  @Test
+  public void testTruncationMarkingGzip() throws Exception {
+    setUp();
+    int[] kBs = { 63, 64, 65 };
+    for (int kB : kBs) {
+      StringBuilder payload = new StringBuilder();
+      for (int i = 0; i < kB; i++) {
+        for (int j = 0; j < 16; j++) {
+          // 16 lines of 64 bytes each = 1 kB
+          payload.append(
+              "abcdefghijklmnopqurstuvxyz0123456789-ABCDEFGHIJKLMNOPQURSTUVXYZ\n");
+        }
+      }
+      ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+      GZIPOutputStream gzip = new GZIPOutputStream(bytes);
+      gzip.write(payload.toString().getBytes(StandardCharsets.UTF_8));
+      gzip.close();
+      StringBuilder responseHead = new StringBuilder();
+      responseHead.append(responseHeader);
+      responseHead.append("Content-Type: text/plain\r\nContent-Length: "
+          + bytes.size() + "\r\nContent-Encoding: gzip\r\n\r\n");
+      ByteArrayOutputStream response = new ByteArrayOutputStream();
+      response.write(responseHead.toString().getBytes(StandardCharsets.UTF_8));
+      response.write(bytes.toByteArray());
+
+      launchServer(response.toByteArray());
+      ProtocolOutput fetched = fetchPage("/", 200);
+      assertEquals("Content not truncated according to http.content.limit",
+          Math.min(kB * 1024, 65536), fetched.getContent().getContent().length);
+      if (kB * 1024 > 65536) {
+        assertNotNull("Content truncation not marked",
+            fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT));
+        assertEquals("Content truncation not marked",
+            Response.TruncatedContentReason.LENGTH.toString().toLowerCase(),
+            fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT_REASON));
+      }
+      server.close(); // need to close server before next loop iteration
+    }
+  }
+
+  /**
+   * Force an exception after all content has been fetched by sending a wrong
+   * `Content-Length` header and check whether the content is stored anyway if
+   * http.partial.truncated == true
+   */
+  @Test
+  public void testPartialContentTruncated() throws Exception {
+    setUp();
+    conf.setBoolean("http.partial.truncated", true);
+    http.setConf(conf);
+    String testContent = "This is a text.";
+    launchServer(
+        responseHeader + "Content-Length: 50000\r\n\r\n" + testContent);
+    ProtocolOutput fetched = fetchPage("/", 200);
+    assertEquals("Content not saved as truncated", testContent,
+        new String(fetched.getContent().getContent(), StandardCharsets.UTF_8));
+    assertNotNull("Content truncation not marked",
+        fetched.getContent().getMetadata().get(Response.TRUNCATED_CONTENT));
+  }
+
+  @Test
+  public void testNoContentLimit() throws Exception {
+    setUp();
+    conf.setInt("http.content.limit", -1);
+    http.setConf(conf);
+    StringBuilder response = new StringBuilder();
+    response.append(responseHeader);
+    // Even 128 kB content shouldn't cause any truncation because
+    // http.content.limit == -1
+    int kB = 128;
+    response.append("Content-Type: text/plain\r\nContent-Length: " + (kB * 1024)
+        + "\r\n\r\n");
+    for (int i = 0; i < kB; i++) {
+      for (int j = 0; j < 16; j++) {
+        // 16 lines of 64 bytes each = 1 kB
+        response.append(
+            "abcdefghijklmnopqurstuvxyz0123456789-ABCDEFGHIJKLMNOPQURSTUVXYZ\n");
+      }
+    }
+    launchServer(response.toString());
+    ProtocolOutput fetched = fetchPage("/", 200);
+    assertEquals("Content truncated although http.content.limit == -1",
+        (kB * 1024), fetched.getContent().getContent().length);
   }
 
 }
diff --git a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
index d276f1c..3650722 100644
--- a/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
+++ b/src/plugin/protocol-okhttp/src/test/org/apache/nutch/protocol/okhttp/TestProtocolOkHttp.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol.okhttp;
 
 import static org.junit.Assert.assertEquals;
@@ -24,10 +23,12 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.okhttp.OkHttp;
+import org.apache.nutch.util.NutchConfiguration;
 import org.junit.After;
 import org.junit.Test;
 import org.mortbay.jetty.Server;
@@ -41,19 +42,21 @@
 public class TestProtocolOkHttp {
   private static final String RES_DIR = System.getProperty("test.data", ".");
 
-  private OkHttp http;
+  private Protocol http;
   private Server server;
   private Context root;
   private Configuration conf;
   private int port;
 
   public void setUp(boolean redirection) throws Exception {
-    conf = new Configuration();
+    conf = NutchConfiguration.create();
     conf.addResource("nutch-default.xml");
+    // plugin tests specific config file - adds protocol-okhttp to
+    // plugin.includes
     conf.addResource("nutch-site-test.xml");
 
-    http = new OkHttp();
-    http.setConf(conf);
+    http = new ProtocolFactory(conf)
+        .getProtocolById("org.apache.nutch.protocol.okhttp.OkHttp");
 
     server = new Server();
 
@@ -124,12 +127,17 @@
   private void fetchPage(String page, int expectedCode) throws Exception {
     URL url = new URL("http", "127.0.0.1", port, page);
     CrawlDatum crawlDatum = new CrawlDatum();
-    Response response = http.getResponse(url, crawlDatum, true);
+
     ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()),
         crawlDatum);
+    int httpStatusCode = -1;
+    if (crawlDatum.getMetaData().containsKey(Nutch.PROTOCOL_STATUS_CODE_KEY)) {
+      httpStatusCode = Integer.parseInt(crawlDatum.getMetaData()
+          .get(Nutch.PROTOCOL_STATUS_CODE_KEY).toString());
+    }
     Content content = out.getContent();
-    assertEquals("HTTP Status Code for " + url, expectedCode,
-        response.getCode());
+
+    assertEquals("HTTP Status Code for " + url, expectedCode, httpStatusCode);
 
     if (page.compareTo("/nonexists.html") != 0
         && page.compareTo("/brokenpage.jsp") != 0
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
index ee98af4..646dfed 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index dd12716..4a20b04 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -295,6 +295,9 @@
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } 
 
     } finally {
diff --git a/src/plugin/publish-rabbitmq/src/java/org/apache/nutch/publisher/rabbitmq/RabbitMQPublisherImpl.java b/src/plugin/publish-rabbitmq/src/java/org/apache/nutch/publisher/rabbitmq/RabbitMQPublisherImpl.java
index e712741..66d18ee 100644
--- a/src/plugin/publish-rabbitmq/src/java/org/apache/nutch/publisher/rabbitmq/RabbitMQPublisherImpl.java
+++ b/src/plugin/publish-rabbitmq/src/java/org/apache/nutch/publisher/rabbitmq/RabbitMQPublisherImpl.java
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.publisher.rabbitmq;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index c016030..0d32e19 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring.depth;
 
 import java.util.Collection;
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
index 5a080be..3991464 100644
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
+++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring.opic;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
index 142c9a5..f49a996 100644
--- a/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
+++ b/src/plugin/scoring-orphan/src/test/org/apache/nutch/scoring/orphan/TestOrphanScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
index f44fabd..57f73c5 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityModel.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
index 8436b87..a97124b 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/SimilarityScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
index 9c8aeb8..a2b94fe 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/CosineSimilarity.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
index 33c3a23..2d57fd7 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/DocVector.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index b1c56d0..ff253fe 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
index 49dc835..902b715 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index 7e4c359..2677f9e 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index d6b6d1a..918af9b 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
index bbda418..c3c629d 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/package-info.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,11 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 /**
- * 
- */
-/**
- * Utility package for Lucene functions
- *
+ * Utility package for Lucene functions.
  */
 package org.apache.nutch.scoring.similarity.util;
diff --git a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
index 3155853..4f3a92c 100644
--- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
+++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.tld;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
index b7f4963..95891dd 100644
--- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
+++ b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring.tld;
 
 import java.util.List;
diff --git a/src/plugin/urlfilter-automaton/ivy.xml b/src/plugin/urlfilter-automaton/ivy.xml
index 7c1968f..5ddf1db 100644
--- a/src/plugin/urlfilter-automaton/ivy.xml
+++ b/src/plugin/urlfilter-automaton/ivy.xml
@@ -36,7 +36,7 @@
   </publications>
 
   <dependencies>
-    <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" />
+    <dependency org="dk.brics" name="automaton" rev="1.12-1" conf="*->default" />
   </dependencies>
   
 </ivy-module>
diff --git a/src/plugin/urlfilter-automaton/plugin.xml b/src/plugin/urlfilter-automaton/plugin.xml
index d0cc1ef..35509fa 100644
--- a/src/plugin/urlfilter-automaton/plugin.xml
+++ b/src/plugin/urlfilter-automaton/plugin.xml
@@ -25,7 +25,7 @@
       <library name="urlfilter-automaton.jar">
          <export name="*"/>
       </library>
-      <library name="automaton-1.11-8.jar"/>
+      <library name="automaton-1.12-1.jar"/>
    </runtime>
 
    <requires>
diff --git a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-automaton/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
 -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
 -http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
 +http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
index 39fce4e..93cbac5 100644
--- a/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
+++ b/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
index a70a6b6..22ffb09 100644
--- a/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
+++ b/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlfilter-fast/README.md b/src/plugin/urlfilter-fast/README.md
new file mode 100644
index 0000000..46b293f
--- /dev/null
+++ b/src/plugin/urlfilter-fast/README.md
@@ -0,0 +1,59 @@
+
+Filters URLs based on a file of regular expressions, using host/domain
+matching first. The default policy is to accept a URL if no matches
+are found.
+
+Rule Format:
+
+```
+Host www.example.org
+  DenyPath /path/to/be/excluded
+  DenyPath /some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+  DenyPath .*
+
+Domain example.org
+  DenyPathQuery /resource/.*?action=exclude
+```
+
+`Host` rules are evaluated before `Domain` rules. For `Host` rules the
+entire host name of a URL must match while the domain names in
+`Domain` rules are considered as matches if the domain is a suffix of
+the host name (consisting of complete host name parts).  Shorter
+domain suffixes are checked first, a single dot "`.`" as "domain name"
+can be used to specify global rules applied to every URL.
+
+E.g., for "www.example.com" the rules given above are looked up in the
+following order:
+
+1. check "www.example.com" whether host-based rules exist and whether one of them matches
+1. check "www.example.com" for domain-based rules
+1. check "example.com" for domain-based rules
+1. check "com" for domain-based rules
+1. check for global rules (domain name is ".")
+
+The first matching rule will reject the URL and no further rules are
+checked.  If no rule matches, the URL is accepted.  URLs without a host
+name (e.g., <code>file:/path/file.txt</code>) are checked for global
+rules only.  URLs which fail to be parsed as
+[java.net.URL](https://docs.oracle.com/javase/8/docs/api/java/net/URL.html)
+are always rejected.
+
+For each rule, either the URL path (`DenyPath`) or the path and query
+(`DenyPathQuery`) is searched for the given [Java regular
+expression](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html);
+the rule matches if the expression is found anywhere in it (see
+[Matcher.find()](https://docs.oracle.com/javase/8/docs/api/java/util/regex/Matcher.html#find--)),
+not only if it matches the URL path (and query) as a whole.
+
+Rules are applied in the order of their definition. For better
+performance, regular expressions which are simpler/faster or match
+more URLs should be defined earlier.
+
+Comments in the rule file start with the `#` character and reach until
+the end of the line.
+
+The rules file is defined via the property `urlfilter.fast.file`,
+the default name is `fast-urlfilter.txt`.
diff --git a/src/plugin/urlfilter-fast/build.xml b/src/plugin/urlfilter-fast/build.xml
new file mode 100644
index 0000000..c22ca6e
--- /dev/null
+++ b/src/plugin/urlfilter-fast/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-fast" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Copy sample files (*.txt, *.urls) to the test data directory for the JUnit tests -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.txt, **/*.urls"/>
+  </copy>
+
+</project>
diff --git a/src/plugin/urlfilter-fast/ivy.xml b/src/plugin/urlfilter-fast/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/src/plugin/urlfilter-fast/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>
diff --git a/src/plugin/urlfilter-fast/plugin.xml b/src/plugin/urlfilter-fast/plugin.xml
new file mode 100644
index 0000000..4e28cb3
--- /dev/null
+++ b/src/plugin/urlfilter-fast/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-fast"
+   name="Fast URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-fast.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.fast"
+              name="Nutch Fast URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="FastURLFilter"
+                      class="org.apache.nutch.urlfilter.fast.FastURLFilter"/>
+   </extension>
+</plugin>
diff --git a/src/plugin/urlfilter-fast/sample/Benchmarks.urls b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
new file mode 100644
index 0000000..6a0e822
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/Benchmarks.urls
@@ -0,0 +1,317 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
new file mode 100644
index 0000000..27a918b
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-benchmark.txt
@@ -0,0 +1,25 @@
+# port of urlfilter-regex benchmarks to urlfilter-fast
+# cf.
+#    src/plugin/urlfilter-regex/sample/Benchmarks.rules
+#    src/plugin/urlfilter-regex/sample/Benchmarks.urls
+
+# skip file:, ftp:, & mailto: urls
+# -^(file|ftp|mailto):
+
+# skip image and other suffixes we can't yet parse
+Domain .
+  DenyPath (?i)\.(?:gif|jpg|ico|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|exe|png)$
+
+# skip URLs containing certain characters as probable queries, etc.
+Domain .
+  DenyPathQuery [?*!@=]
+
+# skip .fr .org and .net domains
+Domain fr
+  DenyPath .*
+Domain org
+  DenyPath .*
+Domain net
+  DenyPath .*
+
+# accept every URL not matched by any rule
diff --git a/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
new file mode 100644
index 0000000..9f26529
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/fast-urlfilter-test.txt
@@ -0,0 +1,19 @@
+Host www.example.org
+  DenyPath ^/path/to/be/excluded
+  DenyPath ^/some/other/path/excluded
+
+# Deny everything from *.example.com and example.com
+Domain example.com
+  DenyPath .*
+
+Domain example.org
+  DenyPathQuery /resource/.*?action=exclude
+
+# exclude images from image server
+Host i.example.org
+  DenyPath (?i)\.jpe?g$
+
+# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
+Domain .
+  DenyPath (/.+?)/.*?\1/.*?\1/
+
diff --git a/src/plugin/urlfilter-fast/sample/test.urls b/src/plugin/urlfilter-fast/sample/test.urls
new file mode 100644
index 0000000..3aa4354
--- /dev/null
+++ b/src/plugin/urlfilter-fast/sample/test.urls
@@ -0,0 +1,21 @@
+-https://www.example.org/path/to/be/excluded
+-https://www.example.org/path/to/be/excluded/continued
+-https://www.example.org/some/other/path/excluded
++https://www.example.org/
++https://www.example.org/%20white%20space%20in%20path%20escaped/
+-https://www1.example.com/
+-https://www2.example.com/
+-https://www.subnet.example.com/
++https://www.examplex.com/
++https://www.example.co.uk/
++https://www.example.com.za/
+-https://www.example.org/resource/put?action=exclude
+-http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
+-http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/
++http://www.nutch.org/abcd/foo1/bar1/zzz1/
+-https://i.example.org/394d46ef76ee5c1bbad1cb98b40dc463d322c94d/c=0-129-2047-1285/635969287686419433-WORLD-40943944.JPG?width=3200&height=1680&fit=crop
+-ftp://ftp.example.com/file1.txt
++ftp://ftp.example.org/file1.txt
++file:/path/file1.txt
++file:///path/file1.txt
+-file:/abcd/foo/bar/xyz/foo/bar/foo/
\ No newline at end of file
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
new file mode 100644
index 0000000..d53a2fd
--- /dev/null
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/FastURLFilter.java
@@ -0,0 +1,315 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import com.google.common.collect.LinkedHashMultimap;
+import com.google.common.collect.Multimap;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.invoke.MethodHandles;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Filters URLs based on a file of regular expressions using host/domains
+ * matching first. The default policy is to accept a URL if no matches are
+ * found.
+ *
+ * Rule Format:
+ * 
+ * <pre>
+ * Host www.example.org
+ *   DenyPath /path/to/be/excluded
+ *   DenyPath /some/other/path/excluded
+ *
+ * # Deny everything from *.example.com and example.com
+ * Domain example.com
+ *   DenyPath .*
+ *
+ * Domain example.org
+ *   DenyPathQuery /resource/.*?action=exclude
+ * </pre>
+ * 
+ * <code>Host</code> rules are evaluated before <code>Domain</code> rules. For
+ * <code>Host</code> rules the entire host name of a URL must match while the
+ * domain names in <code>Domain</code> rules are considered as matches if the
+ * domain is a suffix of the host name (consisting of complete host name parts).
+ * Longer domain suffixes are checked first, a single dot
+ * &quot;<code>.</code>&quot; as &quot;domain name&quot; can be used to specify
+ * global rules applied to every URL.
+ * 
+ * E.g., for "www.example.com" the rules given above are looked up in the
+ * following order:
+ * <ol>
+ * <li>check "www.example.com" whether host-based rules exist and whether one of
+ * them matches</li>
+ * <li>check "www.example.com" for domain-based rules</li>
+ * <li>check "example.com" for domain-based rules</li>
+ * <li>check "com" for domain-based rules</li>
+ * <li>check for global rules (&quot;<code>Domain .</code>&quot;)</li>
+ * </ol>
+ * The first matching rule will reject the URL and no further rules are checked.
+ * If no rule matches, the URL is accepted. URLs without a host name (e.g.,
+ * <code>file:/path/file.txt</code>) are checked for global rules only. URLs
+ * which fail to be parsed as {@link java.net.URL} are always rejected.
+ * 
+ * For rules either the URL path (<code>DenyPath</code>) or path and query
+ * (<code>DenyPathQuery</code>) are checked whether the given
+ * {@link java.util.regex Java Regular expression} is found (see
+ * {@link java.util.regex.Matcher#find()}) in the URL path (and query).
+ * 
+ * Rules are applied in the order of their definition. For better performance,
+ * regular expressions which are simpler/faster or match more URLs should be
+ * defined earlier.
+ * 
+ * Comments in the rule file start with the <code>#</code> character and reach
+ * until the end of the line.
+ * 
+ * The rules file is defined via the property <code>urlfilter.fast.file</code>,
+ * the default name is <code>fast-urlfilter.txt</code>.
+ */
+public class FastURLFilter implements URLFilter {
+
+  protected static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Configuration conf;
+  public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file";
+  private Multimap<String, Rule> hostRules = LinkedHashMultimap.create();
+  private Multimap<String, Rule> domainRules = LinkedHashMultimap.create();
+
+  private static final Pattern CATCH_ALL_RULE = Pattern
+      .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$");
+
+  public FastURLFilter() {}
+
+  FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
+    reloadRules(rules);
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    try {
+      reloadRules();
+    } catch (Exception e) {
+      LOG.error(e.getMessage());
+      throw new RuntimeException(e.getMessage(), e);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Checks a URL against the host rules, then the domain rules for the host
+   * name and each of its suffixes (longer to shorter), finally the global
+   * rules (<code>Domain .</code>). The first matching rule rejects the URL.
+   *
+   * @param url the URL to check
+   * @return <code>url</code> if no rule matches, null if a rule matches or
+   *         <code>url</code> cannot be parsed as {@link java.net.URL}
+   */
+  @Override
+  public String filter(String url) {
+    URL u;
+    try {
+      u = new URL(url);
+    } catch (Exception e) {
+      LOG.debug("Rejected {} because failed to parse as URL: {}", url,
+          e.getMessage());
+      return null;
+    }
+    String hostname = u.getHost();
+
+    // first check for host-specific rules,
+    // then look up domain rules for the complete host name
+    if (matchesAny(hostRules.get(hostname), u)
+        || matchesAny(domainRules.get(hostname), u)) {
+      return null;
+    }
+
+    // check suffixes of host name from longer to shorter:
+    // subdomains, domain, top-level domain
+    int start = 0;
+    int pos;
+    while ((pos = hostname.indexOf('.', start)) != -1) {
+      start = pos + 1;
+      if (matchesAny(domainRules.get(hostname.substring(start)), u)) {
+        return null;
+      }
+    }
+
+    // finally check "global" rules defined for `Domain .`,
+    // the URL is accepted if no reject rule was found
+    return matchesAny(domainRules.get("."), u) ? null : url;
+  }
+
+  /** Returns true if at least one of the given rules matches the URL. */
+  private static boolean matchesAny(Iterable<Rule> rules, URL url) {
+    for (Rule rule : rules) {
+      if (rule.match(url)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  public void reloadRules() throws IOException {
+    String fileRules = conf.get(URLFILTER_FAST_FILE);
+    try (Reader reader = conf.getConfResourceAsReader(fileRules)) {
+      reloadRules(reader);
+    }
+  }
+
+  /**
+   * Replaces all rules by those read from <code>rules</code>. Malformed rule
+   * lines are logged and skipped. A <code>Host</code>/<code>Domain</code> line
+   * without a name is logged and the rules following it are ignored.
+   * @throws IOException if reading from <code>rules</code> fails
+   */
+  private void reloadRules(Reader rules) throws IOException {
+    domainRules.clear();
+    hostRules.clear();
+    BufferedReader reader = new BufferedReader(rules);
+    String current = null;
+    boolean host = false;
+    int lineno = 0;
+    String line;
+    try {
+      while ((line = reader.readLine()) != null) {
+        lineno++;
+        line = line.trim();
+        if (line.indexOf("#") != -1) {
+          // strip comments
+          line = line.substring(0, line.indexOf("#")).trim();
+        }
+        if (StringUtils.isBlank(line)) {
+          continue;
+        }
+        if (line.startsWith("Host") || line.startsWith("Domain")) {
+          host = line.startsWith("Host");
+          String[] fields = line.split("\\s+");
+          // no name given: skip the section instead of failing on fields[1]
+          current = fields.length > 1 ? fields[1] : null;
+          if (current == null) {
+            LOG.warn("Missing host or domain name on line {}: {}", lineno, line);
+          }
+        } else {
+          if (current == null) {
+            continue;
+          }
+          Rule rule = null;
+          try {
+            if (CATCH_ALL_RULE.matcher(line).matches()) {
+              rule = DenyAllRule.getInstance();
+            } else if (line.startsWith("DenyPathQuery")) {
+              rule = new DenyPathQueryRule(line.split("\\s+")[1]);
+            } else if (line.startsWith("DenyPath")) {
+              rule = new DenyPathRule(line.split("\\s+")[1]);
+            } else {
+              LOG.warn("Problem reading rule on line {}: {}", lineno, line);
+              continue;
+            }
+          } catch (Exception e) {
+            LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage());
+            continue;
+          }
+          if (host) {
+            LOG.trace("Adding host rule [{}] [{}]", current, rule);
+            hostRules.put(current, rule);
+          } else {
+            LOG.trace("Adding domain rule [{}] [{}]", current, rule);
+            domainRules.put(current, rule);
+          }
+        }
+      }
+    } catch (IOException e) {
+      LOG.warn("Caught exception while reading rules file at line {}: {}",
+          lineno, e.getMessage());
+      throw e;
+    }
+  }
+
+  public static class Rule {
+    protected Pattern pattern;
+
+    Rule() {}
+
+    public Rule(String regex) {
+      pattern = Pattern.compile(regex);
+    }
+
+    public boolean match(URL url) {
+      return pattern.matcher(url.toString()).find();
+    }
+
+    public String toString() {
+       return pattern.toString();
+    }
+  }
+
+  public static class DenyPathRule extends Rule {
+    public DenyPathRule(String regex) {
+      super(regex);
+    }
+
+    public boolean match(URL url) {
+      String haystack = url.getPath();
+      return pattern.matcher(haystack).find();
+    }
+  }
+
+  /** Rule for <code>DenyPath .*</code> or <code>DenyPath .?</code> */
+  public static class DenyAllRule extends Rule {
+
+    private static Rule instance = new DenyAllRule(".");
+
+    private DenyAllRule(String regex) {
+      super(regex);
+    }
+
+    public static Rule getInstance() {
+      return instance;
+    }
+
+    public boolean match(URL url) {
+      return true;
+    }
+  }
+
+  public static class DenyPathQueryRule extends Rule {
+    public DenyPathQueryRule(String regex) {
+      super(regex);
+    }
+
+    public boolean match(URL url) {
+      String haystack = url.getFile();
+      return pattern.matcher(haystack).find();
+    }
+  }
+}
diff --git a/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
new file mode 100644
index 0000000..d56f948
--- /dev/null
+++ b/src/plugin/urlfilter-fast/src/java/org/apache/nutch/urlfilter/fast/package-info.java
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin that first does fast exact suffix matches on host/domain
+ * names before applying regular expressions to the path component of a URL. See
+ * {@link org.apache.nutch.urlfilter.fast.FastURLFilter} for a description of
+ * the rule format.
+ */
+package org.apache.nutch.urlfilter.fast;
diff --git a/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
new file mode 100644
index 0000000..9609228
--- /dev/null
+++ b/src/plugin/urlfilter-fast/src/test/org/apache/nutch/urlfilter/fast/TestFastURLFilter.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.fast;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TestFastURLFilter extends RegexURLFilterBaseTest {
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new FastURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+      return null;
+    }
+  }
+
+  @Test
+  public void test() {
+    test("fast-urlfilter-test.txt", "test.urls");
+    test("fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+  }
+
+  @Test
+  public void benchmark() {
+    bench(50, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+    bench(100, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+    bench(200, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+    bench(400, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+    bench(800, "fast-urlfilter-benchmark.txt", "Benchmarks.urls");
+  }
+
+}
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index fe04d84..d1d5caa 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlfilter-regex/sample/Benchmarks.urls b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
index 40bf4ee..6a0e822 100644
--- a/src/plugin/urlfilter-regex/sample/Benchmarks.urls
+++ b/src/plugin/urlfilter-regex/sample/Benchmarks.urls
@@ -294,4 +294,24 @@
 -http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
 -http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
 +http://www.tversity.com/
--http://www.aspseek.org/index.php
\ No newline at end of file
+-http://www.aspseek.org/index.php
+-http://www.ilovejesus.com/myhome/holt/coloringbk/eternallife.gif
+-ftp://ftp.pitt.edu/group/student-activities/chess/PGN/Players/morphypg.zip
+-http://www.lib.utexas.edu/maps/africa/botswana_rel95.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/cer_man.jpg
+-http://www.lib.utexas.edu/maps/united_states/greenfield_io_83.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/aly_oli.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Huron.jpg
+-http://history.vineyard.net/mv1887.gif
+-http://www.uscg.mil/history/weblighthouses/hyannisfront.JPG
+-http://www.mytho-fleurs.com/images/vivaces/mentha%20aquatica.JPG
+-http://www.botany.hawaii.edu/faculty/carr/images/ech_cor.jpg
+-http://www.lib.utexas.edu/maps/historical/oakland_1917.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/den_sp1.jpg
+-http://www.lib.utexas.edu/maps/europe/sanmarino.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Brown.jpg
+-http://www.botany.hawaii.edu/faculty/carr/images/del_sub_hab.jpg
+-http://www.houseofmabel.com/programs/html3/LinkSure.zip
+-http://www.lib.utexas.edu/maps/states/california.gif
+-http://www.dot.state.oh.us/maps/CountyImages/Highland.jpg
+-http://www.dot.state.oh.us/maps/CountyImages/Darke.jpg
diff --git a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
index 118cd90..9245a80 100644
--- a/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
+++ b/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
index b86181e..d86c712 100644
--- a/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
+++ b/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index 53e5ce6..df5a5d8 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.urlfilter.suffix;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
index b09ca2f..eecb2b2 100644
--- a/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
index 3e3b8bc..e34e087 100644
--- a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
+++ b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer.urlmeta;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
index 8c9efac..1b179ba 100644
--- a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
+++ b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.scoring.urlmeta;
 
 import java.util.Collection;
diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
index 7d1d3f0..ae860b6 100644
--- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.ajax;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
index b8addf0..5a13879 100644
--- a/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/test/org/apache/nutch/net/urlnormalizer/ajax/TestAjaxURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.ajax;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 6a33690..d0e8f5f 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.basic;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 1829097..102b10c 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.basic;
 
 import java.net.MalformedURLException;
diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index 86fea1b..ef83284 100644
--- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
index 03d510c..717471c 100644
--- a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
+++ b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.pass;
 
 import java.net.MalformedURLException;
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index e72b0d2..12ecbf4 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
index dbaf4d2..60ec55e 100644
--- a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
+++ b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
index 7626f60..11048c3 100644
--- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
+++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.regex;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
index 2afbd75..9c1bf99 100644
--- a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
+++ b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.net.urlnormalizer.regex;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index ae094aa..8d05f5e 100644
--- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java b/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
index c0d6789..5d36fe9 100644
--- a/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
+++ b/src/test/org/apache/nutch/crawl/ContinuousCrawlTestUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
index 74c54d5..544d622 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateTestDriver.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
index 1d1f1e7..1417adf 100644
--- a/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
+++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/crawl/DummyWritable.java b/src/test/org/apache/nutch/crawl/DummyWritable.java
index 94c27b5..d7d0d0a 100644
--- a/src/test/org/apache/nutch/crawl/DummyWritable.java
+++ b/src/test/org/apache/nutch/crawl/DummyWritable.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.io.IntWritable;
diff --git a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
index d16c6bd..4150088 100644
--- a/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
index db08380..82fefaf 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbFilter.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
index 7188203..622fc98 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
index ed25fd6..375e331 100644
--- a/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
+++ b/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java b/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
index de9e4ac..14fd05e 100644
--- a/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
+++ b/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.crawl;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
index 256723b..95a5b41 100644
--- a/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
+++ b/src/test/org/apache/nutch/indexer/TestIndexerMapReduce.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.indexer;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/metadata/TestMetadata.java b/src/test/org/apache/nutch/metadata/TestMetadata.java
index f3a320d..0804e3e 100644
--- a/src/test/org/apache/nutch/metadata/TestMetadata.java
+++ b/src/test/org/apache/nutch/metadata/TestMetadata.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java b/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
index ae73ae1..1ec52de 100644
--- a/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
+++ b/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java b/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
index 1f2c833..a0515c3 100644
--- a/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
+++ b/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.nutch.parse.Outlink;
diff --git a/src/test/org/apache/nutch/parse/TestOutlinks.java b/src/test/org/apache/nutch/parse/TestOutlinks.java
index 499a82d..78c051e 100644
--- a/src/test/org/apache/nutch/parse/TestOutlinks.java
+++ b/src/test/org/apache/nutch/parse/TestOutlinks.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.junit.Test;
diff --git a/src/test/org/apache/nutch/parse/TestParseData.java b/src/test/org/apache/nutch/parse/TestParseData.java
index 550a260..0dbbf78 100644
--- a/src/test/org/apache/nutch/parse/TestParseData.java
+++ b/src/test/org/apache/nutch/parse/TestParseData.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.nutch.util.WritableTestUtils;
diff --git a/src/test/org/apache/nutch/parse/TestParseText.java b/src/test/org/apache/nutch/parse/TestParseText.java
index 241b293..3873632 100644
--- a/src/test/org/apache/nutch/parse/TestParseText.java
+++ b/src/test/org/apache/nutch/parse/TestParseText.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 import org.apache.nutch.util.WritableTestUtils;
diff --git a/src/test/org/apache/nutch/parse/TestParserFactory.java b/src/test/org/apache/nutch/parse/TestParserFactory.java
index 00c524e..c996ef7 100644
--- a/src/test/org/apache/nutch/parse/TestParserFactory.java
+++ b/src/test/org/apache/nutch/parse/TestParserFactory.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.parse;
 
 // Nutch imports
diff --git a/src/test/org/apache/nutch/plugin/HelloWorldExtension.java b/src/test/org/apache/nutch/plugin/HelloWorldExtension.java
index fa564c4..451815d 100644
--- a/src/test/org/apache/nutch/plugin/HelloWorldExtension.java
+++ b/src/test/org/apache/nutch/plugin/HelloWorldExtension.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.plugin;
 
 /**
diff --git a/src/test/org/apache/nutch/plugin/ITestExtension.java b/src/test/org/apache/nutch/plugin/ITestExtension.java
index b6aa81d..cf1729d 100644
--- a/src/test/org/apache/nutch/plugin/ITestExtension.java
+++ b/src/test/org/apache/nutch/plugin/ITestExtension.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java b/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
index 080142d..5eb3450 100644
--- a/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
+++ b/src/test/org/apache/nutch/plugin/SimpleTestPlugin.java
@@ -1,5 +1,4 @@
 /*
-/**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -15,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.plugin;
 
 import org.apache.hadoop.conf.Configuration;
diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
index f757e0e..dba7c66 100644
--- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java
+++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java
@@ -1,5 +1,4 @@
 /*
- /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
diff --git a/src/test/org/apache/nutch/protocol/TestContent.java b/src/test/org/apache/nutch/protocol/TestContent.java
index 1475cda..e6a2a0e 100644
--- a/src/test/org/apache/nutch/protocol/TestContent.java
+++ b/src/test/org/apache/nutch/protocol/TestContent.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.protocol;
 
 import org.apache.nutch.metadata.Metadata;
diff --git a/src/test/org/apache/nutch/service/TestNutchServer.java b/src/test/org/apache/nutch/service/TestNutchServer.java
index 4d42f7b..811285f 100644
--- a/src/test/org/apache/nutch/service/TestNutchServer.java
+++ b/src/test/org/apache/nutch/service/TestNutchServer.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.service;
 
 import java.lang.invoke.MethodHandles;
diff --git a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
index 1429925..d49b993 100644
--- a/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
+++ b/src/test/org/apache/nutch/tools/TestCommonCrawlDataDumper.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.tools;
 
 //Junit imports
diff --git a/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java b/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
index b4771d0..551a952 100644
--- a/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/AbstractTestbedHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
+
 import java.io.IOException;
 
 import javax.servlet.ServletException;
diff --git a/src/test/org/apache/nutch/tools/proxy/DelayHandler.java b/src/test/org/apache/nutch/tools/proxy/DelayHandler.java
index 58f1f43..1de71b5 100644
--- a/src/test/org/apache/nutch/tools/proxy/DelayHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/DelayHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.io.IOException;
 import java.util.Random;
diff --git a/src/test/org/apache/nutch/tools/proxy/FakeHandler.java b/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
index a40b199..915f1fe 100644
--- a/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/FakeHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.io.IOException;
 import java.io.OutputStream;
diff --git a/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java b/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java
index 7db5b02..2b61953 100644
--- a/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/LogDebugHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.lang.invoke.MethodHandles;
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java b/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java
index ef439a6..75318db 100644
--- a/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/NotFoundHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.io.IOException;
 
diff --git a/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java b/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java
index a6b2d8e..48f0fe9 100644
--- a/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java
+++ b/src/test/org/apache/nutch/tools/proxy/ProxyTestbed.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.lang.invoke.MethodHandles;
 import java.util.Arrays;
diff --git a/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java b/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
index 3f4fb94..37821d5 100644
--- a/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
+++ b/src/test/org/apache/nutch/tools/proxy/SegmentHandler.java
@@ -1,5 +1,3 @@
-package org.apache.nutch.tools.proxy;
-
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -16,6 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+package org.apache.nutch.tools.proxy;
 
 import java.lang.invoke.MethodHandles;
 import java.io.Closeable;
diff --git a/src/test/org/apache/nutch/util/DumpFileUtilTest.java b/src/test/org/apache/nutch/util/DumpFileUtilTest.java
index 03caa48..249d978 100644
--- a/src/test/org/apache/nutch/util/DumpFileUtilTest.java
+++ b/src/test/org/apache/nutch/util/DumpFileUtilTest.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.junit.Test;
diff --git a/src/test/org/apache/nutch/util/TestGZIPUtils.java b/src/test/org/apache/nutch/util/TestGZIPUtils.java
index a3d4610..fcda06c 100644
--- a/src/test/org/apache/nutch/util/TestGZIPUtils.java
+++ b/src/test/org/apache/nutch/util/TestGZIPUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.IOException;
diff --git a/src/test/org/apache/nutch/util/TestMimeUtil.java b/src/test/org/apache/nutch/util/TestMimeUtil.java
index 72a42b4..6ebe766 100644
--- a/src/test/org/apache/nutch/util/TestMimeUtil.java
+++ b/src/test/org/apache/nutch/util/TestMimeUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.File;
diff --git a/src/test/org/apache/nutch/util/TestNodeWalker.java b/src/test/org/apache/nutch/util/TestNodeWalker.java
index 8edf5ab..066bf1a 100644
--- a/src/test/org/apache/nutch/util/TestNodeWalker.java
+++ b/src/test/org/apache/nutch/util/TestNodeWalker.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.io.ByteArrayInputStream;
diff --git a/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java b/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
index 9d8b07b..f86070d 100644
--- a/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
+++ b/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.junit.Assert;
diff --git a/src/test/org/apache/nutch/util/TestStringUtil.java b/src/test/org/apache/nutch/util/TestStringUtil.java
index df021f0..d9398df 100644
--- a/src/test/org/apache/nutch/util/TestStringUtil.java
+++ b/src/test/org/apache/nutch/util/TestStringUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.junit.Assert;
diff --git a/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java b/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
index f2e8a5c..104907c 100644
--- a/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
+++ b/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.junit.Assert;
diff --git a/src/test/org/apache/nutch/util/TestTableUtil.java b/src/test/org/apache/nutch/util/TestTableUtil.java
index fb07556..1f5512f 100644
--- a/src/test/org/apache/nutch/util/TestTableUtil.java
+++ b/src/test/org/apache/nutch/util/TestTableUtil.java
@@ -1,19 +1,19 @@
-/*******************************************************************************
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
- ******************************************************************************/
+ */
 package org.apache.nutch.util;
 
 import org.apache.nutch.util.TableUtil;
diff --git a/src/test/org/apache/nutch/util/TestURLUtil.java b/src/test/org/apache/nutch/util/TestURLUtil.java
index b1fdd5b..eaaf7d0 100644
--- a/src/test/org/apache/nutch/util/TestURLUtil.java
+++ b/src/test/org/apache/nutch/util/TestURLUtil.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import java.net.URL;
diff --git a/src/test/org/apache/nutch/util/WritableTestUtils.java b/src/test/org/apache/nutch/util/WritableTestUtils.java
index 0822603..3da2226 100644
--- a/src/test/org/apache/nutch/util/WritableTestUtils.java
+++ b/src/test/org/apache/nutch/util/WritableTestUtils.java
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -14,7 +14,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 package org.apache.nutch.util;
 
 import org.apache.hadoop.io.*;