Merge pull request #520 from sebastian-nagel/NUTCH-2743

NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation
diff --git a/build.xml b/build.xml
index 8547d2b..5eb157e 100644
--- a/build.xml
+++ b/build.xml
@@ -209,6 +209,7 @@
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -717,6 +718,7 @@
       <packageset dir="${plugins.dir}/parse-swf/src/java"/>
       <packageset dir="${plugins.dir}/parse-tika/src/java"/>
       <packageset dir="${plugins.dir}/parse-zip/src/java"/>
+      <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
       <packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
       <packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -1127,6 +1129,7 @@
         <source path="${plugins.dir}/parse-tika/src/test/" />
         <source path="${plugins.dir}/parse-zip/src/java/" />
         <source path="${plugins.dir}/parse-zip/src/test/" />
+        <source path="${plugins.dir}/parsefilter-debug/src/java/" />
         <source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
         <source path="${plugins.dir}/parsefilter-regex/src/java/" />
         <source path="${plugins.dir}/parsefilter-regex/src/test/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 8c25091..b833288 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1221,6 +1221,30 @@
   </description>
 </property>
 
+<property>
+  <name>fetcher.redirect.dedupcache.seconds</name>
+  <value>-1</value>
+  <description>
+    The maximum time in seconds the fetcher will cache redirects for
+    deduplication.  If the same redirect URL is seen again within
+    this time, it is skipped. This avoids pathological cases
+    where many or most of the URLs of a host are redirected to the
+    same URL, e.g. a login page, a cookie consent page or an error
+    page.  A value less than or equal to zero disables redirect deduplication.
+    Caveat: This may break setting cookies via recursive redirect chains.
+  </description>
+</property>
+
+<property>
+  <name>fetcher.redirect.dedupcache.size</name>
+  <value>1000</value>
+  <description>
+    The maximum number of entries (redirect targets) in the cache used to
+    deduplicate redirects, see `fetcher.redirect.dedupcache.seconds`.
+  </description>
+</property>
+
+
 <!-- SegmentReader -->
 <property>
   <name>segment.reader.content.recode</name>
diff --git a/default.properties b/default.properties
index f96c36b..e96c555 100644
--- a/default.properties
+++ b/default.properties
@@ -153,6 +153,7 @@
 # Parse Filter Plugins
 #
 plugins.parsefilter=\
+   org.apache.nutch.parsefilter.debug*:\
    org.apache.nutch.parse.headings*:\
    org.apache.nutch.parsefilter.naivebayes*:\
    org.apache.nutch.parsefilter.regex*:\
diff --git a/src/bin/crawl b/src/bin/crawl
index 331ee65..9b77ce4 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,8 +370,8 @@
   echo "CrawlDB update"
   __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT
 
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
+  # note that the link inversion - indexing routine can be done within the main loop
+  # on a per segment basis
   echo "Link inversion"
   __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
 
@@ -380,10 +380,7 @@
 
   if $INDEXFLAG; then
       echo "Indexing $SEGMENT to index"
-      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
-      echo "Cleaning up index if possible"
-      __bin_nutch clean "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
   else
       echo "Skipping indexing ..."
   fi
diff --git a/src/bin/nutch b/src/bin/nutch
index 3a25738..244d812 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -103,6 +103,7 @@
   echo "  updatehostdb      update the host db with records from the crawl db"
   echo "  readhostdb        read / dump host db"
   echo "  sitemap           perform Sitemap processing"
+  echo "  showproperties    print Nutch/Hadoop configuration properties to stdout"
   echo " or"
   echo "  CLASSNAME         run the class named CLASSNAME"
   echo "Most commands print help when invoked w/o parameters."
@@ -311,6 +312,8 @@
   CLASS=org.apache.nutch.hostdb.ReadHostDb
 elif [ "$COMMAND" = "sitemap" ] ; then
   CLASS=org.apache.nutch.util.SitemapProcessor
+elif [ "$COMMAND" = "showproperties" ] ; then
+  CLASS=org.apache.nutch.tools.ShowProperties
 else
   CLASS=$COMMAND
 fi
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 3c1003e..ce7b2b6 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -22,6 +22,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.hadoop.conf.Configuration;
@@ -30,9 +31,13 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Optional;
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+
 /**
- * Convenience class - a collection of queues that keeps track of the total
- * number of items, and provides items eligible for fetching from any queue.
+ * A collection of queues that keeps track of the total number of items, and
+ * provides items eligible for fetching from any queue.
  */
 public class FetchItemQueues {
 
@@ -44,6 +49,8 @@
   private Set<String> queuesMaxExceptions = new HashSet<>();
   Iterator<Map.Entry<String, FetchItemQueue>> lastIterator = null;
   AtomicInteger totalSize = new AtomicInteger(0);
+  Cache<Text, Optional<String>> redirectDedupCache = null;
+
   int maxThreads;
   long crawlDelay;
   long minCrawlDelay;
@@ -77,6 +84,16 @@
     this.timelimit = conf.getLong("fetcher.timelimit", -1);
     this.maxExceptionsPerQueue = conf.getInt(
         "fetcher.max.exceptions.per.queue", -1);
+
+    int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
+        -1);
+    int dedupRedirMaxSize = conf.getInt("fetcher.redirect.dedupcache.size",
+        1000);
+    if (dedupRedirMaxTime > 0 && dedupRedirMaxSize > 0) {
+      redirectDedupCache = CacheBuilder.newBuilder()
+          .maximumSize(dedupRedirMaxSize)
+          .expireAfterWrite(dedupRedirMaxTime, TimeUnit.SECONDS).build();
+    }
   }
 
   /**
@@ -246,6 +263,22 @@
     return 0;
   }
 
+  /**
+   * Checks whether a redirect target was seen recently and, if not, records
+   * it. Lookup and insert are a single atomic step, so two threads racing on
+   * the same target cannot both be told that it is new.
+   *
+   * @param redirUrl
+   *          redirect target
+   * @return true if redirects are deduplicated and redirUrl has been queued
+   *         recently
+   */
+  public boolean redirectIsQueuedRecently(Text redirUrl) {
+    // putIfAbsent yields the previous value: non-null means already cached
+    return redirectDedupCache != null && redirectDedupCache.asMap()
+        .putIfAbsent(redirUrl, Optional.absent()) != null;
+  }
+
   public synchronized void dump() {
     for (String id : queues.keySet()) {
       FetchItemQueue fiq = queues.get(id);
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index bc0d639..6cd1772 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -457,6 +457,8 @@
 
             if (redirecting && redirectCount > maxRedirect) {
               fetchQueues.finishFetchItem(fit);
+              context.getCounter("FetcherStatus", "redirect_count_exceeded")
+                  .increment(1);
               if (LOG.isInfoEnabled()) {
                 LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
                     Thread.currentThread().getId(), fit.url,
@@ -592,6 +594,13 @@
 
   private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
       throws ScoringFilterException {
+    if (fetchQueues.redirectIsQueuedRecently(redirUrl)) {
+      redirecting = false;
+      context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1);
+      LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
+          redirUrl);
+      return null;
+    }
     CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
     fit = FetchItem.create(redirUrl, newDatum, queueMode);
     if (fit != null) {
diff --git a/src/java/org/apache/nutch/tools/ShowProperties.java b/src/java/org/apache/nutch/tools/ShowProperties.java
new file mode 100644
index 0000000..d7058d6
--- /dev/null
+++ b/src/java/org/apache/nutch/tools/ShowProperties.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Tool to list properties and their values set by the current Nutch
+ * configuration.
+ */
+public class ShowProperties extends Configured implements Tool {
+
+  private String format3cols = "%-32s  %24s  %20s";
+
+  /** Prints all properties, sorted by name, as a table to stdout. */
+  @Override
+  public int run(String[] arg0) {
+    Configuration conf = getConf();
+    List<Entry<String, String>> list = new ArrayList<>();
+    conf.iterator().forEachRemaining(list::add);
+    Collections.sort(list, Entry.comparingByKey());
+    System.out.println(
+        String.format(format3cols, "conf.name", "conf.value", "substituted.value"));
+    System.out.println(
+        "================================================================================");
+    for (Entry<String, String> e : list) {
+      String key = e.getKey();
+      String val = e.getValue();
+      String substitutedVal = conf.get(key);
+      boolean substituted = !val.equals(substitutedVal);
+      // right-align the value within the remaining columns; clamp the width
+      // to 1 since "%0s" (key fills the whole line) is an invalid format
+      int width = Math.max(1, (substituted ? 60 : 80) - 2 - key.length());
+      String row = String.format("%s  %" + width + "s", key, val);
+      if (substituted) {
+        row += String.format("  %18s", substitutedVal);
+      }
+      System.out.println(row);
+    }
+    return 0;
+  }
+
+  public static void main(String[] args) throws Exception {
+    System.exit(ToolRunner.run(NutchConfiguration.create(),
+        new ShowProperties(), args));
+  }
+
+}
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index 2461286..d0bfafd 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -22,7 +22,9 @@
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
 import java.lang.invoke.MethodHandles;
+import java.nio.charset.StandardCharsets;
 
+import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerException;
@@ -33,6 +35,7 @@
 import org.apache.xerces.parsers.DOMParser;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
+import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
@@ -91,16 +94,12 @@
     try {
       transformer = transFactory.newTransformer();
       transformer.setOutputProperty("indent", "yes");
+      transformer.setOutputProperty(OutputKeys.ENCODING,
+          StandardCharsets.UTF_8.name());
       StreamResult result = new StreamResult(os);
       transformer.transform(source, result);
       os.flush();
-    } catch (UnsupportedEncodingException e1) {
-      LOG.error("Error: ", e1);
-    } catch (IOException e1) {
-      LOG.error("Error: ", e1);
-    } catch (TransformerConfigurationException e2) {
-      LOG.error("Error: ", e2);
-    } catch (TransformerException ex) {
+    } catch (IOException | TransformerException ex) {
       LOG.error("Error: ", ex);
     }
   }
@@ -108,7 +107,16 @@
   public static void saveDom(OutputStream os, DocumentFragment doc) {
     NodeList docChildren = doc.getChildNodes();
     for (int i = 0; i < docChildren.getLength(); i++) {
-      saveDom(os, (Element) docChildren.item(i));
+      Node child = docChildren.item(i);
+      if (child instanceof Element) {
+        saveDom(os, (Element) child);
+      } else {
+        try {
+          os.write(child.toString().getBytes(StandardCharsets.UTF_8));
+        } catch (IOException ex) {
+          LOG.error("Error: ", ex);
+        }
+      }
     }
   }
 }
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d722b1a..a2a0dd7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -67,6 +67,7 @@
     <ant dir="parse-swf" target="deploy"/>
     <ant dir="parse-tika" target="deploy"/>
     <ant dir="parse-zip" target="deploy"/>
+    <ant dir="parsefilter-debug" target="deploy"/>
     <ant dir="parsefilter-naivebayes" target="deploy"/>
     <ant dir="parsefilter-regex" target="deploy"/>
     <ant dir="protocol-file" target="deploy"/>
@@ -212,6 +213,7 @@
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>
+    <ant dir="parsefilter-debug" target="clean" />
     <ant dir="parsefilter-naivebayes" target="clean" />
     <ant dir="parsefilter-regex" target="clean"/>
     <ant dir="protocol-file" target="clean"/>
diff --git a/src/plugin/parsefilter-debug/build.xml b/src/plugin/parsefilter-debug/build.xml
new file mode 100644
index 0000000..1f175e4
--- /dev/null
+++ b/src/plugin/parsefilter-debug/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-debug" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/parsefilter-debug/ivy.xml b/src/plugin/parsefilter-debug/ivy.xml
new file mode 100644
index 0000000..dac80e6
--- /dev/null
+++ b/src/plugin/parsefilter-debug/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+</ivy-module>
diff --git a/src/plugin/parsefilter-debug/plugin.xml b/src/plugin/parsefilter-debug/plugin.xml
new file mode 100644
index 0000000..bc4a574
--- /dev/null
+++ b/src/plugin/parsefilter-debug/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="parsefilter-debug"
+   name="Debugging Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="parsefilter-debug.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <!-- Registers DebugParseFilter at the HtmlParseFilter extension point. -->
+   <extension id="org.apache.nutch.htmlparsefilter.debug"
+        name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="DebugParseFilter"
+                      class="org.apache.nutch.parsefilter.debug.DebugParseFilter"/>
+   </extension>
+
+</plugin>
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
new file mode 100644
index 0000000..691f894
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.debug;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DomUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Debugging parse filter: stores the serialized DOM in the parse metadata
+ * (key "DOM") to show how a parser interprets a document (not only HTML).
+ */
+public class DebugParseFilter implements HtmlParseFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
+  private Configuration conf;
+
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    DomUtil.saveDom(buffer, doc);
+    String dom = new String(buffer.toByteArray(), StandardCharsets.UTF_8);
+    LOG.debug(dom);
+    Parse pageParse = parseResult.get(content.getUrl());
+    pageParse.getData().getParseMeta().set("DOM", dom);
+    return parseResult;
+  }
+
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+}
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
new file mode 100644
index 0000000..bbc24dd
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+package org.apache.nutch.parsefilter.debug;
+