Merge pull request #520 from sebastian-nagel/NUTCH-2743
NUTCH-2743 Add list of Nutch properties (nutch-default.xml) to documentation
diff --git a/build.xml b/build.xml
index 8547d2b..5eb157e 100644
--- a/build.xml
+++ b/build.xml
@@ -209,6 +209,7 @@
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
+ <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -717,6 +718,7 @@
<packageset dir="${plugins.dir}/parse-swf/src/java"/>
<packageset dir="${plugins.dir}/parse-tika/src/java"/>
<packageset dir="${plugins.dir}/parse-zip/src/java"/>
+ <packageset dir="${plugins.dir}/parsefilter-debug/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-naivebayes/src/java"/>
<packageset dir="${plugins.dir}/parsefilter-regex/src/java"/>
<packageset dir="${plugins.dir}/protocol-file/src/java"/>
@@ -1127,6 +1129,7 @@
<source path="${plugins.dir}/parse-tika/src/test/" />
<source path="${plugins.dir}/parse-zip/src/java/" />
<source path="${plugins.dir}/parse-zip/src/test/" />
+ <source path="${plugins.dir}/parsefilter-debug/src/java/" />
<source path="${plugins.dir}/parsefilter-naivebayes/src/java/" />
<source path="${plugins.dir}/parsefilter-regex/src/java/" />
<source path="${plugins.dir}/parsefilter-regex/src/test/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 8c25091..b833288 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1221,6 +1221,30 @@
</description>
</property>
+<property>
+ <name>fetcher.redirect.dedupcache.seconds</name>
+ <value>-1</value>
+ <description>
+ The maximum time in seconds the fetcher will cache redirects for
+ deduplication. If the same redirect URL is seen again within
+ this time, it is skipped. This avoids pathological cases
+ where many or most of the URLs of a host are redirected to the
+ same URL, e.g. a page to log in, to accept cookies, or indicating an
+ error. A value less than or equal to zero disables redirect deduplication.
+ Caveat: This may break setting cookies via recursive redirect chains.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.size</name>
+ <value>1000</value>
+ <description>
+ The maximum size of the cache to deduplicate redirects,
+ see `fetcher.redirect.dedupcache.seconds`.
+ </description>
+</property>
+
+
<!-- SegmentReader -->
<property>
<name>segment.reader.content.recode</name>
diff --git a/default.properties b/default.properties
index f96c36b..e96c555 100644
--- a/default.properties
+++ b/default.properties
@@ -153,6 +153,7 @@
# Parse Filter Plugins
#
plugins.parsefilter=\
+ org.apache.nutch.parsefilter.debug*:\
org.apache.nutch.parse.headings*:\
org.apache.nutch.parsefilter.naivebayes*:\
org.apache.nutch.parsefilter.regex*:\
diff --git a/src/bin/crawl b/src/bin/crawl
index 331ee65..9b77ce4 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,8 +370,8 @@
echo "CrawlDB update"
__bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
-# note that the link inversion - indexing routine can be done within the main loop
-# on a per segment basis
+ # note that the link inversion - indexing routine can be done within the main loop
+ # on a per segment basis
echo "Link inversion"
__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
@@ -380,10 +380,7 @@
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"
- __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
-
- echo "Cleaning up index if possible"
- __bin_nutch clean "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+ __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
else
echo "Skipping indexing ..."
fi
diff --git a/src/bin/nutch b/src/bin/nutch
index 3a25738..244d812 100755
--- a/src/bin/nutch
+++ b/src/bin/nutch
@@ -103,6 +103,7 @@
echo " updatehostdb update the host db with records from the crawl db"
echo " readhostdb read / dump host db"
echo " sitemap perform Sitemap processing"
+ echo " showproperties print Nutch/Hadoop configuration properties to stdout"
echo " or"
echo " CLASSNAME run the class named CLASSNAME"
echo "Most commands print help when invoked w/o parameters."
@@ -311,6 +312,8 @@
CLASS=org.apache.nutch.hostdb.ReadHostDb
elif [ "$COMMAND" = "sitemap" ] ; then
CLASS=org.apache.nutch.util.SitemapProcessor
+elif [ "$COMMAND" = "showproperties" ] ; then
+ CLASS=org.apache.nutch.tools.ShowProperties
else
CLASS=$COMMAND
fi
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index 3c1003e..ce7b2b6 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -22,6 +22,7 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
@@ -30,9 +31,13 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.google.common.base.Optional;
+import com.google.common.cache.Cache;
+import com.google.common.cache.CacheBuilder;
+
/**
- * Convenience class - a collection of queues that keeps track of the total
- * number of items, and provides items eligible for fetching from any queue.
+ * A collection of queues that keeps track of the total number of items, and
+ * provides items eligible for fetching from any queue.
*/
public class FetchItemQueues {
@@ -44,6 +49,8 @@
private Set<String> queuesMaxExceptions = new HashSet<>();
Iterator<Map.Entry<String, FetchItemQueue>> lastIterator = null;
AtomicInteger totalSize = new AtomicInteger(0);
+ Cache<Text, Optional<String>> redirectDedupCache = null;
+
int maxThreads;
long crawlDelay;
long minCrawlDelay;
@@ -77,6 +84,16 @@
this.timelimit = conf.getLong("fetcher.timelimit", -1);
this.maxExceptionsPerQueue = conf.getInt(
"fetcher.max.exceptions.per.queue", -1);
+
+ int dedupRedirMaxTime = conf.getInt("fetcher.redirect.dedupcache.seconds",
+ -1);
+ int dedupRedirMaxSize = conf.getInt("fetcher.redirect.dedupcache.size",
+ 1000);
+ if (dedupRedirMaxTime > 0 && dedupRedirMaxSize > 0) {
+ redirectDedupCache = CacheBuilder.newBuilder()
+ .maximumSize(dedupRedirMaxSize)
+ .expireAfterWrite(dedupRedirMaxTime, TimeUnit.SECONDS).build();
+ }
}
/**
@@ -246,6 +263,22 @@
return 0;
}
+ /**
+ * Looks up redirUrl in the redirect dedup cache; if absent, it is added.
+ *
+ * @param redirUrl redirect target
+ * @return true if redirects are deduplicated and redirUrl is still cached
+ */
+ public boolean redirectIsQueuedRecently(Text redirUrl) {
+ if (redirectDedupCache != null) {
+ if (redirectDedupCache.getIfPresent(redirUrl) != null) {
+ return true;
+ }
+ redirectDedupCache.put(redirUrl, Optional.absent());
+ }
+ return false;
+ }
+
public synchronized void dump() {
for (String id : queues.keySet()) {
FetchItemQueue fiq = queues.get(id);
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThread.java b/src/java/org/apache/nutch/fetcher/FetcherThread.java
index bc0d639..6cd1772 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThread.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThread.java
@@ -457,6 +457,8 @@
if (redirecting && redirectCount > maxRedirect) {
fetchQueues.finishFetchItem(fit);
+ context.getCounter("FetcherStatus", "redirect_count_exceeded")
+ .increment(1);
if (LOG.isInfoEnabled()) {
LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
Thread.currentThread().getId(), fit.url,
@@ -592,6 +594,13 @@
private FetchItem queueRedirect(Text redirUrl, FetchItem fit)
throws ScoringFilterException {
+ if (fetchQueues.redirectIsQueuedRecently(redirUrl)) {
+ redirecting = false;
+ context.getCounter("FetcherStatus", "redirect_deduplicated").increment(1);
+ LOG.debug(" - ignoring redirect from {} to {} as duplicate", fit.url,
+ redirUrl);
+ return null;
+ }
CrawlDatum newDatum = createRedirDatum(redirUrl, fit, CrawlDatum.STATUS_DB_UNFETCHED);
fit = FetchItem.create(redirUrl, newDatum, queueMode);
if (fit != null) {
diff --git a/src/java/org/apache/nutch/tools/ShowProperties.java b/src/java/org/apache/nutch/tools/ShowProperties.java
new file mode 100644
index 0000000..d7058d6
--- /dev/null
+++ b/src/java/org/apache/nutch/tools/ShowProperties.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.tools;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+/**
+ * Tool to list properties and their values set by the current Nutch
+ * configuration
+ */
+public class ShowProperties extends Configured implements Tool {
+
+ private String format3cols = "%-32s %24s %20s";
+
+ /** Prints all configuration properties, sorted by name, to stdout; always returns 0. */
+ @Override
+ public int run(String[] arg0) {
+ Configuration conf = getConf();
+ List<Entry<String, String>> list = new ArrayList<>();
+ conf.iterator().forEachRemaining(list::add);
+ Collections.sort(list, (a, b) -> a.getKey().compareTo(b.getKey()));
+ System.out.println(
+ String.format(format3cols, "conf.name", "conf.value", "substituted.value"));
+ System.out.println(
+ "================================================================================");
+ for (Entry<String, String> e : list) {
+ String key = e.getKey();
+ String val = e.getValue();
+ String substitutedVal = conf.get(key);
+ // Right-align the value in the rest of the row; the width is clamped to 1
+ // because a computed width of 0 would yield the invalid specifier "%0s".
+ if (val.equals(substitutedVal)) {
+ int width = Math.max(1, 80 - 2 - key.length());
+ System.out.println(String.format("%s %" + width + "s", key, val));
+ } else {
+ int width = Math.max(1, 60 - 2 - key.length());
+ System.out.println(String.format("%s %" + width + "s %18s", key, val, substitutedVal));
+ }
+ }
+ return 0;
+ }
+
+ public static void main(String[] args) throws Exception {
+ System.exit(ToolRunner.run(NutchConfiguration.create(),
+ new ShowProperties(), args));
+ }
+
+}
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index 2461286..d0bfafd 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -22,7 +22,9 @@
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
+import java.nio.charset.StandardCharsets;
+import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
@@ -33,6 +35,7 @@
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
+import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -91,16 +94,12 @@
try {
transformer = transFactory.newTransformer();
transformer.setOutputProperty("indent", "yes");
+ transformer.setOutputProperty(OutputKeys.ENCODING,
+ StandardCharsets.UTF_8.name());
StreamResult result = new StreamResult(os);
transformer.transform(source, result);
os.flush();
- } catch (UnsupportedEncodingException e1) {
- LOG.error("Error: ", e1);
- } catch (IOException e1) {
- LOG.error("Error: ", e1);
- } catch (TransformerConfigurationException e2) {
- LOG.error("Error: ", e2);
- } catch (TransformerException ex) {
+ } catch (IOException | TransformerException ex) {
LOG.error("Error: ", ex);
}
}
@@ -108,7 +107,16 @@
public static void saveDom(OutputStream os, DocumentFragment doc) {
NodeList docChildren = doc.getChildNodes();
for (int i = 0; i < docChildren.getLength(); i++) {
- saveDom(os, (Element) docChildren.item(i));
+ Node child = docChildren.item(i);
+ if (child instanceof Element) {
+ saveDom(os, (Element) child);
+ } else {
+ try {
+ os.write(child.toString().getBytes(StandardCharsets.UTF_8));
+ } catch (IOException ex) {
+ LOG.error("Error: ", ex);
+ }
+ }
}
}
}
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index d722b1a..a2a0dd7 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -67,6 +67,7 @@
<ant dir="parse-swf" target="deploy"/>
<ant dir="parse-tika" target="deploy"/>
<ant dir="parse-zip" target="deploy"/>
+ <ant dir="parsefilter-debug" target="deploy"/>
<ant dir="parsefilter-naivebayes" target="deploy"/>
<ant dir="parsefilter-regex" target="deploy"/>
<ant dir="protocol-file" target="deploy"/>
@@ -212,6 +213,7 @@
<ant dir="parse-swf" target="clean"/>
<ant dir="parse-tika" target="clean"/>
<ant dir="parse-zip" target="clean"/>
+ <ant dir="parsefilter-debug" target="clean" />
<ant dir="parsefilter-naivebayes" target="clean" />
<ant dir="parsefilter-regex" target="clean"/>
<ant dir="protocol-file" target="clean"/>
diff --git a/src/plugin/parsefilter-debug/build.xml b/src/plugin/parsefilter-debug/build.xml
new file mode 100644
index 0000000..1f175e4
--- /dev/null
+++ b/src/plugin/parsefilter-debug/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parsefilter-debug" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+</project>
diff --git a/src/plugin/parsefilter-debug/ivy.xml b/src/plugin/parsefilter-debug/ivy.xml
new file mode 100644
index 0000000..dac80e6
--- /dev/null
+++ b/src/plugin/parsefilter-debug/ivy.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}">
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="https://nutch.apache.org/"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/>
+ </configurations>
+
+ <publications>
+ <!--get the artifact from our module name-->
+ <artifact conf="master"/>
+ </publications>
+</ivy-module>
diff --git a/src/plugin/parsefilter-debug/plugin.xml b/src/plugin/parsefilter-debug/plugin.xml
new file mode 100644
index 0000000..bc4a574
--- /dev/null
+++ b/src/plugin/parsefilter-debug/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="parsefilter-debug"
+ name="Debugging Parse Filter"
+ version="1.0.0"
+ provider-name="nutch.org">
+
+ <runtime>
+ <library name="parsefilter-debug.jar">
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
+ <extension id="org.apache.nutch.htmlparsefilter.debug"
+ name="Nutch Parser Filter" point="org.apache.nutch.parse.HtmlParseFilter">
+ <implementation id="DebugParseFilter"
+ class="org.apache.nutch.parsefilter.debug.DebugParseFilter">
+ </implementation>
+ </extension>
+
+</plugin>
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
new file mode 100644
index 0000000..691f894
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/DebugParseFilter.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parsefilter.debug;
+
+import java.io.ByteArrayOutputStream;
+import java.io.OutputStreamWriter;
+import java.lang.invoke.MethodHandles;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.DomUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+public class DebugParseFilter implements HtmlParseFilter {
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
+ private Configuration conf;
+
+ @Override
+ public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DomUtil.saveDom(baos, doc); // serialize the DOM fragment into the in-memory buffer
+ Parse parse = parseResult.get(content.getUrl()); // NOTE(review): dereferenced below without a null check - confirm get() cannot return null here
+ String dom = new String(baos.toByteArray(), StandardCharsets.UTF_8);
+ LOG.debug(dom);
+ parse.getData().getParseMeta().set("DOM", dom); // expose the serialized DOM as parse metadata under the key "DOM"
+ return parseResult;
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf;
+ }
+}
diff --git a/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
new file mode 100644
index 0000000..bbc24dd
--- /dev/null
+++ b/src/plugin/parsefilter-debug/src/java/org/apache/nutch/parsefilter/debug/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Adds serialized DOM to parse data, useful for debugging, to understand how
+ * the parser implementation interprets a document (not only HTML).
+ */
+package org.apache.nutch.parsefilter.debug;
+