<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc (1.8.0_221) on Tue Jan 19 12:28:02 PST 2021 -->
<title>Overview (apache-nutch 1.18 API)</title>
<meta name="date" content="2021-01-19">
<link rel="stylesheet" type="text/css" href="stylesheet.css" title="Style">
<script type="text/javascript" src="script.js"></script>
</head>
<body>
<script type="text/javascript"><!--
// Copy this page's title onto the parent document, unless the current URL
// contains the 'is-external=true' marker.
try {
    if (location.href.indexOf('is-external=true') < 0) {
        parent.document.title = "Overview (apache-nutch 1.18 API)";
    }
} catch (ignored) {
    // Best effort: any error thrown while reading the URL or writing the
    // parent document's title is deliberately discarded.
}
//-->
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
<!-- ========= START OF TOP NAVBAR ======= -->
<div class="topNav"><a name="navbar.top">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.top.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li class="navBarCell1Rev">Overview</li>
<li>Package</li>
<li>Class</li>
<li>Use</li>
<li><a href="overview-tree.html">Tree</a></li>
<li><a href="deprecated-list.html">Deprecated</a></li>
<li><a href="index-all.html">Index</a></li>
<li><a href="help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev</li>
<li>Next</li>
</ul>
<ul class="navList">
<li><a href="index.html?overview-summary.html" target="_top">Frames</a></li>
<li><a href="overview-summary.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_top">
<li><a href="allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Display the "All Classes" list only when this page is the top-level
// window; hide it when the page is nested inside another window (frame).
allClassesLink = document.getElementById("allclasses_navbar_top");
allClassesLink.style.display = (window == top) ? "block" : "none";
//-->
</script>
</div>
<a name="skip.navbar.top">
<!-- -->
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
<div class="header">
<h1 class="title">apache-nutch 1.18 API</h1>
</div>
<div class="header">
<div class="subTitle">
<div class="block">Apache Nutch is a highly extensible and scalable open source web crawler software project.</div>
</div>
<p>See: <a href="#overview.description">Description</a></p>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Core table, listing packages, and an explanation">
<caption><span>Core</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/crawl/package-summary.html">org.apache.nutch.crawl</a></td>
<td class="colLast">
<div class="block">Crawl control code and tools to run the crawler.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/exchange/package-summary.html">org.apache.nutch.exchange</a></td>
<td class="colLast">
<div class="block">Control code for exchange component, which acts in indexing job and decides to
which index writer a document should be routed, based on plugins behavior.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/fetcher/package-summary.html">org.apache.nutch.fetcher</a></td>
<td class="colLast">
<div class="block">The Nutch robot.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/hostdb/package-summary.html">org.apache.nutch.hostdb</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/package-summary.html">org.apache.nutch.indexer</a></td>
<td class="colLast">
<div class="block">Index content, configure and run indexing and cleaning jobs to
add, update, and delete documents from an index.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/metadata/package-summary.html">org.apache.nutch.metadata</a></td>
<td class="colLast">
<div class="block">A Multi-valued Metadata container, and set
of constant fields for Nutch Metadata.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/net/package-summary.html">org.apache.nutch.net</a></td>
<td class="colLast">
<div class="block">Web-related interfaces: URL <a href="org/apache/nutch/net/URLFilter.html" title="interface in org.apache.nutch.net"><code>filters</code></a>
and <a href="org/apache/nutch/net/URLNormalizer.html" title="interface in org.apache.nutch.net"><code>normalizers</code></a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/net/protocols/package-summary.html">org.apache.nutch.net.protocols</a></td>
<td class="colLast">
<div class="block">Helper classes related to the <a href="org/apache/nutch/protocol/Protocol.html" title="interface in org.apache.nutch.protocol"><code>Protocol</code></a>
interface, see also <a href="org/apache/nutch/protocol/package-summary.html"><code>org.apache.nutch.protocol</code></a>.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/package-summary.html">org.apache.nutch.parse</a></td>
<td class="colLast">
<div class="block">The <a href="org/apache/nutch/parse/Parse.html" title="interface in org.apache.nutch.parse"><code>Parse</code></a> interface and related classes.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/plugin/package-summary.html">org.apache.nutch.plugin</a></td>
<td class="colLast">
<div class="block">The Nutch <a href="org/apache/nutch/plugin/Pluggable.html" title="interface in org.apache.nutch.plugin"><code>Plugin</code></a> System.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/package-summary.html">org.apache.nutch.protocol</a></td>
<td class="colLast">
<div class="block">Classes related to the <a href="org/apache/nutch/protocol/Protocol.html" title="interface in org.apache.nutch.protocol"><code>Protocol</code></a> interface,
see also <a href="org/apache/nutch/net/protocols/package-summary.html"><code>org.apache.nutch.net.protocols</code></a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/publisher/package-summary.html">org.apache.nutch.publisher</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/rabbitmq/package-summary.html">org.apache.nutch.rabbitmq</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/package-summary.html">org.apache.nutch.scoring</a></td>
<td class="colLast">
<div class="block">The <a href="org/apache/nutch/scoring/ScoringFilter.html" title="interface in org.apache.nutch.scoring"><code>ScoringFilter</code></a> interface.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/webgraph/package-summary.html">org.apache.nutch.scoring.webgraph</a></td>
<td class="colLast">
<div class="block">Scoring implementation based on link analysis
(<a href="org/apache/nutch/scoring/webgraph/LinkRank.html" title="class in org.apache.nutch.scoring.webgraph"><code>LinkRank</code></a>),
see <a href="org/apache/nutch/scoring/webgraph/WebGraph.html" title="class in org.apache.nutch.scoring.webgraph"><code>WebGraph</code></a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/segment/package-summary.html">org.apache.nutch.segment</a></td>
<td class="colLast">
<div class="block">A segment stores all data from one generate/fetch/update cycle:
fetch list, protocol status, raw content, parsed content, and extracted outgoing links.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/service/package-summary.html">org.apache.nutch.service</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/service/impl/package-summary.html">org.apache.nutch.service.impl</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/service/model/request/package-summary.html">org.apache.nutch.service.model.request</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/service/model/response/package-summary.html">org.apache.nutch.service.model.response</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/service/resources/package-summary.html">org.apache.nutch.service.resources</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/tools/package-summary.html">org.apache.nutch.tools</a></td>
<td class="colLast">
<div class="block">Miscellaneous tools.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/tools/arc/package-summary.html">org.apache.nutch.tools.arc</a></td>
<td class="colLast">
<div class="block">Tools to read the
<a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/tools/warc/package-summary.html">org.apache.nutch.tools.warc</a></td>
<td class="colLast">
<div class="block">Tools to import / export between Nutch segments and
<a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf">
WARC archives</a>.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/util/package-summary.html">org.apache.nutch.util</a></td>
<td class="colLast">
<div class="block">Miscellaneous utility classes.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/util/domain/package-summary.html">org.apache.nutch.util.domain</a></td>
<td class="colLast">
<div class="block">Classes for domain name analysis.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/package-summary.html">org.apache.nutch.webui</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/client/package-summary.html">org.apache.nutch.webui.client</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/client/impl/package-summary.html">org.apache.nutch.webui.client.impl</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/client/model/package-summary.html">org.apache.nutch.webui.client.model</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/config/package-summary.html">org.apache.nutch.webui.config</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/model/package-summary.html">org.apache.nutch.webui.model</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/package-summary.html">org.apache.nutch.webui.pages</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/assets/package-summary.html">org.apache.nutch.webui.pages.assets</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/components/package-summary.html">org.apache.nutch.webui.pages.components</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/crawls/package-summary.html">org.apache.nutch.webui.pages.crawls</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/instances/package-summary.html">org.apache.nutch.webui.pages.instances</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/menu/package-summary.html">org.apache.nutch.webui.pages.menu</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/seed/package-summary.html">org.apache.nutch.webui.pages.seed</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/pages/settings/package-summary.html">org.apache.nutch.webui.pages.settings</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/webui/service/package-summary.html">org.apache.nutch.webui.service</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/webui/service/impl/package-summary.html">org.apache.nutch.webui.service.impl</a></td>
<td class="colLast">&nbsp;</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Plugins API table, listing packages, and an explanation">
<caption><span>Plugins API</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/http/api/package-summary.html">org.apache.nutch.protocol.http.api</a></td>
<td class="colLast">
<div class="block">Common API used by HTTP plugins (<a href="org/apache/nutch/protocol/http/package-summary.html"><code>http</code></a>,
<a href="org/apache/nutch/protocol/httpclient/package-summary.html"><code>httpclient</code></a>).</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/api/package-summary.html">org.apache.nutch.urlfilter.api</a></td>
<td class="colLast">
<div class="block">Generic <a href="org/apache/nutch/net/URLFilter.html" title="interface in org.apache.nutch.net"><code>URL filter</code></a> library,
abstracting away from regular expression implementations.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Protocol Plugins table, listing packages, and an explanation">
<caption><span>Protocol Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/file/package-summary.html">org.apache.nutch.protocol.file</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving local file resources.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/ftp/package-summary.html">org.apache.nutch.protocol.ftp</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via the ftp protocol.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/htmlunit/package-summary.html">org.apache.nutch.protocol.htmlunit</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via the http protocol.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/http/package-summary.html">org.apache.nutch.protocol.http</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via the http protocol.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/httpclient/package-summary.html">org.apache.nutch.protocol.httpclient</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via the HTTP and
HTTPS protocols, optionally with Basic, Digest and NTLM authentication
schemes for web server as well as proxy server.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/interactiveselenium/package-summary.html">org.apache.nutch.protocol.interactiveselenium</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via selenium.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/interactiveselenium/handlers/package-summary.html">org.apache.nutch.protocol.interactiveselenium.handlers</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/okhttp/package-summary.html">org.apache.nutch.protocol.okhttp</a></td>
<td class="colLast">
<div class="block">Protocol plugin based on <a href="https://github.com/square/okhttp">okhttp</a>, supports http, https, http/2.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/protocol/selenium/package-summary.html">org.apache.nutch.protocol.selenium</a></td>
<td class="colLast">
<div class="block">Protocol plugin which supports retrieving documents via selenium.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="URL Filter Plugins table, listing packages, and an explanation">
<caption><span>URL Filter Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/automaton/package-summary.html">org.apache.nutch.urlfilter.automaton</a></td>
<td class="colLast">
<div class="block">
URL filter plugin based on
<a href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
Automata for Java<sup>TM</sup>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/domain/package-summary.html">org.apache.nutch.urlfilter.domain</a></td>
<td class="colLast">
<div class="block">URL filter plugin to include only URLs which match an element in a given list of
domain suffixes, domain names, and/or host names.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/domaindenylist/package-summary.html">org.apache.nutch.urlfilter.domaindenylist</a></td>
<td class="colLast">
<div class="block">URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/fast/package-summary.html">org.apache.nutch.urlfilter.fast</a></td>
<td class="colLast">
<div class="block">URL filter plugin that first does fast exact suffix matches on host/domain
names before applying regular expressions to the path component of a URL.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/ignoreexempt/package-summary.html">org.apache.nutch.urlfilter.ignoreexempt</a></td>
<td class="colLast">
<div class="block">URL filter plugin which identifies exemptions to external urls
when external urls are set to ignore.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/prefix/package-summary.html">org.apache.nutch.urlfilter.prefix</a></td>
<td class="colLast">
<div class="block">URL filter plugin to include only URLs which match one of a given list of URL prefixes.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/regex/package-summary.html">org.apache.nutch.urlfilter.regex</a></td>
<td class="colLast">
<div class="block">URL filter plugin to include and/or exclude URLs matching Java regular expressions.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/suffix/package-summary.html">org.apache.nutch.urlfilter.suffix</a></td>
<td class="colLast">
<div class="block">URL filter plugin to either exclude or include only URLs which match
one of the given (path) suffixes.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/urlfilter/validator/package-summary.html">org.apache.nutch.urlfilter.validator</a></td>
<td class="colLast">
<div class="block">URL filter plugin that validates given urls.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="URL Normalizer Plugins table, listing packages, and an explanation">
<caption><span>URL Normalizer Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/ajax/package-summary.html">org.apache.nutch.net.urlnormalizer.ajax</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/basic/package-summary.html">org.apache.nutch.net.urlnormalizer.basic</a></td>
<td class="colLast">
<div class="block">URL normalizer performing basic normalizations:
remove default ports, e.g., port 80 for <code>http://</code> URLs
remove needless slashes and dot segments in the path component
remove anchors
use percent-encoding (only) where needed
E.g.,
<code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor</code>
is normalized to <code>https://www.example.org/b/select-lang.php?lang=espa%C3%B1ol</code>
Optional and configurable normalizations are:
convert Internationalized Domain Names (IDNs) uniquely either to the
ASCII (Punycode) or Unicode representation, see property
<code>urlnormalizer.basic.host.idn</code>
remove a trailing dot from host names, see property
<code>urlnormalizer.basic.host.trim-trailing-dot</code>
</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/host/package-summary.html">org.apache.nutch.net.urlnormalizer.host</a></td>
<td class="colLast">
<div class="block">URL normalizer renaming hosts to a canonical form listed in the
configuration file.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/pass/package-summary.html">org.apache.nutch.net.urlnormalizer.pass</a></td>
<td class="colLast">
<div class="block">URL normalizer dummy which does not change URLs.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/protocol/package-summary.html">org.apache.nutch.net.urlnormalizer.protocol</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/querystring/package-summary.html">org.apache.nutch.net.urlnormalizer.querystring</a></td>
<td class="colLast">
<div class="block">URL normalizer which sorts the elements in the query part to avoid duplicates
by permutations.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/regex/package-summary.html">org.apache.nutch.net.urlnormalizer.regex</a></td>
<td class="colLast">
<div class="block">URL normalizer with configurable rules based on regular expressions
(<a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html?is-external=true" title="class or interface in java.util.regex"><code>Pattern</code></a>).</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/net/urlnormalizer/slash/package-summary.html">org.apache.nutch.net.urlnormalizer.slash</a></td>
<td class="colLast">&nbsp;</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Scoring Plugins table, listing packages, and an explanation">
<caption><span>Scoring Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/depth/package-summary.html">org.apache.nutch.scoring.depth</a></td>
<td class="colLast">
<div class="block">Scoring filter to stop crawling at a configurable depth
(number of "hops" from seed URLs).</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/link/package-summary.html">org.apache.nutch.scoring.link</a></td>
<td class="colLast">
<div class="block">Scoring filter used in conjunction with
<a href="org/apache/nutch/scoring/webgraph/WebGraph.html" title="class in org.apache.nutch.scoring.webgraph"><code>WebGraph</code></a>.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/opic/package-summary.html">org.apache.nutch.scoring.opic</a></td>
<td class="colLast">
<div class="block">Scoring filter implementing a variant of the Online Page Importance Computation
(OPIC) algorithm.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/orphan/package-summary.html">org.apache.nutch.scoring.orphan</a></td>
<td class="colLast">
<div class="block">Scoring filter to modify score or status of orphaned pages (no inlinks found
for a configurable amount of time).</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/similarity/package-summary.html">org.apache.nutch.scoring.similarity</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/similarity/cosine/package-summary.html">org.apache.nutch.scoring.similarity.cosine</a></td>
<td class="colLast">
<div class="block">Implements the cosine similarity metric for scoring relevant documents.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/similarity/util/package-summary.html">org.apache.nutch.scoring.similarity.util</a></td>
<td class="colLast">
<div class="block">Utility package for Lucene functions.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/tld/package-summary.html">org.apache.nutch.scoring.tld</a></td>
<td class="colLast">
<div class="block">Top Level Domain Scoring plugin.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/scoring/urlmeta/package-summary.html">org.apache.nutch.scoring.urlmeta</a></td>
<td class="colLast">
<div class="block">
URL Meta Tag Scoring Plugin</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Parse Plugins table, listing packages, and an explanation">
<caption><span>Parse Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/ext/package-summary.html">org.apache.nutch.parse.ext</a></td>
<td class="colLast">
<div class="block">Parse wrapper to run external command to do the parsing.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/parse/feed/package-summary.html">org.apache.nutch.parse.feed</a></td>
<td class="colLast">
<div class="block">Parse RSS feeds.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/html/package-summary.html">org.apache.nutch.parse.html</a></td>
<td class="colLast">
<div class="block">An HTML document parsing plugin.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/parse/js/package-summary.html">org.apache.nutch.parse.js</a></td>
<td class="colLast">
<div class="block">Parser and parse filter plugin to extract all (possible) links
from JavaScript files and embedded JavaScript code snippets.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/swf/package-summary.html">org.apache.nutch.parse.swf</a></td>
<td class="colLast">
<div class="block">Parse Flash SWF files.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/parse/tika/package-summary.html">org.apache.nutch.parse.tika</a></td>
<td class="colLast">
<div class="block">Parse various document formats with help of
<a href="https://tika.apache.org/">Apache Tika</a>.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/zip/package-summary.html">org.apache.nutch.parse.zip</a></td>
<td class="colLast">
<div class="block">Parse ZIP files: embedded files are recursively passed to appropriate parsers.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Parse Filter Plugins table, listing packages, and an explanation">
<caption><span>Parse Filter Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parse/headings/package-summary.html">org.apache.nutch.parse.headings</a></td>
<td class="colLast">
<div class="block">Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/parse/metatags/package-summary.html">org.apache.nutch.parse.metatags</a></td>
<td class="colLast">
<div class="block">Parse filter to extract meta tags: keywords, description, etc.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parsefilter/debug/package-summary.html">org.apache.nutch.parsefilter.debug</a></td>
<td class="colLast">
<div class="block">Adds serialized DOM to parse data, useful for debugging, to understand how
the parser implementation interprets a document (not only HTML).</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/parsefilter/naivebayes/package-summary.html">org.apache.nutch.parsefilter.naivebayes</a></td>
<td class="colLast">
<div class="block">HTML parse filter that classifies the outlinks from the parseresult as
relevant or irrelevant based on the parseText's relevancy (using a training
file where you can give positive and negative example texts; see the
description of parsefilter.naivebayes.trainfile). If found irrelevant,
it gives the link a second chance if it contains any of the words from the
list given in parsefilter.naivebayes.wordlist.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/parsefilter/regex/package-summary.html">org.apache.nutch.parsefilter.regex</a></td>
<td class="colLast">
<div class="block">RegexParseFilter.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Publisher Plugins table, listing packages, and an explanation">
<caption><span>Publisher Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/publisher/rabbitmq/package-summary.html">org.apache.nutch.publisher.rabbitmq</a></td>
<td class="colLast">
<div class="block">Publisher package to implement queues</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Exchange Plugins table, listing packages, and an explanation">
<caption><span>Exchange Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/exchange/jexl/package-summary.html">org.apache.nutch.exchange.jexl</a></td>
<td class="colLast">
<div class="block">Plugin of Exchange component based on JEXL expressions.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Indexing Filter Plugins table, listing packages, and an explanation">
<caption><span>Indexing Filter Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/anchor/package-summary.html">org.apache.nutch.indexer.anchor</a></td>
<td class="colLast">
<div class="block">An indexing plugin for inbound anchor text.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/basic/package-summary.html">org.apache.nutch.indexer.basic</a></td>
<td class="colLast">
<div class="block">A basic indexing plugin, adds basic fields: url, host, title, content, etc.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/feed/package-summary.html">org.apache.nutch.indexer.feed</a></td>
<td class="colLast">
<div class="block">Indexing filter to index meta data from RSS feeds.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/filter/package-summary.html">org.apache.nutch.indexer.filter</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/geoip/package-summary.html">org.apache.nutch.indexer.geoip</a></td>
<td class="colLast">
<div class="block">This plugin implements an indexing filter which takes
advantage of the
<a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/jexl/package-summary.html">org.apache.nutch.indexer.jexl</a></td>
<td class="colLast">
<div class="block">This plugin implements a dynamic indexing filter which uses JEXL
expressions to allow filtering based on the page's metadata</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/links/package-summary.html">org.apache.nutch.indexer.links</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/metadata/package-summary.html">org.apache.nutch.indexer.metadata</a></td>
<td class="colLast">
<div class="block">Indexing filter to add document metadata to the index.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/more/package-summary.html">org.apache.nutch.indexer.more</a></td>
<td class="colLast">
<div class="block">A more indexing plugin, adds "more" index fields:
last modified date, MIME type, content length.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/replace/package-summary.html">org.apache.nutch.indexer.replace</a></td>
<td class="colLast">
<div class="block">Indexing filter to allow pattern replacements on metadata.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/staticfield/package-summary.html">org.apache.nutch.indexer.staticfield</a></td>
<td class="colLast">
<div class="block">A simple plugin called at indexing that adds fields with static data.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/subcollection/package-summary.html">org.apache.nutch.indexer.subcollection</a></td>
<td class="colLast">
<div class="block">Indexing filter to assign documents to subcollections.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/tld/package-summary.html">org.apache.nutch.indexer.tld</a></td>
<td class="colLast">
<div class="block">Top Level Domain Indexing plugin.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexer/urlmeta/package-summary.html">org.apache.nutch.indexer.urlmeta</a></td>
<td class="colLast">
<div class="block">
URL Meta Tag Indexing Plugin</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Indexer Plugins table, listing packages, and an explanation">
<caption><span>Indexer Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/cloudsearch/package-summary.html">org.apache.nutch.indexwriter.cloudsearch</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/csv/package-summary.html">org.apache.nutch.indexwriter.csv</a></td>
<td class="colLast">
<div class="block">Index writer plugin to write a plain CSV file.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/dummy/package-summary.html">org.apache.nutch.indexwriter.dummy</a></td>
<td class="colLast">
<div class="block">Index writer plugin for debugging, writes pairs of &lt;action, url&gt; to a
text file, action is one of "add", "update", or "delete".</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/elastic/package-summary.html">org.apache.nutch.indexwriter.elastic</a></td>
<td class="colLast">
<div class="block">Index writer plugin for <a href="https://www.elastic.co/products/elasticsearch">Elasticsearch</a>.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/kafka/package-summary.html">org.apache.nutch.indexwriter.kafka</a></td>
<td class="colLast">
<div class="block">Index writer plugin to produce JSON messages to <a href="https://kafka.apache.org/">Kafka</a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/rabbit/package-summary.html">org.apache.nutch.indexwriter.rabbit</a></td>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/indexwriter/solr/package-summary.html">org.apache.nutch.indexwriter.solr</a></td>
<td class="colLast">
<div class="block">Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer">
<table class="overviewSummary" border="0" cellpadding="3" cellspacing="0" summary="Misc. Plugins table, listing packages, and an explanation">
<caption><span>Misc. Plugins</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Package</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tbody>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/analysis/lang/package-summary.html">org.apache.nutch.analysis.lang</a></td>
<td class="colLast">
<div class="block">Text document language identifier.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/any23/package-summary.html">org.apache.nutch.any23</a></td>
<td class="colLast">
<div class="block">This package uses the <a href="https://any23.apache.org/">Apache Any23</a> library
for parsing and extracting structured data in RDF format from a
variety of Web documents.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/apache/nutch/collection/package-summary.html">org.apache.nutch.collection</a></td>
<td class="colLast">
<div class="block">
Subcollection is a subset of an index.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><a href="org/apache/nutch/microformats/reltag/package-summary.html">org.apache.nutch.microformats.reltag</a></td>
<td class="colLast">
<div class="block">
A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a>
Parser/Indexer/Querier plugin.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><a href="org/creativecommons/nutch/package-summary.html">org.creativecommons.nutch</a></td>
<td class="colLast">
<div class="block">Sample plugins that parse and index Creative Commons metadata.</div>
</td>
</tr>
</tbody>
</table>
</div>
<div class="contentContainer"><a name="overview.description">
<!-- -->
</a>
<div class="block"><p>Apache Nutch is a highly extensible and scalable open source web crawler software project.</p>
<p>Nutch is a project of the Apache Software Foundation and is part of the larger Apache community of developers and users.</p></div>
</div>
<!-- ======= START OF BOTTOM NAVBAR ====== -->
<div class="bottomNav"><a name="navbar.bottom">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div>
<a name="navbar.bottom.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li class="navBarCell1Rev">Overview</li>
<li>Package</li>
<li>Class</li>
<li>Use</li>
<li><a href="overview-tree.html">Tree</a></li>
<li><a href="deprecated-list.html">Deprecated</a></li>
<li><a href="index-all.html">Index</a></li>
<li><a href="help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList">
<li>Prev</li>
<li>Next</li>
</ul>
<ul class="navList">
<li><a href="index.html?overview-summary.html" target="_top">Frames</a></li>
<li><a href="overview-summary.html" target="_top">No&nbsp;Frames</a></li>
</ul>
<ul class="navList" id="allclasses_navbar_bottom">
<li><a href="allclasses-noframe.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the bottom "All Classes" link only when this page is the top-level
// window; hide it when the page is displayed inside a frame.
allClassesLink = document.getElementById("allclasses_navbar_bottom");
allClassesLink.style.display = (window==top) ? "block" : "none";
//-->
</script>
</div>
<a name="skip.navbar.bottom">
<!-- -->
</a></div>
<!-- ======== END OF BOTTOM NAVBAR ======= -->
<p class="legalCopy"><small>Copyright &copy; 2021 The Apache Software Foundation</small></p>
</body>
</html>