| <!DOCTYPE HTML> |
| <!-- NewPage --> |
| <html lang="en"> |
| <head> |
| <!-- Generated by javadoc (11.0.27) on Wed Jul 16 12:49:41 CEST 2025 --> |
| <title>All Packages (apache-nutch 1.21 API)</title> |
| <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> |
| <meta name="dc.created" content="2025-07-16"> |
| <link rel="stylesheet" type="text/css" href="stylesheet.css" title="Style"> |
| <link rel="stylesheet" type="text/css" href="jquery/jquery-ui.min.css" title="Style"> |
| <link rel="stylesheet" type="text/css" href="jquery-ui.overrides.css" title="Style"> |
| <script type="text/javascript" src="script.js"></script> |
| <script type="text/javascript" src="jquery/jszip/dist/jszip.min.js"></script> |
| <script type="text/javascript" src="jquery/jszip-utils/dist/jszip-utils.min.js"></script> |
| <!--[if IE]> |
| <script type="text/javascript" src="jquery/jszip-utils/dist/jszip-utils-ie.min.js"></script> |
| <![endif]--> |
| <script type="text/javascript" src="jquery/jquery-3.7.1.min.js"></script> |
| <script type="text/javascript" src="jquery/jquery-ui.min.js"></script> |
| </head> |
| <body> |
| <script type="text/javascript"><!-- |
| try { |
| if (location.href.indexOf('is-external=true') == -1) { |
| parent.document.title="All Packages (apache-nutch 1.21 API)"; |
| } |
| } |
| catch(err) { |
| } |
| //--> |
| var pathtoroot = "./"; |
| var useModuleDirectories = false; |
| loadScripts(document, 'script');</script> |
| <noscript> |
| <div>JavaScript is disabled on your browser.</div> |
| </noscript> |
| <header role="banner"> |
| <nav role="navigation"> |
| <div class="fixedNav"> |
| <!-- ========= START OF TOP NAVBAR ======= --> |
| <div class="topNav"><a id="navbar.top"> |
| <!-- --> |
| </a> |
| <div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div> |
| <a id="navbar.top.firstrow"> |
| <!-- --> |
| </a> |
| <ul class="navList" title="Navigation"> |
| <li><a href="index.html">Overview</a></li> |
| <li>Package</li> |
| <li>Class</li> |
| <li>Use</li> |
| <li><a href="overview-tree.html">Tree</a></li> |
| <li><a href="deprecated-list.html">Deprecated</a></li> |
| <li><a href="index-all.html">Index</a></li> |
| <li><a href="help-doc.html">Help</a></li> |
| </ul> |
| </div> |
| <div class="subNav"> |
| <ul class="navList" id="allclasses_navbar_top"> |
| <li><a href="allclasses.html">All Classes</a></li> |
| </ul> |
| <ul class="navListSearch"> |
| <li><label for="search">SEARCH:</label> |
| <input type="text" id="search" value="search" disabled="disabled"> |
| <input type="reset" id="reset" value="reset" disabled="disabled"> |
| </li> |
| </ul> |
| <div> |
| <script type="text/javascript"><!-- |
| allClassesLink = document.getElementById("allclasses_navbar_top"); |
| if(window==top) { |
| allClassesLink.style.display = "block"; |
| } |
| else { |
| allClassesLink.style.display = "none"; |
| } |
| //--> |
| </script> |
| <noscript> |
| <div>JavaScript is disabled on your browser.</div> |
| </noscript> |
| </div> |
| <a id="skip.navbar.top"> |
| <!-- --> |
| </a></div> |
| <!-- ========= END OF TOP NAVBAR ========= --> |
| </div> |
| <div class="navPadding"> </div> |
| <script type="text/javascript"><!-- |
| $('.navPadding').css('padding-top', $('.fixedNav').css("height")); |
| //--> |
| </script> |
| </nav> |
| </header> |
| <main role="main"> |
| <div class="header"> |
| <h1 title="All&nbsp;Packages" class="title">All Packages</h1> |
| </div> |
| <div class="allPackagesContainer"> |
| <ul class="blockList"> |
| <li class="blockList"> |
| <table class="packagesSummary"> |
| <caption><span>Package Summary</span><span class="tabEnd"> </span></caption> |
| <tr> |
| <th class="colFirst" scope="col">Package</th> |
| <th class="colLast" scope="col">Description</th> |
| </tr> |
| <tbody> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/analysis/lang/package-summary.html">org.apache.nutch.analysis.lang</a></th> |
| <td class="colLast"> |
| <div class="block">Text document language identifier.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/collection/package-summary.html">org.apache.nutch.collection</a></th> |
| <td class="colLast"> |
| <div class="block">Subcollection is a subset of an index.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/crawl/package-summary.html">org.apache.nutch.crawl</a></th> |
| <td class="colLast"> |
| <div class="block">Crawl control code and tools to run the crawler.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/exchange/package-summary.html">org.apache.nutch.exchange</a></th> |
| <td class="colLast"> |
| <div class="block">Control code for exchange component, which acts in indexing job and decides to |
| which index writer a document should be routed, based on plugins behavior.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/exchange/jexl/package-summary.html">org.apache.nutch.exchange.jexl</a></th> |
| <td class="colLast"> |
| <div class="block">Plugin of Exchange component based on JEXL expressions.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/fetcher/package-summary.html">org.apache.nutch.fetcher</a></th> |
| <td class="colLast"> |
| <div class="block">The Nutch multi-threaded fetching module</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/hostdb/package-summary.html">org.apache.nutch.hostdb</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/package-summary.html">org.apache.nutch.indexer</a></th> |
| <td class="colLast"> |
| <div class="block">Index content, configure and run indexing and cleaning jobs to |
| add, update, and delete documents from an index.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/anchor/package-summary.html">org.apache.nutch.indexer.anchor</a></th> |
| <td class="colLast"> |
| <div class="block">An indexing plugin for inbound anchor text.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/arbitrary/package-summary.html">org.apache.nutch.indexer.arbitrary</a></th> |
| <td class="colLast"> |
| <div class="block">Indexing filter to add document arbitrary data to the index |
| from the output of a user-specified class.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/basic/package-summary.html">org.apache.nutch.indexer.basic</a></th> |
| <td class="colLast"> |
| <div class="block">A basic indexing plugin, adds basic fields: url, host, title, content, etc.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/feed/package-summary.html">org.apache.nutch.indexer.feed</a></th> |
| <td class="colLast"> |
| <div class="block">Indexing filter to index meta data from RSS feeds.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/filter/package-summary.html">org.apache.nutch.indexer.filter</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/geoip/package-summary.html">org.apache.nutch.indexer.geoip</a></th> |
| <td class="colLast"> |
| <div class="block">This plugin implements an indexing filter which takes |
| advantage of the |
| <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/jexl/package-summary.html">org.apache.nutch.indexer.jexl</a></th> |
| <td class="colLast"> |
| <div class="block">This plugin implements a dynamic indexing filter which uses JEXL |
| expressions to allow filtering based on the page's metadata</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/links/package-summary.html">org.apache.nutch.indexer.links</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/metadata/package-summary.html">org.apache.nutch.indexer.metadata</a></th> |
| <td class="colLast"> |
| <div class="block">Indexing filter to add document metadata to the index.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/more/package-summary.html">org.apache.nutch.indexer.more</a></th> |
| <td class="colLast"> |
| <div class="block">A more indexing plugin, adds "more" index fields: last modified |
| date, MIME type, content length.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/replace/package-summary.html">org.apache.nutch.indexer.replace</a></th> |
| <td class="colLast"> |
| <div class="block">Indexing filter to allow pattern replacements on metadata.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/staticfield/package-summary.html">org.apache.nutch.indexer.staticfield</a></th> |
| <td class="colLast"> |
| <div class="block">A simple plugin called at indexing that adds fields with static data.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/subcollection/package-summary.html">org.apache.nutch.indexer.subcollection</a></th> |
| <td class="colLast"> |
| <div class="block">Indexing filter to assign documents to subcollections.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/tld/package-summary.html">org.apache.nutch.indexer.tld</a></th> |
| <td class="colLast"> |
| <div class="block">Top Level Domain Indexing plugin.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexer/urlmeta/package-summary.html">org.apache.nutch.indexer.urlmeta</a></th> |
| <td class="colLast"> |
| <div class="block">URL Meta Tag Indexing Plugin</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/cloudsearch/package-summary.html">org.apache.nutch.indexwriter.cloudsearch</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/csv/package-summary.html">org.apache.nutch.indexwriter.csv</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin to write a plain CSV file.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/dummy/package-summary.html">org.apache.nutch.indexwriter.dummy</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin for debugging, writes pairs of &lt;action, url&gt; to a |
| text file, action is one of "add", "update", or "delete".</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/elastic/package-summary.html">org.apache.nutch.indexwriter.elastic</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin for <a href="https://www.elastic.co/products/elasticsearch">Elasticsearch</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/kafka/package-summary.html">org.apache.nutch.indexwriter.kafka</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin to produce JSON messages to <a href="https://kafka.apache.org/">Kafka</a>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/opensearch1x/package-summary.html">org.apache.nutch.indexwriter.opensearch1x</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin for <a href="https://opensearch.org">OpenSearch</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/rabbit/package-summary.html">org.apache.nutch.indexwriter.rabbit</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/indexwriter/solr/package-summary.html">org.apache.nutch.indexwriter.solr</a></th> |
| <td class="colLast"> |
| <div class="block">Index writer plugin for <a href="http://lucene.apache.org/solr/">Apache Solr</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/metadata/package-summary.html">org.apache.nutch.metadata</a></th> |
| <td class="colLast"> |
| <div class="block">A Multi-valued Metadata container, and set |
| of constant fields for Nutch Metadata.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/microformats/reltag/package-summary.html">org.apache.nutch.microformats.reltag</a></th> |
| <td class="colLast"> |
| <div class="block">A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> |
| Parser/Indexer/Querier plugin.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/package-summary.html">org.apache.nutch.net</a></th> |
| <td class="colLast"> |
| <div class="block">Web-related interfaces: URL <a href="org/apache/nutch/net/URLFilter.html" title="interface in org.apache.nutch.net"><code>filters</code></a> |
| and <a href="org/apache/nutch/net/URLNormalizer.html" title="interface in org.apache.nutch.net"><code>normalizers</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/protocols/package-summary.html">org.apache.nutch.net.protocols</a></th> |
| <td class="colLast"> |
| <div class="block">Helper classes related to the <a href="org/apache/nutch/protocol/Protocol.html" title="interface in org.apache.nutch.protocol"><code>Protocol</code></a> |
| interface, see also <a href="org/apache/nutch/protocol/package-summary.html"><code>org.apache.nutch.protocol</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/ajax/package-summary.html">org.apache.nutch.net.urlnormalizer.ajax</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/basic/package-summary.html">org.apache.nutch.net.urlnormalizer.basic</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer performing basic normalizations: |
| <ul> |
| <li>remove default ports, e.g., port 80 for <code>http://</code> URLs</li> |
| <li>remove needless slashes and dot segments in the path component</li> |
| <li>remove anchors</li> |
| <li>use percent-encoding (only) where needed</li> |
| </ul> |
| <p>E.g., |
| <code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor</code> |
| is normalized to <code>https://www.example.org/b/select-lang.php?lang=espa%C3%B1ol</code></p> |
| <p>Optional and configurable normalizations are:</p> |
| <ul> |
| <li>convert Internationalized Domain Names (IDNs) uniquely either to the |
| ASCII (Punycode) or Unicode representation, see property |
| <code>urlnormalizer.basic.host.idn</code></li> |
| <li>remove a trailing dot from host names, see property |
| <code>urlnormalizer.basic.host.trim-trailing-dot</code></li> |
| </ul> |
| </div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/host/package-summary.html">org.apache.nutch.net.urlnormalizer.host</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer renaming hosts to a canonical form listed in the |
| configuration file.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/pass/package-summary.html">org.apache.nutch.net.urlnormalizer.pass</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer dummy which does not change URLs.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/protocol/package-summary.html">org.apache.nutch.net.urlnormalizer.protocol</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer to normalize the protocol for all URLs of a given host or |
| domain.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/querystring/package-summary.html">org.apache.nutch.net.urlnormalizer.querystring</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer which sorts the elements in the query part to avoid duplicates |
| by permutations.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/regex/package-summary.html">org.apache.nutch.net.urlnormalizer.regex</a></th> |
| <td class="colLast"> |
| <div class="block">URL normalizer with configurable rules based on regular expressions |
| (<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/regex/Pattern.html?is-external=true" title="class or interface in java.util.regex" class="externalLink"><code>Pattern</code></a>).</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/net/urlnormalizer/slash/package-summary.html">org.apache.nutch.net.urlnormalizer.slash</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/package-summary.html">org.apache.nutch.parse</a></th> |
| <td class="colLast"> |
| <div class="block">The <a href="org/apache/nutch/parse/Parse.html" title="interface in org.apache.nutch.parse"><code>Parse</code></a> interface and related classes.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/ext/package-summary.html">org.apache.nutch.parse.ext</a></th> |
| <td class="colLast"> |
| <div class="block">Parse wrapper to run external command to do the parsing.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/feed/package-summary.html">org.apache.nutch.parse.feed</a></th> |
| <td class="colLast"> |
| <div class="block">Parse RSS feeds.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/headings/package-summary.html">org.apache.nutch.parse.headings</a></th> |
| <td class="colLast"> |
| <div class="block">Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/html/package-summary.html">org.apache.nutch.parse.html</a></th> |
| <td class="colLast"> |
| <div class="block">An HTML document parsing plugin.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/js/package-summary.html">org.apache.nutch.parse.js</a></th> |
| <td class="colLast"> |
| <div class="block">Parser and parse filter plugin to extract all (possible) links |
| from JavaScript files and embedded JavaScript code snippets.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/metatags/package-summary.html">org.apache.nutch.parse.metatags</a></th> |
| <td class="colLast"> |
| <div class="block">Parse filter to extract meta tags: keywords, description, etc.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/tika/package-summary.html">org.apache.nutch.parse.tika</a></th> |
| <td class="colLast"> |
| <div class="block">Parse various document formats with help of |
| <a href="https://tika.apache.org/">Apache Tika</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parse/zip/package-summary.html">org.apache.nutch.parse.zip</a></th> |
| <td class="colLast"> |
| <div class="block">Parse ZIP files: embedded files are recursively passed to appropriate parsers.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parsefilter/debug/package-summary.html">org.apache.nutch.parsefilter.debug</a></th> |
| <td class="colLast"> |
| <div class="block">Adds serialized DOM to parse data, useful for debugging, to understand how |
| the parser implementation interprets a document (not only HTML).</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parsefilter/naivebayes/package-summary.html">org.apache.nutch.parsefilter.naivebayes</a></th> |
| <td class="colLast"> |
| <div class="block">Html Parse filter that classifies the outlinks from the parseresult as |
| relevant or irrelevant based on the parseText's relevancy (using a training |
| file where you can give positive and negative example texts see the |
| description of parsefilter.naivebayes.trainfile) and if found irrelevant |
| it gives the link a second chance if it contains any of the words from the |
| list given in parsefilter.naivebayes.wordlist.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/parsefilter/regex/package-summary.html">org.apache.nutch.parsefilter.regex</a></th> |
| <td class="colLast"> |
| <div class="block">RegexParseFilter.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/plugin/package-summary.html">org.apache.nutch.plugin</a></th> |
| <td class="colLast"> |
| <div class="block">The Nutch <a href="org/apache/nutch/plugin/Pluggable.html" title="interface in org.apache.nutch.plugin"><code>Plugin</code></a> System.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/package-summary.html">org.apache.nutch.protocol</a></th> |
| <td class="colLast"> |
| <div class="block">Classes related to the <a href="org/apache/nutch/protocol/Protocol.html" title="interface in org.apache.nutch.protocol"><code>Protocol</code></a> interface, |
| see also <a href="org/apache/nutch/net/protocols/package-summary.html"><code>org.apache.nutch.net.protocols</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/file/package-summary.html">org.apache.nutch.protocol.file</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving local file resources.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/ftp/package-summary.html">org.apache.nutch.protocol.ftp</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents via the ftp protocol.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/htmlunit/package-summary.html">org.apache.nutch.protocol.htmlunit</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents via HTTP/HTTPS using |
| <a href="https://www.selenium.dev/">Selenium</a> and the |
| <a href="https://github.com/SeleniumHQ/htmlunit-driver">HtmlUnitDriver web |
| driver</a> for the |
| <a href="https://htmlunit.sourceforge.io/">HtmlUnit</a> headless browser.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/http/package-summary.html">org.apache.nutch.protocol.http</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents via the http protocol.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/http/api/package-summary.html">org.apache.nutch.protocol.http.api</a></th> |
| <td class="colLast"> |
| <div class="block">Common API used by HTTP plugins (<a href="org/apache/nutch/protocol/http/package-summary.html"><code>http</code></a>, |
| <a href="org/apache/nutch/protocol/httpclient/package-summary.html"><code>httpclient</code></a>, etc.)</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/httpclient/package-summary.html">org.apache.nutch.protocol.httpclient</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents via the HTTP and HTTPS |
| protocols, optionally with Basic, Digest and NTLM authentication schemes for |
| web server as well as proxy server.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/interactiveselenium/package-summary.html">org.apache.nutch.protocol.interactiveselenium</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents using and interacting |
| with <a href="https://www.selenium.dev/">Selenium</a>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/interactiveselenium/handlers/package-summary.html">org.apache.nutch.protocol.interactiveselenium.handlers</a></th> |
| <td class="colLast"> |
| <div class="block">Handler implementations to interact with |
| <a href="https://www.selenium.dev/">Selenium</a> for |
| <a href="org/apache/nutch/protocol/interactiveselenium/package-summary.html"><code>org.apache.nutch.protocol.interactiveselenium</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/okhttp/package-summary.html">org.apache.nutch.protocol.okhttp</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin for HTTP/HTTPS based on |
| <a href="https://github.com/square/okhttp">okhttp</a>, supports HTTP/1.1 |
| and/or HTTP/2.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/protocol/selenium/package-summary.html">org.apache.nutch.protocol.selenium</a></th> |
| <td class="colLast"> |
| <div class="block">Protocol plugin which supports retrieving documents via |
| <a href="https://www.selenium.dev/">Selenium</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/publisher/package-summary.html">org.apache.nutch.publisher</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/publisher/rabbitmq/package-summary.html">org.apache.nutch.publisher.rabbitmq</a></th> |
| <td class="colLast"> |
| <div class="block">Publisher package to implement queues</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/rabbitmq/package-summary.html">org.apache.nutch.rabbitmq</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/package-summary.html">org.apache.nutch.scoring</a></th> |
| <td class="colLast"> |
| <div class="block">The <a href="org/apache/nutch/scoring/ScoringFilter.html" title="interface in org.apache.nutch.scoring"><code>ScoringFilter</code></a> interface.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/depth/package-summary.html">org.apache.nutch.scoring.depth</a></th> |
| <td class="colLast"> |
| <div class="block">Scoring filter to stop crawling at a configurable depth |
| (number of "hops" from seed URLs).</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/link/package-summary.html">org.apache.nutch.scoring.link</a></th> |
| <td class="colLast"> |
| <div class="block">Scoring filter used in conjunction with |
| <a href="org/apache/nutch/scoring/webgraph/WebGraph.html" title="class in org.apache.nutch.scoring.webgraph"><code>WebGraph</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/metadata/package-summary.html">org.apache.nutch.scoring.metadata</a></th> |
| <td class="colLast"> |
| <div class="block">Metadata Scoring Plugin</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/opic/package-summary.html">org.apache.nutch.scoring.opic</a></th> |
| <td class="colLast"> |
| <div class="block">Scoring filter implementing a variant of the Online Page Importance Computation |
| (OPIC) algorithm.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/orphan/package-summary.html">org.apache.nutch.scoring.orphan</a></th> |
| <td class="colLast"> |
| <div class="block">Scoring filter to modify score or status of orphaned pages (no inlinks found |
| for a configurable amount of time).</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/similarity/package-summary.html">org.apache.nutch.scoring.similarity</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/similarity/cosine/package-summary.html">org.apache.nutch.scoring.similarity.cosine</a></th> |
| <td class="colLast"> |
| <div class="block">Implements the cosine similarity metric for scoring relevant documents</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/similarity/util/package-summary.html">org.apache.nutch.scoring.similarity.util</a></th> |
| <td class="colLast"> |
| <div class="block">Utility package for Lucene functions.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/urlmeta/package-summary.html">org.apache.nutch.scoring.urlmeta</a></th> |
| <td class="colLast"> |
| <div class="block">URL Meta Tag Scoring Plugin</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/scoring/webgraph/package-summary.html">org.apache.nutch.scoring.webgraph</a></th> |
| <td class="colLast"> |
| <div class="block">Scoring implementation based on link analysis |
| (<a href="org/apache/nutch/scoring/webgraph/LinkRank.html" title="class in org.apache.nutch.scoring.webgraph"><code>LinkRank</code></a>), |
| see <a href="org/apache/nutch/scoring/webgraph/WebGraph.html" title="class in org.apache.nutch.scoring.webgraph"><code>WebGraph</code></a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/segment/package-summary.html">org.apache.nutch.segment</a></th> |
| <td class="colLast"> |
| <div class="block">A segment stores all data from one generate/fetch/update cycle: |
| fetch list, protocol status, raw content, parsed content, and extracted outgoing links.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/service/package-summary.html">org.apache.nutch.service</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/service/impl/package-summary.html">org.apache.nutch.service.impl</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/service/model/request/package-summary.html">org.apache.nutch.service.model.request</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/service/model/response/package-summary.html">org.apache.nutch.service.model.response</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/service/resources/package-summary.html">org.apache.nutch.service.resources</a></th> |
| <td class="colLast"> </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/tools/package-summary.html">org.apache.nutch.tools</a></th> |
| <td class="colLast"> |
| <div class="block">Miscellaneous tools.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/tools/arc/package-summary.html">org.apache.nutch.tools.arc</a></th> |
| <td class="colLast"> |
| <div class="block">Tools to read the |
| <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/tools/warc/package-summary.html">org.apache.nutch.tools.warc</a></th> |
| <td class="colLast"> |
| <div class="block">Tools to import / export between Nutch segments and |
| <a href="http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf"> |
| WARC archives</a>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/api/package-summary.html">org.apache.nutch.urlfilter.api</a></th> |
| <td class="colLast"> |
| <div class="block">Generic <a href="org/apache/nutch/net/URLFilter.html" title="interface in org.apache.nutch.net"><code>URL filter</code></a> library, |
| abstracting away from regular expression implementations.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/automaton/package-summary.html">org.apache.nutch.urlfilter.automaton</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin based on |
| <a href="https://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State |
| Automata for Java<sup>TM</sup>.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/domain/package-summary.html">org.apache.nutch.urlfilter.domain</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin to include only URLs which match an element in a given list of |
| domain suffixes, domain names, and/or host names.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/domaindenylist/package-summary.html">org.apache.nutch.urlfilter.domaindenylist</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin to exclude URLs by domain suffixes, domain names, and/or host names.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/fast/package-summary.html">org.apache.nutch.urlfilter.fast</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin that first does fast exact suffix matches on host/domain |
| names before applying regular expressions to the path component of a URL.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/ignoreexempt/package-summary.html">org.apache.nutch.urlfilter.ignoreexempt</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin which identifies exemptions to external URLs when |
| external URLs are set to be ignored.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/prefix/package-summary.html">org.apache.nutch.urlfilter.prefix</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin to include only URLs which match one of a given list of URL prefixes.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/regex/package-summary.html">org.apache.nutch.urlfilter.regex</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin to include and/or exclude URLs matching Java regular expressions.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/suffix/package-summary.html">org.apache.nutch.urlfilter.suffix</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin to either exclude or include only URLs which match |
| one of the given (path) suffixes.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/urlfilter/validator/package-summary.html">org.apache.nutch.urlfilter.validator</a></th> |
| <td class="colLast"> |
| <div class="block">URL filter plugin that validates given URLs.</div> |
| </td> |
| </tr> |
| <tr class="altColor"> |
| <th class="colFirst" scope="row"><a href="org/apache/nutch/util/package-summary.html">org.apache.nutch.util</a></th> |
| <td class="colLast"> |
| <div class="block">Miscellaneous utility classes.</div> |
| </td> |
| </tr> |
| <tr class="rowColor"> |
| <th class="colFirst" scope="row"><a href="org/creativecommons/nutch/package-summary.html">org.creativecommons.nutch</a></th> |
| <td class="colLast"> |
| <div class="block">Sample plugins that parse and index Creative Commons metadata.</div> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| </li> |
| </ul> |
| </div> |
| </main> |
| <footer role="contentinfo"> |
| <nav role="navigation"> |
| <!-- ======= START OF BOTTOM NAVBAR ====== --> |
| <div class="bottomNav"><a id="navbar.bottom"> |
| <!-- --> |
| </a> |
| <div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div> |
| <a id="navbar.bottom.firstrow"> |
| <!-- --> |
| </a> |
| <ul class="navList" title="Navigation"> |
| <li><a href="index.html">Overview</a></li> |
| <li>Package</li> |
| <li>Class</li> |
| <li>Use</li> |
| <li><a href="overview-tree.html">Tree</a></li> |
| <li><a href="deprecated-list.html">Deprecated</a></li> |
| <li><a href="index-all.html">Index</a></li> |
| <li><a href="help-doc.html">Help</a></li> |
| </ul> |
| </div> |
| <div class="subNav"> |
| <ul class="navList" id="allclasses_navbar_bottom"> |
| <li><a href="allclasses.html">All Classes</a></li> |
| </ul> |
| <div> |
| <script type="text/javascript"><!-- |
| // Show the "All Classes" link only when this page is the top-level window; |
| // hide it when the page is nested inside another window (window != top). |
| allClassesLink = document.getElementById("allclasses_navbar_bottom"); |
| allClassesLink.style.display = (window == top) ? "block" : "none"; |
| //--> |
| </script> |
| <noscript> |
| <div>JavaScript is disabled on your browser.</div> |
| </noscript> |
| </div> |
| <a id="skip.navbar.bottom"> |
| <!-- --> |
| </a></div> |
| <!-- ======== END OF BOTTOM NAVBAR ======= --> |
| </nav> |
| <p class="legalCopy"><small>Copyright © 2025 The Apache Software Foundation</small></p> |
| </footer> |
| </body> |
| </html> |