| <?xml version="1.0"?> |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <!-- Do not modify this file directly. Instead, copy entries that you --> |
| <!-- wish to modify from this file into nutch-site.xml and change them --> |
| <!-- there. If nutch-site.xml does not already exist, create it. --> |
| |
| <configuration> |
| |
| <!-- general properties --> |
| |
| <property> |
| <name>store.ip.address</name> |
| <value>false</value> |
<description>Enables us to capture the specific IP address
(InetSocketAddress) of the host which we connect to via
the given protocol. Currently supported by the protocol-ftp
and protocol-http plugins.
</description>
| </property> |
| |
| <!-- file properties --> |
| |
| <property> |
| <name>file.content.limit</name> |
| <value>1048576</value> |
| <description>The length limit for downloaded content using the file:// |
| protocol, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the http.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.crawl.parent</name> |
| <value>true</value> |
<description>If true, the crawler is not restricted to the directories that you
specified in the URLs file, but will also crawl the parent directories. For your
own crawls you can change this behavior (set to false) so that only directories
beneath the directories that you specify get crawled.</description>
| </property> |
| |
| <property> |
| <name>file.crawl.redirect_noncanonical</name> |
| <value>true</value> |
| <description> |
| If true, protocol-file treats non-canonical file names as |
| redirects and does not canonicalize file names internally. A file |
| name containing symbolic links as path elements is then not |
resolved and "fetched", but recorded as a redirect with the
canonical name (all links in the path are resolved) as the redirect
| target. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.content.ignored</name> |
| <value>true</value> |
<description>If true, no file content will be saved during fetch.
This is probably what we want most of the time, since file:// URLs
are meant to be local and we can always use them directly at the parsing
and indexing stages. Otherwise file contents will be saved.
!! NOT IMPLEMENTED YET !!
| </description> |
| </property> |
| |
| <!-- HTTP properties --> |
| |
| <property> |
| <name>http.agent.name</name> |
| <value></value> |
| <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
| please set this to a single word uniquely related to your organization. |
| |
| NOTE: You should also check other related properties: |
| |
| http.robots.agents |
| http.agent.description |
| http.agent.url |
| http.agent.email |
| http.agent.version |
| |
| and set their values appropriately. |
| |
| </description> |
| </property> |
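
<!-- An illustrative (not mandatory) way to set http.agent.name: copy the
     property into conf/nutch-site.xml with a value of your choice. The agent
     name "examplebot" below is a placeholder.

     <?xml version="1.0"?>
     <configuration>
       <property>
         <name>http.agent.name</name>
         <value>examplebot</value>
       </property>
     </configuration>
-->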
| |
| <property> |
| <name>http.robots.agents</name> |
| <value></value> |
<description>Any other agents, apart from 'http.agent.name', that the robots
parser should look for in robots.txt. Multiple agents can be provided using
comma as a delimiter, e.g. mybot,foo-spider,bar-crawler

The ordering of agents does NOT matter and the robots parser makes its
decision based on the first agent that matches the robots rules. Also,
there is NO need to add a wildcard (i.e. "*") to this string as the
robots parser handles a no-match situation.

If no value is specified, by default the HTTP agent (i.e. 'http.agent.name')
is used for user agent matching by the robots parser.
| </description> |
| </property> |
| |
| <property> |
| <name>http.robot.rules.whitelist</name> |
| <value></value> |
| <description>Comma separated list of hostnames or IP addresses to ignore |
| robot rules parsing for. Use with care and only if you are explicitly |
| allowed by the site owner to ignore the site's robots.txt! |
| </description> |
| </property> |
| |
| <property> |
| <name>http.robots.403.allow</name> |
| <value>true</value> |
| <description>Some servers return HTTP status 403 (Forbidden) if |
| /robots.txt doesn't exist. This should probably mean that we are |
| allowed to crawl the site nonetheless. If this is set to false, |
| then such sites will be treated as forbidden.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.description</name> |
| <value></value> |
<description>Further description of our bot - this text is used in
the User-Agent header. It appears in parentheses after the agent name.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.url</name> |
| <value></value> |
| <description>A URL to advertise in the User-Agent header. This will |
appear in parentheses after the agent name. Custom dictates that this
| should be a URL of a page explaining the purpose and behavior of this |
| crawler. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.email</name> |
| <value></value> |
| <description>An email address to advertise in the HTTP 'From' request |
| header and User-Agent header. A good practice is to mangle this |
address (e.g. 'info at example dot com') to avoid being spammed.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.version</name> |
| <value>Nutch-1.16</value> |
| <description>A version string to advertise in the User-Agent |
| header.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.rotate</name> |
| <value>false</value> |
| <description> |
| If true, instead of http.agent.name, alternating agent names are |
| chosen from a list provided via http.agent.rotate.file. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.rotate.file</name> |
| <value>agents.txt</value> |
| <description> |
| File containing alternative user agent names to be used instead of |
| http.agent.name on a rotating basis if http.agent.rotate is true. |
| Each line of the file should contain exactly one agent |
| specification including name, version, description, URL, etc. |
| </description> |
| </property> |
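
<!-- A hypothetical agents.txt for http.agent.rotate, with one complete agent
     specification per line (all names and URLs below are made up):

     examplebot/1.0 (crawler research; http://example.org/bot; bot@example.org)
     examplebot-backup/1.0 (crawler research; http://example.org/bot)
-->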
| |
| <property> |
| <name>http.agent.host.cookie.file</name> |
| <value>cookies.txt</value> |
| <description> |
| File containing per-host configured cookies. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.host</name> |
| <value></value> |
| <description>Name or IP address of the host on which the Nutch crawler |
| would be running. Currently this is used by 'protocol-httpclient' |
| plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.timeout</name> |
| <value>10000</value> |
| <description>The default network timeout, in milliseconds.</description> |
| </property> |
| |
| <property> |
| <name>http.content.limit</name> |
| <value>1048576</value> |
| <description>The length limit for downloaded content using the http/https |
| protocols, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the file.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.time.limit</name> |
| <value>-1</value> |
| <description>The time limit in seconds to fetch a single document. |
| If this value is nonnegative (>=0), the HTTP protocol implementation |
| will stop reading from a socket after http.time.limit seconds have |
| been spent for fetching this document. The HTTP response is then |
| marked as truncated. The http.time.limit should be set to a longer |
| time period than http.timeout, as it applies to the entire duration |
| to fetch a document, not only the network timeout of a single I/O |
| operation. Note: supported only by protocol-okhttp. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.partial.truncated</name> |
| <value>false</value> |
| <description> |
| If true the HTTP protocol implementation may store the content of |
| partial fetches and mark the response as truncated instead of |
throwing an exception which will cause the fetch to fail. This
makes it possible to use the data which has already been fetched,
instead of retrying the fetch later. Note: supported only by protocol-okhttp.
| </description> |
| </property> |
| |
| <property> |
| <name>http.tls.certificates.check</name> |
| <value>false</value> |
| <description> |
| Whether to check the TLS/SSL server certificates for validity. |
If true, invalid (e.g., self-signed or expired) certificates are
rejected and the https connection fails. If false, insecure
| TLS/SSL connections are allowed. Note that this property is |
| currently not supported by all http/https protocol plugins. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.host</name> |
| <value></value> |
| <description>The proxy hostname. If empty, no proxy is used.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.port</name> |
| <value></value> |
| <description>The proxy port.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.username</name> |
| <value></value> |
| <description>Username for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| NOTE: For NTLM authentication, do not prefix the username with the |
| domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.password</name> |
| <value></value> |
| <description>Password for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.realm</name> |
| <value></value> |
<description>Authentication realm for proxy. Do not define a value
if a realm is not required or authentication should take place for any
realm. NTLM does not use the notion of realms; instead, specify the domain
name of the NTLM authentication as the value for this property. To use this,
| 'protocol-httpclient' must be present in the value of |
| 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.auth.file</name> |
| <value>httpclient-auth.xml</value> |
| <description>Authentication configuration file for |
| 'protocol-httpclient' plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.type</name> |
| <value>HTTP</value> |
| <description> |
| Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type). |
| Note: supported by protocol-okhttp. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.exception.list</name> |
| <value></value> |
| <description>A comma separated list of hosts that don't use the proxy |
| (e.g. intranets). Example: www.apache.org</description> |
| </property> |
| |
| <property> |
| <name>http.redirect.max</name> |
| <value>0</value> |
| <description>The maximum number of redirects the fetcher will follow when |
trying to fetch a page. If set to negative or 0, the fetcher won't immediately
follow redirected URLs; instead, it will record them for later fetching.
| </description> |
| </property> |
| |
| <property> |
| <name>http.useHttp11</name> |
| <value>true</value> |
| <description> |
If true, use HTTP/1.1; if false, use HTTP/1.0.
| </description> |
| </property> |
| |
| <property> |
| <name>http.useHttp2</name> |
| <value>false</value> |
| <description> |
If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
supported; if false, always use HTTP/1.1.
| |
| NOTE: HTTP/2 is currently only supported by protocol-okhttp and |
| requires at runtime Java 9 or a modified Java 8 with support for |
| ALPN (Application Layer Protocol Negotiation). |
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept.language</name> |
| <value>en-us,en-gb,en;q=0.7,*;q=0.3</value> |
| <description>Value of the "Accept-Language" request header field. |
This allows selecting a non-English language as the default one to retrieve.
It is a useful setting for search engines built for a certain national group.
To send requests without an "Accept-Language" header field, this property must
be configured to contain a space character, because an empty property does
not override the default.
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept</name> |
| <value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> |
| <description>Value of the "Accept" request header field. A space character |
| as value will cause that no "Accept" header field is sent in the request. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept.charset</name> |
| <value>utf-8,iso-8859-1;q=0.7,*;q=0.7</value> |
| <description>Value of the "Accept-Charset" request header field. A space character |
| as value will cause that no "Accept-Charset" header field is sent in the request. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.store.responsetime</name> |
| <value>true</value> |
<description>Enables us to record the response time of the
host, which is the time period between opening and closing the
connection to a page's host. The response time in milliseconds
is stored in the CrawlDb in the CrawlDatum's metadata under the key "_rs_".
| </description> |
| </property> |
| |
| <property> |
| <name>http.enable.if.modified.since.header</name> |
| <value>true</value> |
| <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces |
| bandwidth when enabled by not downloading pages that respond with an HTTP |
Not-Modified header. URLs that are not downloaded are not passed through
| parse or indexing filters. If you regularly modify filters, you should force |
| Nutch to also download unmodified pages by disabling this feature. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.enable.cookie.header</name> |
| <value>true</value> |
| <description>Whether Nutch sends an HTTP Cookie header. The cookie value |
| is read from the CrawlDatum Cookie metadata field. |
| </description> |
| </property> |
| |
| <!-- FTP properties --> |
| |
| <property> |
| <name>ftp.username</name> |
| <value>anonymous</value> |
| <description>ftp login username.</description> |
| </property> |
| |
| <property> |
| <name>ftp.password</name> |
| <value>anonymous@example.com</value> |
| <description>ftp login password.</description> |
| </property> |
| |
| <property> |
| <name>ftp.content.limit</name> |
| <value>1048576</value> |
| <description>The length limit for downloaded content, in bytes. |
| If this value is nonnegative (>=0), content longer than it will be truncated; |
| otherwise, no truncation at all. |
Caution: the classical ftp RFCs never define partial transfer and, in fact,
some ftp servers out there do not handle client-side forced close-down very
| well. Our implementation tries its best to handle such situations smoothly. |
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.timeout</name> |
| <value>60000</value> |
| <description>Default timeout for ftp client socket, in millisec. |
| Please also see ftp.keep.connection below.</description> |
| </property> |
| |
| <property> |
| <name>ftp.server.timeout</name> |
| <value>100000</value> |
<description>An estimate of the ftp server idle time, in millisec.
Typically it is 120000 millisec for many ftp servers out there.
Better to be conservative here. Together with ftp.timeout, it is used to
decide if we need to delete (annihilate) the current ftp.client instance and
force the start of another ftp.client instance anew. This is necessary
because a fetcher thread may not be able to obtain the next request from the
queue in time (due to idleness) before our ftp client times out or the
remote server disconnects. Used only when ftp.keep.connection is true (please see below).
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.keep.connection</name> |
| <value>false</value> |
<description>Whether to keep the ftp connection open. Useful if crawling the same
host again and again. When set to true, it avoids connection, login and dir list
parser setup for subsequent URLs. If it is set to true, however, you must
| make sure (roughly): |
| (1) ftp.timeout is less than ftp.server.timeout |
| (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) |
| Otherwise there will be too many "delete client because idled too long" |
| messages in thread logs.</description> |
| </property> |
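
<!-- A quick sanity check of the two constraints above, using defaults from
     elsewhere in this file (fetcher.threads.fetch=10, fetcher.server.delay=5.0,
     ftp.server.timeout=100000): ftp.timeout must exceed 10 * 5.0 s = 50 s
     (50000 ms) and stay below 100000 ms, so e.g. the default ftp.timeout=60000
     satisfies both conditions when ftp.keep.connection is true.
-->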
| |
| <property> |
| <name>ftp.follow.talk</name> |
| <value>false</value> |
| <description>Whether to log dialogue between our client and remote |
| server. Useful for debugging.</description> |
| </property> |
| |
| <!-- web db properties --> |
| <property> |
| <name>db.fetch.interval.default</name> |
| <value>2592000</value> |
| <description>The default number of seconds between re-fetches of a page (30 days). |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.interval.max</name> |
| <value>7776000</value> |
| <description>The maximum number of seconds between re-fetches of a page |
| (90 days). After this period every page in the db will be re-tried, no |
matter what its status is.
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.class</name> |
| <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
| <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
| adds the original fetchInterval to the last fetch time, regardless of |
| page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt |
| to the rate at which a given page is changed. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.inc_rate</name> |
| <value>0.4</value> |
| <description>If a page is unmodified, its fetchInterval will be |
| increased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.dec_rate</name> |
| <value>0.2</value> |
| <description>If a page is modified, its fetchInterval will be |
| decreased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.min_interval</name> |
| <value>60.0</value> |
| <description>Minimum fetchInterval, in seconds.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.max_interval</name> |
| <value>31536000.0</value> |
| <description>Maximum fetchInterval, in seconds (365 days). |
| NOTE: this is limited by db.fetch.interval.max. Pages with |
| fetchInterval larger than db.fetch.interval.max |
| will be fetched anyway.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta</name> |
| <value>true</value> |
<description>If true, try to synchronize with the time of page change
by shifting the next fetchTime by a fraction (sync_delta_rate) of the
difference between the last modification time and the last fetch time.</description>
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
| <value>0.3</value> |
| <description>See sync_delta for description. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
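
<!-- A rough, simplified sketch of how AdaptiveFetchSchedule combines the
     properties above (see org.apache.nutch.crawl.AdaptiveFetchSchedule for
     the authoritative logic):

     if (modified)        interval = interval * (1.0f - dec_rate);
     else if (unmodified) interval = interval * (1.0f + inc_rate);
     if (sync_delta) {
       long delta = fetchTime - modifiedTime;
       refTime = fetchTime - Math.round(delta * sync_delta_rate);
     }
     interval = Math.max(min_interval, Math.min(interval, max_interval));
     nextFetchTime = refTime + Math.round(interval * 1000.0);
-->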
| |
| <property> |
| <name>db.fetch.schedule.mime.file</name> |
| <value>adaptive-mimetypes.txt</value> |
| <description>The configuration file for the MimeAdaptiveFetchSchedule. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.additions.allowed</name> |
| <value>true</value> |
| <description>If true, updatedb will add newly discovered URLs, if false |
| only already existing URLs in the CrawlDb will be updated and no new |
| URLs will be added. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.preserve.backup</name> |
| <value>true</value> |
| <description>If true, updatedb will keep a backup of the previous CrawlDB |
| version in the old directory. In case of disaster, one can rename old to |
| current and restore the CrawlDB to its previous state. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.purge.404</name> |
| <value>false</value> |
<description>If true, updatedb will purge records with status DB_GONE
from the CrawlDB.
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.purge.orphans</name> |
| <value>false</value> |
<description>If true, updatedb will permanently delete URLs marked
| as orphan from the CrawlDb. The plugin scoring-orphan needs to be |
| activated to get records marked as orphan. See the plugin's options |
| elsewhere in this document. |
| </description> |
| </property> |
| |
| <property> |
| <name>crawldb.url.normalizers</name> |
| <value>false</value> |
| <description> |
!Temporary, can be overridden from the command line!
Normalize URLs when updating the CrawlDb.
| </description> |
| </property> |
| |
| <property> |
| <name>crawldb.url.filters</name> |
| <value>false</value> |
| <description> |
!Temporary, can be overridden from the command line!
Filter URLs when updating the CrawlDb.
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of inlinks to take into account when updating |
| a URL score in the crawlDB. Only the best scoring inlinks are kept. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.internal.links</name> |
| <value>false</value> |
<description>If true, outlinks leading from a page to pages on the same
host or domain (internal links) will be ignored. This limits link
discovery to cross-site links, without creating complex URLFilters.
See 'db.ignore.external.links.mode'.
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.external.links</name> |
| <value>false</value> |
<description>If true, outlinks leading from a page to external hosts or domains
| will be ignored. This is an effective way to limit the crawl to include |
| only initially injected hosts or domains, without creating complex URLFilters. |
| See 'db.ignore.external.links.mode'. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.also.redirects</name> |
| <value>true</value> |
| <description>If true, the fetcher checks redirects the same way as |
| links when ignoring internal or external links. Set to false to |
| follow redirects despite the values for db.ignore.external.links and |
| db.ignore.internal.links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.external.links.mode</name> |
| <value>byHost</value> |
<description>Determines how 'internal' and 'external' are interpreted:
'byHost' (default) compares host names, the alternative value 'byDomain'
compares domains.</description>
| </property> |
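
<!-- For example, with 'byHost' a link from news.example.com to
     www.example.com is treated as external; with 'byDomain' both hosts
     belong to the domain example.com, so the link counts as internal.
-->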
| |
| <property> |
| <name>db.ignore.external.exemptions.file</name> |
| <value>db-ignore-external-exemptions.txt</value> |
| <description> |
This file contains exemption rules used by the 'urlfilter-ignoreexempt' plugin.
| </description> |
| </property> |
| |
| <property> |
| <name>db.injector.overwrite</name> |
| <value>false</value> |
| <description>Whether existing records in the CrawlDB will be overwritten |
| by injected records. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.injector.update</name> |
| <value>false</value> |
<description>If true, existing records in the CrawlDB will be updated with
injected records. Old metadata is preserved. The db.injector.overwrite
property takes precedence.
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.injected</name> |
| <value>1.0</value> |
| <description>The score of new pages added by the injector. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.external</name> |
| <value>1.0</value> |
| <description>The score factor for new pages added due to a link from |
| another host relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of external links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.internal</name> |
| <value>1.0</value> |
| <description>The score factor for pages added due to a link from the |
| same host, relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of internal links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.count.filtered</name> |
| <value>false</value> |
| <description>The score value passed to newly discovered pages is |
| calculated as a fraction of the original page score divided by the |
| number of outlinks. If this option is false, only the outlinks that passed |
URLFilters will count; if it's true, then all outlinks will count.
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.outlinks.per.page</name> |
| <value>100</value> |
| <description>The maximum number of outlinks that we'll process for a page. |
| If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
| will be processed for a page; otherwise, all outlinks will be processed. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.outlink.length</name> |
| <value>4096</value> |
| <description> |
| The maximum length in characters accepted for outlinks before |
| applying URL normalizers and filters. If this value is |
nonnegative (>=0), only URLs with a length in characters less than or
equal to db.max.outlink.length are accepted and then passed to
URL normalizers and filters. Doing the length check beforehand
prevents normalizers or filters from hanging on overlong URLs.
| Note: this property is only used to check URLs found as outlinks |
| and redirects, but not for injected URLs. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.parsemeta.to.crawldb</name> |
| <value></value> |
| <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). |
| Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' |
| will copy both the key 'lang' and its value to the corresponding entry in the crawldb. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.retry.max</name> |
| <value>3</value> |
| <description>The maximum number of times a url that has encountered |
| recoverable errors is generated for fetch.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.class</name> |
| <value>org.apache.nutch.crawl.MD5Signature</value> |
| <description>The default implementation of a page signature. Signatures |
| created with this implementation will be used for duplicate detection |
| and removal.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.min_token_len</name> |
| <value>2</value> |
| <description>Minimum token length to be included in the signature. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.quant_rate</name> |
| <value>0.01</value> |
| <description>Profile frequencies will be rounded down to a multiple of |
| QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
| frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
| for longer texts tokens with frequency 1 will always be discarded. |
| </description> |
| </property> |
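
<!-- Worked example: with quant_rate=0.01 and a document whose most frequent
     token occurs maxFreq=250 times, QUANT = (int)(0.01 * 250) = 2, so all
     token frequencies are rounded down to multiples of 2 and tokens with
     frequency 1 are discarded from the signature.
-->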
| |
| <property> |
| <name>db.stats.score.quantiles</name> |
| <value>.01,.05,.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.9,.95,.99</value> |
| <description> |
| Quantiles of the distribution of CrawlDatum scores shown in the |
| CrawlDb statistics (command `readdb -stats'). Comma-separated |
| list of floating point numbers. |
| </description> |
| </property> |
| |
| <!-- linkdb properties --> |
| |
| <property> |
| <name>linkdb.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of Inlinks per URL to be kept in LinkDb. |
| If "invertlinks" finds more inlinks than this number, only the first |
| N inlinks will be stored, and the rest will be discarded. |
| </description> |
| </property> |
| |
| <property> |
| <name>linkdb.ignore.internal.links</name> |
| <value>true</value> |
| <description>If true, when adding new links to a page, links from |
| the same host are ignored. This is an effective way to limit the |
| size of the link database, keeping only the highest quality |
| links. |
| </description> |
| </property> |
| |
| <property> |
| <name>linkdb.ignore.external.links</name> |
| <value>false</value> |
| <description>If true, when adding new links to a page, links from |
a different host are ignored.
| </description> |
| </property> |
| |
| <property> |
| <name>linkdb.max.anchor.length</name> |
| <value>100</value> |
| <description> |
| The maximum number of characters permitted for anchor texts stored |
| in LinkDb. |
| </description> |
| </property> |
| |
| <!-- generate properties --> |
| |
| <property> |
| <name>generate.max.count</name> |
| <value>-1</value> |
<description>The maximum number of URLs in a single
fetchlist. A value of -1 means unlimited. The URLs are counted according
to the value of the parameter generate.count.mode.
| </description> |
| </property> |
| |
| <property> |
| <name>generate.count.mode</name> |
| <value>host</value> |
| <description>Determines how the URLs are counted for generate.max.count. |
| Default value is 'host' but can be 'domain'. Note that we do not count |
| per IP in the new version of the Generator. |
| </description> |
| </property> |
| |
| <property> |
| <name>generate.update.crawldb</name> |
| <value>false</value> |
| <description>For highly-concurrent environments, where several |
| generate/fetch/update cycles may overlap, setting this to true ensures |
| that generate will create different fetchlists even without intervening |
updatedb runs, at the cost of running an additional job to update the CrawlDB.
| If false, running generate twice without intervening updatedb will |
| generate identical fetchlists. See also crawl.gen.delay which defines |
| how long items already generated are blocked.</description> |
| </property> |
| |
| <property> |
| <name>generate.min.score</name> |
| <value>0</value> |
| <description>Select only entries with a score larger than |
| generate.min.score.</description> |
| </property> |
| |
| <property> |
| <name>generate.min.interval</name> |
| <value>-1</value> |
| <description>Select only entries with a retry interval lower than |
| generate.min.interval. A value of -1 disables this check.</description> |
| </property> |
| |
| <property> |
| <name>generate.hostdb</name> |
| <value></value> |
| <description>Path to HostDB, required for the generate.max.count.expr |
| and generate.fetch.delay.expr properties. |
| See https://issues.apache.org/jira/browse/NUTCH-2368</description> |
| </property> |
| |
| <property> |
| <name>generate.fetch.delay.expr</name> |
| <value></value> |
| <description>Controls variable fetcher.server.delay via a Jexl expression and |
| HostDB information. It allows you to alter fetch delay based on HostDB data. |
| See https://issues.apache.org/jira/browse/NUTCH-2368</description> |
| </property> |
| |
| <property> |
| <name>generate.max.count.expr</name> |
| <value></value> |
| <description>Controls variable generate.max.count via a Jexl expression and |
| HostDB information. It allows you to alter maxCount based on HostDB data. |
| See https://issues.apache.org/jira/browse/NUTCH-2368</description> |
| </property> |
| |
| <property> |
| <name>generate.restrict.status</name> |
| <value></value> |
| <description>Select only entries of this status, see |
| https://issues.apache.org/jira/browse/NUTCH-1248</description> |
| </property> |
| |
| <!-- urlpartitioner properties --> |
| |
| <property> |
| <name>partition.url.mode</name> |
| <value>byHost</value> |
| <description>Determines how to partition URLs. Default value is 'byHost', |
| also takes 'byDomain' or 'byIP'. |
| </description> |
| </property> |
| |
| <property> |
| <name>crawl.gen.delay</name> |
| <value>604800000</value> |
| <description> |
| This value, expressed in milliseconds, defines how long we should keep the lock on records |
in the CrawlDb that were just selected for fetching. If these records are not updated
in the meantime, the lock is canceled, i.e. they become eligible for selection again.
The default value is 7 days (604800000 ms). If generate.update.crawldb is false,
the property crawl.gen.delay has no effect.
| </description> |
| </property> |
| |
| <!-- fetcher properties --> |
| |
| <property> |
| <name>fetcher.server.delay</name> |
| <value>5.0</value> |
| <description>The number of seconds the fetcher will delay between |
| successive requests to the same server. Note that this might get |
| overridden by a Crawl-Delay from a robots.txt and is used ONLY if |
| fetcher.threads.per.queue is set to 1. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.server.min.delay</name> |
| <value>0.0</value> |
| <description>The minimum number of seconds the fetcher will delay between |
| successive requests to the same server. This value is applicable ONLY |
| if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking |
| is turned off).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.crawl.delay</name> |
| <value>30</value> |
| <description> |
| If the Crawl-Delay in robots.txt is set to greater than this value (in |
| seconds) then the fetcher will skip this page, generating an error report. |
| If set to -1 the fetcher will never skip such pages and will wait the |
| amount of time retrieved from robots.txt Crawl-Delay, however long that |
| might be. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.fetch</name> |
| <value>10</value> |
| <description>The number of FetcherThreads the fetcher should use. |
This also determines the maximum number of requests that are
made at once (each FetcherThread handles one connection). The total
number of threads running in distributed mode will be the number of
fetcher threads * number of nodes, as the fetcher has one map task per node.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.per.queue</name> |
| <value>1</value> |
| <description>This number is the maximum number of threads that |
| should be allowed to access a queue at one time. Setting it to |
| a value > 1 will cause the Crawl-Delay value from robots.txt to |
| be ignored and the value of fetcher.server.min.delay to be used |
| as a delay between successive requests to the same server instead |
| of fetcher.server.delay. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.mode</name> |
| <value>byHost</value> |
| <description>Determines how to put URLs into queues. Default value |
| is 'byHost', also takes 'byDomain' or 'byIP'. Crawl delays are |
| implemented on the level of fetcher queues. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.verbose</name> |
| <value>false</value> |
| <description>If true, fetcher will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>http.log.exceptions.suppress.stack</name> |
| <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value> |
| <description>Comma-separated list of exceptions not shown with full |
| stack trace in logs of fetcher and HTTP protocol implementations. |
The logs may shrink in size significantly, e.g., when during a large
unrestricted web crawl unknown hosts are logged briefly without a full
| stack trace. The full class name of the exception class (extending |
| Throwable) including the package path must be specified.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.parse</name> |
| <value>false</value> |
| <description>If true, fetcher will parse content. Default is false, which means |
| that a separate parsing step is required after fetching is finished.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.store.content</name> |
| <value>true</value> |
| <description>If true, fetcher will store content.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.signature</name> |
| <value>false</value> |
| <description>If true, fetcher will generate the signature for |
| successfully fetched documents even if the content is not parsed by |
| fetcher (see property fetcher.parse). Default is false, which means |
| that the signature is calculated when parsing either by the fetcher |
| or during the parsing step. Note that a non-parsing fetcher can |
| only generate signatures based on the binary content and not on the |
| textual content. An appropriate signature class should be chosen |
| (see property db.signature.class). |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.timelimit.mins</name> |
| <value>-1</value> |
| <description>This is the number of minutes allocated to the fetching. |
| Once this value is reached, any remaining entry from the input URL list is skipped |
| and all active queues are emptied. The default value of -1 deactivates the time limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.exceptions.per.queue</name> |
| <value>-1</value> |
| <description>The maximum number of protocol-level exceptions (e.g. timeouts) per |
| host (or IP) queue. Once this value is reached, any remaining entries from this |
| queue are purged, effectively stopping the fetching from this host/IP. The default |
| value of -1 deactivates this limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.pages</name> |
| <value>-1</value> |
<description>The threshold of minimum pages per second. If the fetcher downloads fewer
pages per second than the configured threshold, the fetcher stops, preventing slow queues
from stalling the throughput. This threshold must be an integer. This can be useful when
| fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.retries</name> |
| <value>5</value> |
| <description>The number of times the fetcher.throughput.threshold.pages is allowed to be exceeded. |
This setting prevents accidental slowdowns from immediately killing the fetcher thread.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.check.after</name> |
| <value>5</value> |
| <description>The number of minutes after which the throughput check is enabled.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.timeout.divisor</name> |
| <value>2</value> |
<description>(EXPERT) The thread time-out divisor to use. By default threads have a time-out
value of mapreduce.task.timeout / 2. Increase this setting if the fetcher waits too
long before killing hung threads. Be careful, a too high setting (8 or more) will most likely kill the
fetcher threads prematurely.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.depth.multiplier</name> |
| <value>50</value> |
<description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
(see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
| is not optimal. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.depth</name> |
| <value>-1</value> |
<description>(EXPERT) When fetcher.parse is true and this value is greater than 0 the fetcher will extract outlinks
and follow them until the desired depth is reached. A value of 1 means all generated pages are fetched and their first degree
outlinks are fetched and parsed too. Be careful, this feature is in itself agnostic of the state of the CrawlDB and does not
know about already fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle.
It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same
domain. When disabled (false) the feature is likely to follow duplicates even when depth=1.
A value of -1 or 0 disables this feature.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.num.links</name> |
| <value>4</value> |
<description>(EXPERT) The number of outlinks to follow when fetcher.follow.outlinks.depth is enabled. Be careful, this can multiply
the total number of pages to fetch. This works with fetcher.follow.outlinks.depth.divisor; with default settings the number of
followed outlinks at depth 1 is 8, not 4.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.depth.divisor</name> |
| <value>2</value> |
<description>(EXPERT) The divisor of fetcher.follow.outlinks.num.links per fetcher.follow.outlinks.depth. This decreases the number
| of outlinks to follow by increasing depth. The formula used is: outlinks = floor(divisor / depth * num.links). This prevents |
| exponential growth of the fetch list. |
| </description> |
| </property> |
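
<!-- Worked example with the default values num.links=4 and divisor=2:
     depth 1: floor(2 / 1 * 4) = 8 outlinks followed
     depth 2: floor(2 / 2 * 4) = 4
     depth 3: floor(2 / 3 * 4) = 2
     depth 4: floor(2 / 4 * 4) = 2
     The number of followed outlinks thus shrinks as the depth increases.
-->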
| |
| <property> |
| <name>fetcher.follow.outlinks.ignore.external</name> |
| <value>true</value> |
| <description>Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks |
| in the output but not follow them. If db.ignore.external.links is true this directive is ignored. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.bandwidth.target</name> |
| <value>-1</value> |
| <description>Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of |
| fetching threads automatically (up to fetcher.maxNum.threads). A value of -1 deactivates the functionality, in which case |
| the number of fetching threads is fixed (see fetcher.threads.fetch).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.maxNum.threads</name> |
| <value>25</value> |
| <description>Max number of fetch threads allowed when using fetcher.bandwidth.target. Defaults to fetcher.threads.fetch if unspecified or |
| set to a value lower than it. </description> |
| </property> |
| |
| <property> |
| <name>fetcher.bandwidth.target.check.everyNSecs</name> |
| <value>30</value> |
| <description>(EXPERT) Value in seconds which determines how frequently we should reassess the optimal number of fetch threads when using |
| fetcher.bandwidth.target. Defaults to 30 and must be at least 1.</description> |
| </property> |
| |
<property>
<name>fetcher.store.robotstxt</name>
| <value>false</value> |
| <description>If true (and fetcher.store.content is also true), |
| fetcher will store the robots.txt response content and status for |
| debugging or archival purposes. The robots.txt is added to the |
| content/ folder of the fetched segment. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.publisher</name> |
| <value>false</value> |
<description>Set this value to true if you want to use an implementation of the Publisher/Subscriber model. Make sure to set the corresponding
Publisher implementation-specific properties.</description>
| </property> |
| |
| <property> |
| <name>fetcher.filter.urls</name> |
| <value>false</value> |
| <description>Whether fetcher will filter URLs (with the configured URL filters).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.normalize.urls</name> |
| <value>false</value> |
| <description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description> |
| </property> |
| |
| <!-- any23 plugin properties --> |
| |
| <property> |
| <name>any23.extractors</name> |
| <value>html-microdata</value> |
| <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description> |
| </property> |
| |
| <property> |
| <name>any23.content_types</name> |
| <value>text/html,application/xhtml+xml</value> |
| <description>Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description> |
| </property> |
| |
| <!-- moreindexingfilter plugin properties --> |
| |
| <property> |
| <name>moreIndexingFilter.indexMimeTypeParts</name> |
| <value>true</value> |
<description>Determines whether the index-more plugin will split the mime-type
into sub parts; this requires the type field to be multi-valued. Set to true for backward
compatibility. False will not split the mime-type.
| </description> |
| </property> |
| |
| <property> |
| <name>moreIndexingFilter.mapMimeTypes</name> |
| <value>false</value> |
| <description>Determines whether MIME-type mapping is enabled. It takes a |
| plain text file with mapped MIME-types. With it the user can map both |
| application/xhtml+xml and text/html to the same target MIME-type so it |
| can be treated equally in an index. See conf/contenttype-mapping.txt. |
| </description> |
| </property> |
| |
| <property> |
| <name>moreIndexingFilter.mapMimeTypes.field</name> |
| <value></value> |
<description>Used only if moreIndexingFilter.mapMimeTypes is true. Indicates the field
| where the mapped MIME-type must be written. If it's empty or unset, the content of the field "type" |
| will be replaced by the mapped MIME-type. |
| </description> |
| </property> |
| |
| <!-- AnchorIndexing filter plugin properties --> |
| |
| <property> |
| <name>anchorIndexingFilter.deduplicate</name> |
| <value>false</value> |
<description>With this enabled the indexer will deduplicate anchors case-insensitively
before indexing. This prevents possibly hundreds or thousands of identical anchors for
a given page from being indexed, but will affect the search scoring (i.e. tf=1.0f).
| </description> |
| </property> |
| |
| <!-- indexingfilter plugin properties --> |
| |
| <property> |
| <name>indexingfilter.order</name> |
| <value></value> |
| <description>The order by which index filters are applied. |
| If empty, all available index filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
| then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
| |
| Filter ordering might have impact on result if one filter depends on output of |
| another filter. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.score.power</name> |
| <value>0.5</value> |
<description>Determines the power of link analysis scores. Each
page's boost is set to <i>score<sup>scorePower</sup></i> where
| <i>score</i> is its link analysis score and <i>scorePower</i> is the |
| value of this parameter. This is compiled into indexes, so, when |
| this is changed, pages must be re-indexed for it to take |
| effect.</description> |
| </property> |
| |
| <property> |
| <name>indexer.max.title.length</name> |
| <value>100</value> |
| <description>The maximum number of characters of a title that are indexed. A value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.max.content.length</name> |
| <value>-1</value> |
<description>The maximum number of characters of the content that are indexed.
| Content beyond the limit is truncated. A value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.add.domain</name> |
| <value>false</value> |
| <description>Whether to add the domain field to a NutchDocument.</description> |
| </property> |
| |
| <property> |
| <name>indexer.skip.notmodified</name> |
| <value>false</value> |
| <description>Whether the indexer will skip records with a db_notmodified status. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.delete.robots.noindex</name> |
| <value>false</value> |
<description>Whether the indexer will delete documents marked by robots=noindex.
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.delete.skipped.by.indexingfilter</name> |
| <value>false</value> |
<description>Whether the indexer will delete documents that were skipped by indexing filters.
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.indexwriters.file</name> |
| <value>index-writers.xml</value> |
| <description>The configuration file for index writers.</description> |
| </property> |
| |
| <!-- Exchanges properties --> |
| |
| <property> |
| <name>exchanges.exchanges.file</name> |
| <value>exchanges.xml</value> |
| <description>The configuration file used by the Exchange component.</description> |
| </property> |
| |
| <!-- URL normalizer properties --> |
| |
| <property> |
| <name>urlnormalizer.order</name> |
| <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
| <description>Order in which normalizers will run. If any of these isn't |
| activated it will be silently skipped. If other normalizers not on the |
| list are activated, they will run in random order after the ones |
| specified here are run. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.regex.file</name> |
| <value>regex-normalize.xml</value> |
<description>Name of the config file used by the RegexURLNormalizer class.
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.loop.count</name> |
| <value>1</value> |
| <description>Optionally loop through normalizers several times, to make |
| sure that all transformations have been performed. |
| </description> |
| </property> |
| |
| <!-- mime properties --> |
| |
| <!-- |
| <property> |
| <name>mime.types.file</name> |
| <value>tika-mimetypes.xml</value> |
| <description>Name of file in CLASSPATH containing filename extension and |
| magic sequence to mime types mapping information. Overrides the default Tika config |
| if specified. |
| </description> |
| </property> |
| --> |
| |
| <property> |
| <name>mime.type.magic</name> |
| <value>true</value> |
| <description>Defines if the mime content type detector uses magic resolution. |
| </description> |
| </property> |
| |
| <!-- plugin properties --> |
| |
| <property> |
| <name>plugin.folders</name> |
| <value>plugins</value> |
| <description>Directories where nutch plugins are located. Each |
| element may be a relative or absolute path. If absolute, it is used |
| as is. If relative, it is searched for on the classpath.</description> |
| </property> |
| |
| <property> |
| <name>plugin.auto-activation</name> |
| <value>true</value> |
<description>Defines whether plugins that are not activated via the
plugin.includes and plugin.excludes properties are automatically
activated if they are needed by some active plugins.
| </description> |
| </property> |
| |
| <property> |
| <name>plugin.includes</name> |
| <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value> |
| <description>Regular expression naming plugin directory names to |
| include. Any plugin not matching this expression is excluded. |
| By default Nutch includes plugins to crawl HTML and various other |
document formats via HTTP/HTTPS and to index the crawled content
| into Solr. More plugins are available to support more indexing |
| backends, to fetch ftp:// and file:// URLs, for focused crawling, |
| and many other use cases. |
| </description> |
| </property> |
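
<!-- An illustrative nutch-site.xml override that extends the default value to
     also fetch ftp:// and file:// URLs by adding the protocol-ftp and
     protocol-file plugins to the regular expression:

     <property>
       <name>plugin.includes</name>
       <value>protocol-(http|ftp|file)|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
     </property>
-->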
| |
| <property> |
| <name>plugin.excludes</name> |
| <value></value> |
| <description>Regular expression naming plugin directory names to exclude. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlmeta.tags</name> |
| <value></value> |
| <description> |
To be used in conjunction with features introduced in NUTCH-655, which allow
for custom metatags to be injected alongside your crawl URLs. Specifying those
custom tags here will allow for their propagation into a page's outlinks, as
well as allow for them to be included as part of an index.
Values should be comma-delimited ("tag1,tag2,tag3"). Do not pad the tags with
whitespace at their boundaries if you are using anything earlier than Hadoop-0.21.
| </description> |
| </property> |
| |
| <!-- parser properties --> |
| |
| <property> |
| <name>parse.plugin.file</name> |
| <value>parse-plugins.xml</value> |
| <description>The name of the file that defines the associations between |
| content-types and parsers.</description> |
| </property> |
| |
| <property> |
| <name>parser.character.encoding.default</name> |
| <value>windows-1252</value> |
<description>The character encoding to fall back to when no other information
is available.</description>
| </property> |
| |
| <property> |
| <name>encodingdetector.charset.min.confidence</name> |
| <value>-1</value> |
<description>An integer between 0 and 100 indicating the minimum confidence value
| for charset auto-detection. Any negative value disables auto-detection. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.caching.forbidden.policy</name> |
| <value>content</value> |
| <description>If a site (or a page) requests through its robot metatags |
| that it should not be shown as cached content, apply this policy. Currently |
| three keywords are recognized: "none" ignores any "noarchive" directives. |
| "content" doesn't show the content, but shows summaries (snippets). |
| "all" doesn't show either content or summaries.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.impl</name> |
| <value>neko</value> |
| <description>HTML Parser implementation. Currently the following keywords |
| are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.html.form.use_action</name> |
| <value>false</value> |
| <description>If true, HTML parser will collect URLs from form action |
| attributes. This may lead to undesirable behavior (submitting empty |
| forms during next fetch cycle). If false, form action attribute will |
| be ignored.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.outlinks.ignore_tags</name> |
| <value></value> |
<description>Comma separated list of HTML tags from which outlinks
shouldn't be extracted. Nutch takes links from: a, area, form, frame,
iframe, script, link, img. If you add any of those tags here, links
in it won't be extracted. The default is an empty list. A reasonable
value for most people would be "img,script,link".</description>
| </property> |
| |
| <property> |
| <name>parser.html.outlinks.htmlnode_metadata_name</name> |
| <value></value> |
<description>If not empty, the node name of the source HTML element of a found
outlink will be stored in the outlink's metadata under this name.</description>
| </property> |
| |
| <property> |
| <name>parser.html.line.separators</name> |
| <value>article,aside,blockquote,canvas,dd,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,li,main,nav,noscript,ol,output,p,pre,section,table,tfoot,ul,video</value> |
| <description>Comma separated list of HTML tags. A newline will be added to the |
| parsed text after these tags. |
| The default list above consists of the block-level HTML elements. |
| Tags must be in lower case. |
| To disable this feature, leave the list empty.</description> |
| </property> |
| |
| <property> |
| <name>htmlparsefilter.order</name> |
| <value></value> |
| <description>The order by which HTMLParse filters are applied. |
| If empty, all available HTMLParse filters (as dictated by the properties |
| plugin.includes and plugin.excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. |
| HTMLParse filter ordering MAY have an impact |
| on end result, as some filters could rely on the metadata generated by a previous filter. |
| </description> |
| </property> |
| |
| <property> |
| <name>parsefilter.naivebayes.trainfile</name> |
| <value>naivebayes-train.txt</value> |
| <description>Set the name of the file to be used for Naive Bayes training. The format is: |
| Each line contains two tab-separated parts: |
| 1. "1" or "0": "1" for relevant and "0" for irrelevant documents. |
| 2. Text (the text that will be used for training). |
| |
| Each row will be considered a new "document" for the classifier. |
| CAUTION: Set parser.timeout to -1 or a value bigger than 30 when using this classifier. |
| </description> |
| </property> |
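| |
| <!-- Illustrative example of naivebayes-train.txt contents (made-up lines; |
| the label and the text are separated by a tab): |
| 1    reports and match results from the football league |
| 0    a page about cooking recipes, unrelated to the crawl topic |
| --> |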
| |
| <property> |
| <name>parsefilter.naivebayes.wordlist</name> |
| <value>naivebayes-wordlist.txt</value> |
| <description>Name of the file to be used as a list of |
| important words to be matched in the URL by the model filter. The format is one word per line. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.timeout</name> |
| <value>30</value> |
| <description>Timeout in seconds for the parsing of a document. If the timeout is |
| exceeded, parsing is treated as an exception and the parser moves on to the |
| following document. This parameter is applied to any Parser implementation. |
| Set to -1 to deactivate, bearing in mind that this could cause |
| the parsing to crash because of a very long or corrupted document. |
| </description> |
| </property> |
| |
| <property> |
| <name>parse.filter.urls</name> |
| <value>true</value> |
| <description>Whether the parser will filter URLs (with the configured URL filters).</description> |
| </property> |
| |
| <property> |
| <name>parse.normalize.urls</name> |
| <value>true</value> |
| <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description> |
| </property> |
| |
| <property> |
| <name>parser.skip.truncated</name> |
| <value>true</value> |
| <description>Boolean value for whether we should skip parsing for truncated documents. By default this |
| property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.store.text</name> |
| <value>true</value> |
| <description>If true (default value), parser will store parse text (parse_text directory within the segment).</description> |
| </property> |
| |
| |
| <!-- |
| <property> |
| <name>tika.htmlmapper.classname</name> |
| <value>org.apache.tika.parser.html.IdentityHtmlMapper</value> |
| <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence |
| the behavior of the HTMLParseFilters. |
| </description> |
| </property> |
| --> |
| |
| <property> |
| <name>tika.config.file</name> |
| <value>tika-config.xml</value> |
| <description>Nutch-specific Tika config file</description> |
| </property> |
| |
| <property> |
| <name>tika.uppercase.element.names</name> |
| <value>true</value> |
| <description>Determines whether TikaParser should uppercase the element name while generating the DOM |
| for a page, as done by NekoHTML (used by default by parse-html); see NUTCH-1592. |
| </description> |
| </property> |
| |
| <property> |
| <name>tika.extractor</name> |
| <value>none</value> |
| <description> |
| Which text extraction algorithm to use. Valid values are: boilerpipe or none. |
| </description> |
| </property> |
| |
| <property> |
| <name>tika.extractor.boilerpipe.algorithm</name> |
| <value>ArticleExtractor</value> |
| <description> |
| Which Boilerpipe algorithm to use. Valid values are: DefaultExtractor, ArticleExtractor |
| or CanolaExtractor. |
| </description> |
| </property> |
| |
| <property> |
| <name>tika.extractor.boilerpipe.mime.types</name> |
| <value>text/html,application/xhtml+xml</value> |
| <description> |
| Comma-separated list of MIME types accepted for Boilerpipe extraction, |
| documents of other MIME types are not passed to the Boilerpipe extractor. |
| </description> |
| </property> |
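| |
| <!-- Illustrative example: to enable Boilerpipe main-content extraction for |
| HTML pages, override these properties in nutch-site.xml (ArticleExtractor is |
| one possible algorithm choice): |
| |
| <property> |
| <name>tika.extractor</name> |
| <value>boilerpipe</value> |
| </property> |
| |
| <property> |
| <name>tika.extractor.boilerpipe.algorithm</name> |
| <value>ArticleExtractor</value> |
| </property> |
| --> |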
| |
| <property> |
| <name>tika.parse.embedded</name> |
| <value>true</value> |
| <description> |
| Whether parse-tika shall parse embedded documents (even recursively). |
| </description> |
| </property> |
| |
| <!-- urlfilter plugin properties --> |
| |
| <property> |
| <name>urlfilter.domain.file</name> |
| <value>domain-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing either top level domains or |
| hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.regex.file</name> |
| <value>regex-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-regex (RegexURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.automaton.file</name> |
| <value>automaton-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.prefix.file</name> |
| <value>prefix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing URL prefixes |
| used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.suffix.file</name> |
| <value>suffix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing URL suffixes |
| used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.fast.file</name> |
| <value>fast-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-fast (FastURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.order</name> |
| <value></value> |
| <description>The order by which url filters are applied. |
| If empty, all available url filters (as dictated by the properties |
| plugin.includes and plugin.excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
| then RegexURLFilter is applied first, and PrefixURLFilter second. |
| Since all filters are AND'ed, filter ordering does not have an impact |
| on the end result, but it may have performance implications, depending |
| on the relative expensiveness of the filters. |
| </description> |
| </property> |
| |
| <!-- scoring filters properties --> |
| |
| <property> |
| <name>scoring.filter.order</name> |
| <value></value> |
| <description>The order in which scoring filters are applied. This |
| may be left empty (in which case all available scoring filters will |
| be applied in system defined order), or a space separated list of |
| implementation classes. |
| </description> |
| </property> |
| |
| <!-- scoring-depth properties |
| Add 'scoring-depth' to the list of active plugins |
| in the parameter 'plugin.includes' in order to use it. |
| --> |
| |
| <property> |
| <name>scoring.depth.max</name> |
| <value>1000</value> |
| <description>Max depth value from seed allowed by default. |
| Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE" |
| as seed metadata. This plugin adds a "_depth_" metadatum to each page |
| to track its distance from the seed it was found from. |
| The depth is used to prioritise URLs in the generation step so that |
| shallower pages are fetched first. |
| </description> |
| </property> |
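| |
| <!-- Illustrative example of a seed file entry limiting the crawl depth for a |
| single seed (the URL and the metadata are separated by a tab): |
| http://www.example.com/    _maxdepth_=3 |
| --> |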
| |
| <!-- scoring similarity properties |
| Add scoring-similarity to the list of active plugins |
| in the parameter 'plugin.includes' in order to use it. |
| For more detailed information on the working of this filter |
| visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> |
| |
| <property> |
| <name>scoring.similarity.model</name> |
| <value>cosine</value> |
| <description>The type of similarity metric to use, e.g. cosine (which is, currently, the only available model). |
| Please make sure to set the model-specific properties for the scoring to function properly. |
| A description of these properties can be found on the wiki. |
| </description> |
| </property> |
| |
| <property> |
| <name>scoring.similarity.ngrams</name> |
| <value>1,1</value> |
| <description>Specifies the min 'n' and max 'n' of the ngrams as a comma-separated pair. |
| If a single value 'n' is specified, it will be used for both the min 'n' and max 'n' of the ngrams. |
| </description> |
| </property> |
| |
| <property> |
| <name>cosine.goldstandard.file</name> |
| <value>goldstandard.txt</value> |
| <description>Path to the gold standard file which contains all the relevant text and terms, |
| pertaining to the domain. |
| </description> |
| </property> |
| |
| <property> |
| <name>scoring.similarity.stopword.file</name> |
| <value>stopwords.txt</value> |
| <description>Name of the stopword text file. The user can specify a custom list of stop words |
| in a text file. Each new stopword should be on a new line. |
| </description> |
| </property> |
| |
| <!-- scoring filter orphan properties --> |
| |
| <property> |
| <name>scoring.orphan.mark.gone.after</name> |
| <value>2592000</value> |
| <description>Time in seconds after which orphaned |
| pages are marked as gone. Default is 30 days. |
| </description> |
| </property> |
| |
| <property> |
| <name>scoring.orphan.mark.orphan.after</name> |
| <value>3456000</value> |
| <description>Time in seconds after which pages |
| are marked as orphan. Default is 40 days. |
| </description> |
| </property> |
| |
| <!-- language-identifier plugin properties --> |
| |
| <property> |
| <name>lang.analyze.max.length</name> |
| <value>2048</value> |
| <description> The maximum number of bytes used to identify |
| the language (0 means full content analysis). |
| The larger this value, the better the analysis, but the |
| slower it is. |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.extraction.policy</name> |
| <value>detect,identify</value> |
| <description>This determines when the plugin uses detection and |
| statistical identification mechanisms. The order in which the |
| detect and identify are written will determine the extraction |
| policy. Default case (detect,identify) means the plugin will |
| first try to extract language info from page headers and metadata, |
| if this is not successful it will try using Tika language |
| identification. Possible values are: |
| detect |
| identify |
| detect,identify |
| identify,detect |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.identification.only.certain</name> |
| <value>false</value> |
| <description>If set to true with lang.extraction.policy containing identify, |
| the language code returned by Tika will be assigned to the document ONLY |
| if it is deemed certain by Tika. |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.index.languages</name> |
| <value></value> |
| <description>If not empty, should be a comma separated list of language codes. |
| Only documents with one of these language codes will be indexed. |
| "unknown" is a valid language code, will match documents where language |
| detection failed. |
| </description> |
| </property> |
| |
| <!-- index-jexl-filter plugin properties --> |
| |
| <property> |
| <name>index.jexl.filter</name> |
| <value></value> |
| <description> A JEXL expression. If it evaluates to false, |
| the document will not be indexed. |
| Available primitives in the JEXL context: |
| * status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title |
| Available objects in the JEXL context: |
| * httpStatus - contains majorCode, minorCode, message |
| * documentMeta, contentMeta, parseMeta - contain all the Metadata properties. |
| each property value is always an array of Strings (so if you expect one value, use [0]) |
| * doc - contains all the NutchFields from the NutchDocument. |
| each property value is always an array of Objects. |
| </description> |
| </property> |
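| |
| <!-- Illustrative example (a hypothetical expression over the context |
| described above): only index documents that have both a title and |
| extracted text: |
| |
| <property> |
| <name>index.jexl.filter</name> |
| <value>title != null and text != null</value> |
| </property> |
| --> |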
| |
| <!-- index-static plugin properties --> |
| |
| <property> |
| <name>index.static</name> |
| <value></value> |
| <description> |
| Used by plugin index-static to add fields with static data at indexing time. |
| You can specify a comma-separated list of fieldname:fieldcontent per Nutch job. |
| Each fieldcontent can have multiple values separated by space, e.g., |
| field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ... |
| This can be useful when collections can't be defined by URL patterns, |
| as in subcollection, but need to be set on a per-job basis. |
| </description> |
| </property> |
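| |
| <!-- Illustrative example: add a static "collection" field with the two |
| values "press" and "archive" and a static "source" field to every document |
| indexed by the job (field names and values are made up): |
| |
| <property> |
| <name>index.static</name> |
| <value>collection:press archive,source:web</value> |
| </property> |
| --> |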
| |
| <property> |
| <name>index.static.fieldsep</name> |
| <value>,</value> |
| <description> |
| Used by plugin index-static to parse the property index.static. Default: comma. |
| This delimiter is used to separate individual field specifications in the property. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.static.keysep</name> |
| <value>:</value> |
| <description> |
| Used by plugin index-static to parse the property index.static. Default: colon. |
| This delimiter is used to separate the field name from the field value in the field specification. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.static.valuesep</name> |
| <value> </value> |
| <description> |
| Used by plugin index-static to parse the property index.static. Default: space. |
| This delimiter is used to separate multiple field values in the value setting of the field specification. |
| </description> |
| </property> |
| |
| |
| <!-- index-metadata plugin properties --> |
| |
| <property> |
| <name>index.parse.md</name> |
| <value>metatag.description,metatag.keywords</value> |
| <description> |
| Comma-separated list of keys to be taken from the parse metadata to generate fields. |
| Can be used e.g. for 'description' or 'keywords', provided that these values are generated |
| by a parser (see the parse-metatags plugin). |
| </description> |
| </property> |
| |
| <property> |
| <name>index.content.md</name> |
| <value></value> |
| <description> |
| Comma-separated list of keys to be taken from the content metadata to generate fields. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.db.md</name> |
| <value></value> |
| <description> |
| Comma-separated list of keys to be taken from the crawldb metadata to generate fields. |
| Can be used to index values propagated from the seeds with the plugin urlmeta. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.metadata.separator</name> |
| <value></value> |
| <description> |
| Separator to use if you want to index multiple values for a given field. Leave empty to |
| treat each value as a single value. |
| </description> |
| </property> |
| |
| <!-- index-geoip plugin properties --> |
| <property> |
| <name>index.geoip.usage</name> |
| <value>insightsService</value> |
| <description> |
| A string representing the information source to be used for GeoIP information |
| association. Either enter 'cityDatabase', 'connectionTypeDatabase', |
| 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the |
| database options, you should make the corresponding file (GeoIP2-City.mmdb, |
| GeoIP2-Connection-Type.mmdb, GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb) |
| available on the classpath at runtime. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.geoip.userid</name> |
| <value></value> |
| <description> |
| The userId associated with the GeoIP2 Precision Services account. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.geoip.licensekey</name> |
| <value></value> |
| <description> |
| The license key associated with the GeoIP2 Precision Services account. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.replace.regexp</name> |
| <value/> |
| <description>Allows indexing-time regexp replace manipulation of metadata fields. |
| The format of the property is a list of regexp replacements, one line per field being |
| modified. Include index-replace in your plugin.includes. |
| |
| Example: |
| hostmatch=.*somedomain.com |
| fldname1=/regexp/replacement/flags |
| fldname2=/regexp/replacement/flags |
| |
| Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure. |
| See https://wiki.apache.org/nutch/IndexReplace for further details. |
| </description> |
| </property> |
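| |
| <!-- Illustrative sketch of the format described above (exact flag semantics |
| are documented on the IndexReplace wiki page): collapse runs of whitespace |
| in the title field of pages from a hypothetical domain: |
| |
| <property> |
| <name>index.replace.regexp</name> |
| <value> |
| hostmatch=.*example.com |
| title=/\s+/ / |
| </value> |
| </property> |
| --> |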
| |
| <!-- parse-metatags plugin properties --> |
| <property> |
| <name>metatags.names</name> |
| <value>description,keywords</value> |
| <description> Names of the metatags to extract, separated by ','. |
| Use '*' to extract all metatags. The names are prefixed with 'metatag.' |
| in the parse metadata. For instance, to index description and keywords, |
| you need to activate the plugin index-metadata and set the value of the |
| parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. |
| </description> |
| </property> |
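| |
| <!-- Illustrative example: to extract and index an additional "author" |
| metatag, override both properties in nutch-site.xml: |
| |
| <property> |
| <name>metatags.names</name> |
| <value>description,keywords,author</value> |
| </property> |
| |
| <property> |
| <name>index.parse.md</name> |
| <value>metatag.description,metatag.keywords,metatag.author</value> |
| </property> |
| --> |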
| |
| <!-- Temporary Hadoop 0.17.x workaround. --> |
| |
| <property> |
| <name>hadoop.job.history.user.location</name> |
| <value>${hadoop.log.dir}/history/user</value> |
| <description>Hadoop 0.17.x comes with a default setting to create |
| user logs inside the output path of the job. This breaks some |
| Hadoop classes, which expect the output to contain only |
| part-XXXXX files. This setting changes the output to a |
| subdirectory of the regular log directory. |
| </description> |
| </property> |
| |
| <property> |
| <name>io.serializations</name> |
| <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value> |
| <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, |
| org.apache.hadoop.io.serializer.avro.AvroReflectSerialization, |
| org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, --> |
| <description>A list of serialization classes that can be used for |
| obtaining serializers and deserializers.</description> |
| </property> |
| |
| <!-- linkrank scoring properties --> |
| |
| <property> |
| <name>link.ignore.internal.host</name> |
| <value>true</value> |
| <description>Ignore outlinks to the same hostname.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.internal.domain</name> |
| <value>true</value> |
| <description>Ignore outlinks to the same domain.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.limit.page</name> |
| <value>true</value> |
| <description>Limit to only a single outlink to the same page.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.limit.domain</name> |
| <value>true</value> |
| <description>Limit to only a single outlink to the same domain.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.num.iterations</name> |
| <value>10</value> |
| <description>The number of LinkRank iterations to run.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.initial.score</name> |
| <value>1.0f</value> |
| <description>The initial score.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.damping.factor</name> |
| <value>0.85f</value> |
| <description>The damping factor.</description> |
| </property> |
| |
| <property> |
| <name>link.delete.gone</name> |
| <value>false</value> |
| <description>Whether to delete gone pages from the web graph.</description> |
| </property> |
| |
| <property> |
| <name>link.loops.depth</name> |
| <value>2</value> |
| <description>The depth for the loops algorithm.</description> |
| </property> |
| |
| <property> |
| <name>link.score.updater.clear.score</name> |
| <value>0.0f</value> |
| <description>The default score for URLs that are not in the web graph.</description> |
| </property> |
| |
| <property> |
| <name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name> |
| <value>false</value> |
| <description>Hadoop >= 0.21 generates _SUCCESS files in the output, which can crash |
| the readers. This should not be an issue once Nutch is ported to the new MapReduce API, |
| but for now this parameter should prevent such cases. |
| </description> |
| </property> |
| |
| <!-- subcollection properties --> |
| |
| <property> |
| <name>subcollection.default.fieldname</name> |
| <value>subcollection</value> |
| <description> |
| The default field name for the subcollections. |
| </description> |
| </property> |
| |
| <property> |
| <name>subcollection.case.insensitive</name> |
| <value>false</value> |
| <description> |
| Whether the URL prefixes are to be matched case-insensitively. |
| </description> |
| </property> |
| |
| <!-- Headings plugin properties --> |
| |
| <property> |
| <name>headings</name> |
| <value>h1,h2</value> |
| <description>Comma separated list of headings to retrieve from the document</description> |
| </property> |
| |
| <property> |
| <name>headings.multivalued</name> |
| <value>false</value> |
| <description>Whether to support multivalued headings.</description> |
| </property> |
| |
| <!-- mimetype-filter plugin properties --> |
| |
| <property> |
| <name>mimetype.filter.file</name> |
| <value>mimetype-filter.txt</value> |
| <description> |
| The configuration file for the mimetype-filter plugin. This file contains |
| the rules used to allow or deny the indexing of certain documents. |
| </description> |
| </property> |
| |
| <!-- plugin properties that applies to lib-selenium, protocol-selenium, |
| protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit --> |
| |
| <property> |
| <name>page.load.delay</name> |
| <value>3</value> |
| <description> |
| The delay in seconds to use when loading a page with htmlunit or selenium. |
| </description> |
| </property> |
| |
| <property> |
| <name>take.screenshot</name> |
| <value>false</value> |
| <description> |
| Boolean property determining whether the protocol-htmlunit |
| WebDriver should capture a screenshot of the URL. If set to |
| true, remember to define the 'screenshot.location' |
| property, as this determines the location on HDFS where |
| screenshots should be persisted. If that property is not set, |
| screenshots are simply discarded. |
| </description> |
| </property> |
| |
| <property> |
| <name>screenshot.location</name> |
| <value></value> |
| <description> |
| The location on disk where a URL screenshot should be saved |
| if the 'take.screenshot' property is set to true. |
| By default this is empty; in this case screenshots held in memory |
| are simply discarded. |
| </description> |
| </property> |
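| |
| <!-- Illustrative example: enable screenshots and persist them to a |
| hypothetical HDFS directory: |
| |
| <property> |
| <name>take.screenshot</name> |
| <value>true</value> |
| </property> |
| |
| <property> |
| <name>screenshot.location</name> |
| <value>/user/nutch/screenshots</value> |
| </property> |
| --> |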
| |
| <!-- lib-htmlunit plugin properties; applies to protocol-htmlunit --> |
| |
| <property> |
| <name>htmlunit.enable.javascript</name> |
| <value>true</value> |
| <description> |
| A Boolean value representing whether JavaScript should |
| be enabled or disabled when using htmlunit. The default value is enabled. |
| </description> |
| </property> |
| |
| <property> |
| <name>htmlunit.javascript.timeout</name> |
| <value>3500</value> |
| <description> |
| The timeout in milliseconds when loading JavaScript with lib-htmlunit. This |
| setting is used by protocol-htmlunit since it depends on |
| lib-htmlunit for fetching. |
| </description> |
| </property> |
| |
| <property> |
| <name>htmlunit.enable.css</name> |
| <value>false</value> |
| <description> |
| A Boolean value representing whether CSS should |
| be enabled or disabled when using htmlunit. The default value is disabled. |
| </description> |
| </property> |
| |
| <!-- protocol-selenium plugin properties --> |
| |
| <property> |
| <name>selenium.driver</name> |
| <value>firefox</value> |
| <description> |
| A String value representing the flavour of Selenium |
| WebDriver() to use. Currently the following options |
| exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'. |
| If 'remote' is used it is essential to also set correct properties for |
| 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host', |
| 'selenium.hub.protocol', 'selenium.grid.driver', 'selenium.grid.binary' |
| and 'selenium.enable.headless'. |
| </description> |
| </property> |
| |
| <property> |
| <name>selenium.hub.port</name> |
| <value>4444</value> |
| <description>Selenium Hub Location connection port</description> |
| </property> |
| |
| <property> |
| <name>selenium.hub.path</name> |
| <value>/wd/hub</value> |
| <description>Selenium Hub Location connection path</description> |
| </property> |
| |
| <property> |
| <name>selenium.hub.host</name> |
| <value>localhost</value> |
| <description>Selenium Hub Location connection host</description> |
| </property> |
| |
| <property> |
| <name>selenium.hub.protocol</name> |
| <value>http</value> |
| <description>Selenium Hub Location connection protocol</description> |
| </property> |
| |
| <property> |
| <name>selenium.grid.driver</name> |
| <value>firefox</value> |
| <description>A String value representing the flavour of Selenium |
| WebDriver() used on the Selenium grid. 'selenium.driver' must be set |
| to 'remote' first. Currently the following options exist: 'firefox', |
| 'chrome', 'random'. See the illustrative example below.</description> |
| </property> |
| |
| <property> |
| <name>selenium.grid.binary</name> |
| <value></value> |
| <description>A String value representing the path to the browser binary |
| location for each node |
| </description> |
| </property> |
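| |
| <!-- Illustrative example: fetch through a remote Selenium grid (the host |
| name and the binary path are placeholders): |
| |
| <property> |
| <name>selenium.driver</name> |
| <value>remote</value> |
| </property> |
| |
| <property> |
| <name>selenium.hub.host</name> |
| <value>grid.example.com</value> |
| </property> |
| |
| <property> |
| <name>selenium.grid.driver</name> |
| <value>firefox</value> |
| </property> |
| |
| <property> |
| <name>selenium.grid.binary</name> |
| <value>/usr/bin/firefox</value> |
| </property> |
| --> |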
| |
| <!-- headless options for Firefox and Chrome--> |
| <property> |
| <name>selenium.enable.headless</name> |
| <value>false</value> |
| <description>A Boolean value representing the headless option |
| for Firefox and Chrome drivers |
| </description> |
| </property> |
| <!-- selenium firefox configuration; |
| applies to protocol-selenium and protocol-interactiveselenium plugins --> |
| <property> |
| <name>selenium.firefox.allowed.hosts</name> |
| <value>localhost</value> |
| <description>A String value representing the allowed hosts preference, |
| matching the operating system hosts file (e.g. /etc/hosts on Unix). |
| Currently this option exists only for 'firefox'.</description> |
| </property> |
| |
| <property> |
| <name>selenium.firefox.binary.timeout</name> |
| <value>45</value> |
| <description>A Long value representing the timeout, in seconds, for |
| Firefox to become available for command execution. |
| Currently this option exists only for 'firefox'.</description> |
| </property> |
| |
| <property> |
| <name>selenium.firefox.enable.flash</name> |
| <value>false</value> |
| <description>A Boolean value representing whether Flash should |
| be enabled or disabled. The default value is disabled. |
| Currently this option exists only for 'firefox'.</description> |
| </property> |
| |
| <property> |
| <name>selenium.firefox.load.image</name> |
| <value>1</value> |
| <description>An Integer value representing the restriction on |
| loading images. The default value (1) is no restriction, i.e. load all images. |
| The options are: |
| 1: Load all images, regardless of origin |
| 2: Block all images |
| 3: Prevent third-party images from loading |
| Currently this option exists only for 'firefox'.</description> |
| </property> |
| |
| <property> |
| <name>selenium.firefox.load.stylesheet</name> |
| <value>1</value> |
| <description>An Integer value representing the restriction on |
| loading stylesheets. The default value (1) is no restriction, i.e. load |
| all stylesheets. |
| The options are: |
| 1: Load all stylesheets |
| 2: Block all stylesheets |
| Currently this option exists only for 'firefox'.</description> |
| </property> |
| |
| <!-- selenium chrome configurations --> |
| <property> |
| <name>webdriver.chrome.driver</name> |
| <value>/root/chromedriver</value> |
| <description>The path to the ChromeDriver binary</description> |
| </property> |
| <!-- end of selenium chrome configurations --> |
| |
| <!-- protocol-interactiveselenium configuration --> |
| <property> |
| <name>interactiveselenium.handlers</name> |
| <value>DefaultHandler</value> |
| <description> |
| A comma separated list of Selenium handlers that should be run for a given |
| URL. The DefaultHandler provides the same functionality as protocol-selenium. |
| Custom handlers can be implemented in the plugin package and included here. |
| </description> |
| </property> |
| |
| <property> |
| <name>store.http.request</name> |
| <value>false</value> |
| <description> |
| Store the raw request made by Nutch, required to use the CommonCrawlDataDumper |
| tool for the WARC format. |
| </description> |
| </property> |
| |
| <property> |
| <name>store.http.headers</name> |
| <value>false</value> |
| <description> |
| Store the raw headers received by Nutch from the server, required to use the |
| CommonCrawlDataDumper tool for the WARC format. |
| </description> |
| </property> |
| |
| <!-- index-links plugin --> |
| |
| <property> |
| <name>index.links.outlinks.host.ignore</name> |
| <value>false</value> |
| <description> |
| Ignore outlinks that point to the same host as the URL being indexed. |
| By default all outlinks are indexed. If db.ignore.internal.links is true (default |
| value), this setting does nothing since the internal links are already |
| ignored. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.links.inlinks.host.ignore</name> |
| <value>false</value> |
| <description> |
| Ignore inlinks coming from the same host as the URL being indexed. By default |
| all inlinks are indexed. If db.ignore.internal.links is true (default |
| value), this setting does nothing since the internal links are already |
| ignored. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.links.hosts.only</name> |
| <value>false</value> |
| <description> |
| This forces the index-links plugin to only index the host portion of the inlinks |
| or outlinks. |
| </description> |
| </property> |
| |
| <!-- HostDB settings --> |
| <property> |
| <name>hostdb.recheck.interval</name> |
| <value>86400000</value> |
| <description> |
| Interval between rechecks in milliseconds. Default is one day. Recheck |
| interval is multiplied by the number of DNS lookup failures for a given |
| host. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.purge.failed.hosts.threshold</name> |
| <value>3</value> |
| <description> |
| If hosts have more failed DNS lookups than this threshold, they are |
| removed from the HostDB. Hosts can, of course, return if they are still |
| present in the CrawlDB. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.num.resolvers.threads</name> |
| <value>25</value> |
| <description> |
| Number of resolver threads per reducer. Make sure your DNS resolver is |
| capable of handling this value multiplied by the number of reducers. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.check.failed</name> |
| <value>true</value> |
| <description> |
| True if hosts for which DNS lookup failed are eligible for recheck. If |
| false, hosts that failed DNS lookup at least once are not eligible |
| for further DNS lookups. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.check.new</name> |
| <value>true</value> |
| <description> |
| True if newly discovered hosts are eligible for a DNS lookup check. If false, |
| hosts that are just added to the HostDB are not eligible for DNS lookup. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.check.known</name> |
| <value>true</value> |
| <description> |
| True if already known hosts are eligible for a DNS lookup check. If false, |
| known hosts are not eligible for DNS lookup. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.force.check</name> |
| <value>false</value> |
| <description> |
| If true, hosts are checked regardless of their respective recheck |
| intervals or status. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.url.filter</name> |
| <value>false</value> |
| <description> |
| Whether the records are to be passed through configured filters. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.url.normalize</name> |
| <value>false</value> |
| <description> |
| Whether the records are to be passed through configured normalizers. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.numeric.fields</name> |
| <value>_rs_</value> |
| <description> |
| Comma-separated list of CrawlDatum metadata fields for which aggregations are needed. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.string.fields</name> |
| <value>Content-Type</value> |
| <description> |
| Comma-separated list of CrawlDatum metadata fields for which sums are needed. |
| </description> |
| </property> |
| |
| <property> |
| <name>hostdb.percentiles</name> |
| <value>50,75,95,99</value> |
| <description> |
| Comma-separated list of percentiles that must be calculated for all numeric |
| field aggregations. Host metadata will contain fields for each percentile. |
| </description> |
| </property> |
| |
| <!-- publisher properties |
| Do not forget to add the name of your publisher implementation |
| in plugin.includes ex- publish-rabbitmq --> |
| <property> |
| <name>publisher.queue.type</name> |
| <value></value> |
| <description> |
| Choose the type of queue being used (e.g. RabbitMQ, ActiveMQ, Kafka). |
| Currently an implementation exists for a RabbitMQ producer. |
| </description> |
| </property> |
| |
| <property> |
| <name>publisher.order</name> |
| <value></value> |
| <description> |
| The order in which the publisher queues will be loaded. |
| </description> |
| </property> |
| |
| <!-- RabbitMQ publisher properties --> |
| |
| <property> |
| <name>rabbitmq.publisher.server.uri</name> |
| <value>amqp://guest:guest@localhost:5672/</value> |
| <description> |
| URI with connection parameters in the form |
| amqp://username:password@hostname:port/virtualHost |
| where: |
| username is the username for the RabbitMQ server. |
| password is the password for the RabbitMQ server. |
| hostname is where the RabbitMQ server is running. |
| port is where the RabbitMQ server is listening. |
| virtualHost is where the exchange is and the user has access. |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.binding</name> |
| <value>false</value> |
| <description> |
| Whether the relationship between an exchange and a queue is created |
| automatically. Default "false". |
| |
| NOTE: Binding between exchanges is not supported. |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.binding.arguments</name> |
| <value></value> |
| <description> |
| Arguments used in binding. It must have the form key1=value1,key2=value2. |
| This value is only used when the exchange's type is headers and |
| the value of 'rabbitmq.publisher.binding' property is true. In other cases |
| it is ignored. |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.exchange.name</name> |
| <value></value> |
| <description> |
| Name for the exchange where the messages will be sent. Default "". |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.exchange.options</name> |
| <value>type=direct,durable=true</value> |
| <description> |
| Options used when the exchange is created. |
| Only used when the value of 'rabbitmq.publisher.binding' property is true. |
| Default "type=direct,durable=true". |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.queue.name</name> |
| <value>nutch.events.queue</value> |
| <description> |
| Name of the queue used to create the binding. Default "nutch.events.queue". |
| Only used when the value of 'rabbitmq.publisher.binding' property is true. |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.queue.options</name> |
| <value>durable=true,exclusive=false,auto-delete=false</value> |
| <description> |
| Options used when the queue is created. |
| Only used when the value of 'rabbitmq.publisher.binding' property is true. |
| Default "durable=true,exclusive=false,auto-delete=false". |
| |
| It must have the form |
| durable={durable},exclusive={exclusive},auto-delete={auto-delete},arguments={arguments} |
| where: |
| durable is true or false |
| exclusive is true or false |
| auto-delete is true or false |
| arguments must have the form {key1:value1;key2:value2} |
| </description> |
| </property> |
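| |
| <!-- Illustrative example of the format described above, declaring a durable |
| queue with a hypothetical per-message TTL (x-message-ttl is a standard |
| RabbitMQ queue argument): |
| |
| <property> |
| <name>rabbitmq.publisher.queue.options</name> |
| <value>durable=true,exclusive=false,auto-delete=false,arguments={x-message-ttl:60000}</value> |
| </property> |
| --> |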
| |
| <property> |
| <name>rabbitmq.publisher.routingkey</name> |
| <value></value> |
| <description> |
| The routing key used to publish messages to specific queues. |
| It is only used when the exchange type is "topic" or "direct". Default |
| is the value of 'rabbitmq.publisher.queue.name' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>rabbitmq.publisher.headers.static</name> |
| <value></value> |
| <description> |
| Headers to add to each message. It must have the form key1=value1,key2=value2. |
| </description> |
| </property> |
| |
| <!-- sitemap properties --> |
| |
| <property> |
| <name>sitemap.strict.parsing</name> |
| <value>true</value> |
| <description> |
| If true (default) the Sitemap parser rejects URLs not sharing the same |
| prefix with the sitemap: a sitemap 'http://example.com/catalog/sitemap.xml' |
| may only contain URLs starting with 'http://example.com/catalog/'. |
| All other URLs are skipped. If false the parser will allow any URLs contained |
| in the sitemap. |
| </description> |
| </property> |
| |
| <property> |
| <name>sitemap.url.filter</name> |
| <value>true</value> |
| <description> |
| Filter URLs from sitemaps. |
| </description> |
| </property> |
| |
| <property> |
| <name>sitemap.url.normalize</name> |
| <value>true</value> |
| <description> |
| Normalize URLs from sitemaps. |
| </description> |
| </property> |
| |
| <property> |
| <name>sitemap.url.default.sitemap.xml</name> |
| <value>true</value> |
| <description> |
| Always try the root &lt;host&gt;/sitemap.xml even if no sitemap |
| is announced in /robots.txt. |
| </description> |
| </property> |
| |
| <property> |
| <name>sitemap.url.overwrite.existing</name> |
| <value>false</value> |
| <description> |
| If true, the record's existing modified time, interval and score are |
| overwritten by the information in the sitemap. WARNING: overwriting |
| these values may have unexpected effects on what is crawled. Use this |
| only if you can trust the sitemap and if the values in the sitemap do |
| fit with your crawler configuration. |
| </description> |
| </property> |
| |
| <property> |
| <name>sitemap.redir.max</name> |
| <value>3</value> |
| <description> |
| Maximum number of redirects to follow. |
| </description> |
| </property> |
| </configuration> |