| <?xml version="1.0"?> |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <!-- Do not modify this file directly. Instead, copy entries that you --> |
| <!-- wish to modify from this file into nutch-site.xml and change them --> |
| <!-- there. If nutch-site.xml does not already exist, create it. --> |
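<!-- Illustrative sketch (not part of the shipped defaults): a minimal
     nutch-site.xml overriding one property from this file might look like
     the following. The property name is real; the value is an arbitrary
     example.

     <?xml version="1.0"?>
     <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
     <configuration>
       <property>
         <name>http.agent.name</name>
         <value>MyExampleCrawler</value>
       </property>
     </configuration>
-->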
| |
| <configuration> |
| |
| <!-- general properties --> |
| |
| <property> |
| <name>store.ip.address</name> |
| <value>false</value> |
<description>If true, the specific IP address
(InetSocketAddress) of the host which we connect to via
the given protocol is captured and stored.
| </description> |
| </property> |
| |
| <!-- file properties --> |
| |
| <property> |
| <name>file.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content using the file |
| protocol, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the http.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.crawl.redirect_noncanonical</name> |
| <value>true</value> |
| <description> |
| If true, protocol-file treats non-canonical file names as |
| redirects and does not canonicalize file names internally. A file |
| name containing symbolic links as path elements is then not |
| resolved and "fetched" but recorded as redirect with the |
| canonical name (all links on path are resolved) as redirect |
| target. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.content.ignored</name> |
| <value>true</value> |
| <description>If true, no file content will be saved during fetch. |
This is probably what we want most of the time, since file:// URLs
are meant to be local and we can always read them directly at the parsing
and indexing stages. Otherwise file contents will be saved.
!! NOT IMPLEMENTED YET !!
| </description> |
| </property> |
| |
| <property> |
| <name>file.crawl.parent</name> |
| <value>true</value> |
<description>If true, the crawler is not restricted to the directories that you specified in the
urls file but also follows links into the parent directories. For your own crawling you can
change this behavior (set it to false) so that only directories beneath the directories that you specify get
crawled.
| </description> |
| </property> |
| |
| |
| <!-- HTTP properties --> |
| |
| <property> |
| <name>http.agent.name</name> |
| <value></value> |
| <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
| please set this to a single word uniquely related to your organization. |
| |
| NOTE: You should also check other related properties: |
| |
| http.robots.agents |
| http.agent.description |
| http.agent.url |
| http.agent.email |
| http.agent.version |
| |
| and set their values appropriately. |
| |
| </description> |
| </property> |
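<!-- Example (illustrative): setting the agent properties in nutch-site.xml.
     With values like these, the crawler identifies itself using a User-Agent
     built from name, version, description, URL and email; the exact header
     layout is assembled by the HTTP plugins, so treat the composed string
     "MyExampleCrawler/Nutch-2.3.1 (A research crawler; http://example.com/bot; info at example dot com)"
     as an approximation only.

     <property>
       <name>http.agent.name</name>
       <value>MyExampleCrawler</value>
     </property>
     <property>
       <name>http.agent.description</name>
       <value>A research crawler</value>
     </property>
     <property>
       <name>http.agent.url</name>
       <value>http://example.com/bot</value>
     </property>
     <property>
       <name>http.agent.email</name>
       <value>info at example dot com</value>
     </property>
-->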
| |
| <property> |
| <name>http.robots.agents</name> |
| <value></value> |
<description>Any other agents, apart from 'http.agent.name', that the robots
parser should look for in robots.txt. Multiple agents can be provided using
a comma as a delimiter, e.g. mybot,foo-spider,bar-crawler

The ordering of agents does NOT matter and the robots parser will make its
decision based on the first agent that matches the robots rules.
Also, there is NO need to add a wildcard (i.e. "*") to this string as the
robots parser handles a no-match situation automatically.

If no value is specified, by default the HTTP agent (i.e. 'http.agent.name')
will be used for user agent matching by the robots parser.
| </description> |
| </property> |
| |
| <property> |
| <name>http.robots.403.allow</name> |
| <value>true</value> |
| <description>Some servers return HTTP status 403 (Forbidden) if |
| /robots.txt doesn't exist. This should probably mean that we are |
| allowed to crawl the site nonetheless. If this is set to false, |
| then such sites will be treated as forbidden.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.description</name> |
| <value></value> |
<description>Further description of our bot. This text is used in
the User-Agent header. It appears in parentheses after the agent name.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.url</name> |
| <value></value> |
| <description>A URL to advertise in the User-Agent header. This will |
appear in parentheses after the agent name. Custom dictates that this
| should be a URL of a page explaining the purpose and behavior of this |
| crawler. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.email</name> |
| <value></value> |
| <description>An email address to advertise in the HTTP 'From' request |
| header and User-Agent header. A good practice is to mangle this |
| address (e.g. 'info at example dot com') to avoid spamming. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.version</name> |
| <value>Nutch-2.3.1</value> |
| <description>A version string to advertise in the User-Agent |
| header.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.rotate</name> |
| <value>false</value> |
| <description> |
| If true, instead of http.agent.name, alternating agent names are |
| chosen from a list provided via http.agent.rotate.file. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.rotate.file</name> |
| <value>agents.txt</value> |
| <description> |
| File containing alternative user agent names to be used instead of |
| http.agent.name on a rotating basis if http.agent.rotate is true. |
| Each line of the file should contain exactly one agent |
| specification including name, version, description, URL, etc. |
| </description> |
| </property> |
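<!-- Example (illustrative): possible contents of the file named by
     http.agent.rotate.file, one full agent specification per line. The agent
     strings below are made up for illustration.

     MyExampleCrawler/1.0 (research crawler; http://example.com/bot)
     MyOtherCrawler/2.1 (mirroring service; http://example.org/info)
-->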
| |
| <property> |
| <name>http.agent.host</name> |
| <value></value> |
| <description>Name or IP address of the host on which the Nutch crawler |
| would be running. Currently this is used by 'protocol-httpclient' |
| plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.timeout</name> |
| <value>10000</value> |
| <description>The default network timeout, in milliseconds.</description> |
| </property> |
| |
| <property> |
| <name>http.max.delays</name> |
| <value>100</value> |
| <description>The number of times a thread will delay when trying to |
| fetch a page. Each time it finds that a host is busy, it will wait |
fetcher.server.delay. After http.max.delays attempts, it will give
| up on the page for now.</description> |
| </property> |
| |
| <property> |
| <name>http.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content using the http |
| protocol, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the file.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.host</name> |
| <value></value> |
| <description>The proxy hostname. If empty, no proxy is used.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.port</name> |
| <value></value> |
| <description>The proxy port.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.username</name> |
| <value></value> |
| <description>Username for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| NOTE: For NTLM authentication, do not prefix the username with the |
| domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.password</name> |
| <value></value> |
| <description>Password for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.realm</name> |
| <value></value> |
| <description>Authentication realm for proxy. Do not define a value |
| if realm is not required or authentication should take place for any |
| realm. NTLM does not use the notion of realms. Specify the domain name |
| of NTLM authentication as the value for this property. To use this, |
| 'protocol-httpclient' must be present in the value of |
| 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.auth.file</name> |
| <value>httpclient-auth.xml</value> |
| <description>Authentication configuration file for |
| 'protocol-httpclient' plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.verbose</name> |
| <value>false</value> |
| <description>If true, HTTP will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>http.useHttp11</name> |
| <value>false</value> |
| <description>NOTE: at the moment this works only for protocol-httpclient. |
If true, use HTTP 1.1; if false, use HTTP 1.0.
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept.language</name> |
| <value>en-us,en-gb,en;q=0.7,*;q=0.3</value> |
| <description>Value of the "Accept-Language" request header field. |
This allows selecting a non-English language as the default one to retrieve.
It is a useful setting for search engines built for a certain national or language group.
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept</name> |
| <value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> |
| <description>Value of the "Accept" request header field. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.store.responsetime</name> |
| <value>true</value> |
<description>If true, records the response time of the
host, i.e. the elapsed time between the start and the end of
the connection to a page's host.</description>
| </property> |
| |
| <!-- FTP properties --> |
| |
| <property> |
| <name>ftp.username</name> |
| <value>anonymous</value> |
| <description>ftp login username.</description> |
| </property> |
| |
| <property> |
| <name>ftp.password</name> |
| <value>anonymous@example.com</value> |
| <description>ftp login password.</description> |
| </property> |
| |
| <property> |
| <name>ftp.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content, in bytes. |
| If this value is nonnegative (>=0), content longer than it will be truncated; |
| otherwise, no truncation at all. |
Caution: classical ftp RFCs never define partial transfer and, in fact,
| some ftp servers out there do not handle client side forced close-down very |
| well. Our implementation tries its best to handle such situations smoothly. |
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.timeout</name> |
| <value>60000</value> |
| <description>Default timeout for ftp client socket, in millisec. |
| Please also see ftp.keep.connection below.</description> |
| </property> |
| |
| <property> |
| <name>ftp.server.timeout</name> |
| <value>100000</value> |
| <description>An estimation of ftp server idle time, in millisec. |
| Typically it is 120000 millisec for many ftp servers out there. |
| Better be conservative here. Together with ftp.timeout, it is used to |
| decide if we need to delete (annihilate) current ftp.client instance and |
| force to start another ftp.client instance anew. This is necessary because |
| a fetcher thread may not be able to obtain next request from queue in time |
| (due to idleness) before our ftp client times out or remote server |
| disconnects. Used only when ftp.keep.connection is true (please see below). |
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.keep.connection</name> |
| <value>false</value> |
<description>Whether to keep the ftp connection open. Useful if crawling the same host
| again and again. When set to true, it avoids connection, login and dir list |
| parser setup for subsequent urls. If it is set to true, however, you must |
| make sure (roughly): |
| (1) ftp.timeout is less than ftp.server.timeout |
| (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) |
| Otherwise there will be too many "delete client because idled too long" |
| messages in thread logs.</description> |
| </property> |
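<!-- Worked example for the two constraints above, using the defaults found
     elsewhere in this file: fetcher.threads.fetch (10) * fetcher.server.delay
     (5.0 s) = 50 s = 50000 ms, which is smaller than ftp.timeout (60000 ms),
     and ftp.timeout (60000 ms) is in turn smaller than ftp.server.timeout
     (100000 ms). So, as far as these two conditions are concerned, the
     shipped defaults already allow ftp.keep.connection to be set to true.
-->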
| |
| <property> |
| <name>ftp.follow.talk</name> |
| <value>false</value> |
| <description>Whether to log dialogue between our client and remote |
| server. Useful for debugging.</description> |
| </property> |
| |
| <!-- web db properties --> |
| |
| <property> |
| <name>db.fetch.interval.default</name> |
| <value>2592000</value> |
| <description>The default number of seconds between re-fetches of a page (30 days). |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.interval.max</name> |
| <value>7776000</value> |
| <description>The maximum number of seconds between re-fetches of a page |
| (90 days). After this period every page in the db will be re-tried, no |
matter what its status is.
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.class</name> |
| <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
| <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
| adds the original fetchInterval to the last fetch time, regardless of |
| page changes.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.inc_rate</name> |
| <value>0.4</value> |
| <description>If a page is unmodified, its fetchInterval will be |
| increased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.dec_rate</name> |
| <value>0.2</value> |
| <description>If a page is modified, its fetchInterval will be |
| decreased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.min_interval</name> |
| <value>60</value> |
| <description>Minimum fetchInterval, in seconds.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.max_interval</name> |
| <value>31536000</value> |
| <description>Maximum fetchInterval, in seconds (365 days). |
| NOTE: this is limited by db.fetch.interval.max. Pages with |
| fetchInterval larger than db.fetch.interval.max |
| will be fetched anyway.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta</name> |
| <value>true</value> |
<description>If true, try to synchronize with the time of page change
by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
between the last modification time and the last fetch time.</description>
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
| <value>0.3</value> |
| <description>See sync_delta for description. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
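<!-- Worked example (illustrative, assuming the adaptive schedule scales the
     interval multiplicatively by the rates above): to use it, point
     db.fetch.schedule.class at org.apache.nutch.crawl.AdaptiveFetchSchedule.
     Starting from the default interval of 30 days, an unmodified page would
     move to roughly 30 * (1 + 0.4) = 42 days, while a modified page would
     move to roughly 30 * (1 - 0.2) = 24 days, always clamped between
     min_interval and max_interval (and limited by db.fetch.interval.max).
-->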
| |
| <property> |
| <name>db.update.additions.allowed</name> |
| <value>true</value> |
| <description>If true, updatedb will add newly discovered URLs, if false |
| only already existing URLs in the CrawlDb will be updated and no new |
| URLs will be added. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of inlinks to take into account when updating |
| a URL score in the crawlDB. Only the best scoring inlinks are kept. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.internal.links</name> |
| <value>true</value> |
| <description>If true, when adding new links to a page, links from |
| the same host are ignored. This is an effective way to limit the |
| size of the link database, keeping only the highest quality |
| links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.external.links</name> |
| <value>false</value> |
| <description>If true, outlinks leading from a page to external hosts |
| will be ignored. This is an effective way to limit the crawl to include |
| only initially injected hosts, without creating complex URLFilters. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.injected</name> |
| <value>1.0</value> |
| <description>The score of new pages added by the injector. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.external</name> |
| <value>1.0</value> |
| <description>The score factor for new pages added due to a link from |
| another host relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of external links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.internal</name> |
| <value>1.0</value> |
| <description>The score factor for pages added due to a link from the |
| same host, relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of internal links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.count.filtered</name> |
| <value>false</value> |
| <description>The score value passed to newly discovered pages is |
| calculated as a fraction of the original page score divided by the |
| number of outlinks. If this option is false, only the outlinks that passed |
| URLFilters will count, if it's true then all outlinks will count. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.outlinks.per.page</name> |
| <value>100</value> |
| <description>The maximum number of outlinks that we'll process for a page. |
| If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
| will be processed for a page; otherwise, all outlinks will be processed. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.anchor.length</name> |
| <value>100</value> |
| <description>The maximum number of characters permitted in an anchor. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.parsemeta.to.crawldb</name> |
| <value></value> |
| <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). |
| Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' |
| will copy both the key 'lang' and its value to the corresponding entry in the crawldb. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.retry.max</name> |
| <value>3</value> |
| <description>The maximum number of times a url that has encountered |
| recoverable errors is generated for fetch.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.class</name> |
| <value>org.apache.nutch.crawl.MD5Signature</value> |
| <description>The default implementation of a page signature. Signatures |
| created with this implementation will be used for duplicate detection |
| and removal.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.min_token_len</name> |
| <value>2</value> |
| <description>Minimum token length to be included in the signature. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.quant_rate</name> |
| <value>0.01</value> |
| <description>Profile frequencies will be rounded down to a multiple of |
| QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
| frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
| for longer texts tokens with frequency 1 will always be discarded. |
| </description> |
| </property> |
| |
| <!-- generate properties --> |
| |
| <property> |
| <name>generate.max.count</name> |
| <value>-1</value> |
| <description>The maximum number of urls in a single |
| fetchlist. -1 if unlimited. The urls are counted according |
to the value of the parameter generate.count.mode.
| </description> |
| </property> |
| |
| <property> |
| <name>generate.max.distance</name> |
| <value>-1</value> |
<description>The maximum distance of a URL that the generator is allowed
to select for fetch. The distance is the smallest number of nodes (shortest path)
from the original injected URL to a given URL. (Injected URLs have distance 0.)
| </description> |
| </property> |
| |
| <property> |
| <name>generate.count.mode</name> |
| <value>host</value> |
<description>Determines how the URLs are counted for generate.max.count.
| Default value is 'host' but can be 'domain'. Note that we do not count |
| per IP in the new version of the Generator. |
| </description> |
| </property> |
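<!-- Example (illustrative): limiting fetchlists to at most 50 URLs per host
     via nutch-site.xml. With generate.count.mode left at 'host' the counter
     applies per host; with 'domain' it would apply per domain.

     <property>
       <name>generate.max.count</name>
       <value>50</value>
     </property>
-->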
| |
| <property> |
| <name>generate.update.crawldb</name> |
| <value>false</value> |
| <description>For highly-concurrent environments, where several |
| generate/fetch/update cycles may overlap, setting this to true ensures |
| that generate will create different fetchlists even without intervening |
| updatedb-s, at the cost of running an additional job to update CrawlDB. |
| If false, running generate twice without intervening |
| updatedb will generate identical fetchlists.</description> |
| </property> |
| |
| <!-- urlpartitioner properties --> |
| <property> |
| <name>partition.url.mode</name> |
| <value>byHost</value> |
| <description>Determines how to partition URLs. Default value is 'byHost', |
| also takes 'byDomain' or 'byIP'. |
| </description> |
| </property> |
| |
| <property> |
| <name>crawl.gen.delay</name> |
| <value>604800000</value> |
| <description> |
This value, expressed in milliseconds, defines how long we should keep the lock on records
in CrawlDb that were just selected for fetching. If these records are not updated
in the meantime, the lock is canceled, i.e. they become eligible for selection again.
The default value corresponds to 7 days.
| </description> |
| </property> |
| |
| <!-- fetcher properties --> |
| |
| <property> |
| <name>fetcher.server.delay</name> |
| <value>5.0</value> |
| <description>The number of seconds the fetcher will delay between |
| successive requests to the same server. Note that this might get |
overridden by a Crawl-Delay from a robots.txt and is used ONLY if
| fetcher.threads.per.queue is set to 1. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.server.min.delay</name> |
| <value>0.0</value> |
| <description>The minimum number of seconds the fetcher will delay between |
| successive requests to the same server. This value is applicable ONLY |
| if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking |
| is turned off).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.crawl.delay</name> |
| <value>30</value> |
| <description> |
| If the Crawl-Delay in robots.txt is set to greater than this value (in |
| seconds) then the fetcher will skip this page, generating an error report. |
| If set to -1 the fetcher will never skip such pages and will wait the |
| amount of time retrieved from robots.txt Crawl-Delay, however long that |
| might be. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.fetch</name> |
| <value>10</value> |
| <description>The number of FetcherThreads the fetcher should use. |
This also determines the maximum number of requests that are
| made at once (each FetcherThread handles one connection). The total |
| number of threads running in distributed mode will be the number of |
| fetcher threads * number of nodes as fetcher has one map task per node. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.per.queue</name> |
| <value>1</value> |
| <description>This number is the maximum number of threads that |
| should be allowed to access a queue at one time. Setting it to |
| a value > 1 will cause the Crawl-Delay value from robots.txt to |
| be ignored and the value of fetcher.server.min.delay to be used |
| as a delay between successive requests to the same server instead |
| of fetcher.server.delay. |
| </description> |
| </property> |
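<!-- Example (illustrative): with the defaults above, at most one request is
     in flight per host queue and consecutive requests to the same host are
     spaced by fetcher.server.delay (5.0 s), unless a robots.txt Crawl-Delay
     dictates otherwise. A more aggressive, less polite setup for hosts you
     control might look like this in nutch-site.xml:

     <property>
       <name>fetcher.threads.per.queue</name>
       <value>5</value>
     </property>
     <property>
       <name>fetcher.server.min.delay</name>
       <value>0.5</value>
     </property>
-->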
| |
| <property> |
| <name>fetcher.queue.mode</name> |
| <value>byHost</value> |
| <description>Determines how the URLs are placed into queues. |
| Allowed values are 'byHost', 'byDomain' and 'byIP'. |
| The value would usually correspond to that of 'partition.url.mode'. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.use.host.settings</name> |
| <value>false</value> |
| <description>Allows us to optionally enable host specific queue behavior if present. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.verbose</name> |
| <value>false</value> |
| <description>If true, fetcher will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.parse</name> |
| <value>false</value> |
| <description>If true, fetcher will parse content. NOTE: previous releases would |
| default to true. Since 2.0 this is set to false as a safer default.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.store.content</name> |
| <value>true</value> |
| <description>If true, fetcher will store content.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.timelimit.mins</name> |
| <value>-1</value> |
| <description>This is the number of minutes allocated to the fetching. |
| Once this value is reached, any remaining entry from the input URL list is skipped |
| and all active queues are emptied. The default value of -1 deactivates the time limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.exceptions.per.queue</name> |
| <value>-1</value> |
| <description>The maximum number of protocol-level exceptions (e.g. timeouts) per |
| host (or IP) queue. Once this value is reached, any remaining entries from this |
| queue are purged, effectively stopping the fetching from this host/IP. The default |
| value of -1 deactivates this limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.pages</name> |
| <value>-1</value> |
<description>The threshold of minimum pages per second. If the fetcher downloads fewer
pages per second than the configured threshold, the fetcher stops, preventing slow queues
from stalling the throughput. This threshold must be an integer. This can be useful when
| fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.sequence</name> |
| <value>5</value> |
<description>The number of times in a row that the fetcher is allowed to fall below
fetcher.throughput.threshold.pages. This setting prevents accidental slowdowns from stopping the fetcher.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.check.after</name> |
| <value>5</value> |
| <description>The number of minutes after which the throughput check is enabled.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.depth.multiplier</name> |
| <value>50</value> |
<description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
(see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
| is not optimal. |
| </description> |
| </property> |
| |
| <!-- indexingfilter plugin properties --> |
| |
| <property> |
| <name>indexingfilter.order</name> |
| <value></value> |
| <description>The order by which index filters are applied. |
| If empty, all available index filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
| then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
| |
| Filter ordering might have impact on result if one filter depends on output of |
| another filter. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.score.power</name> |
| <value>0.5</value> |
<description>Used by the OPIC plugin. Determines the power of link analysis scores.
Each page's boost is set to <i>score<sup>scorePower</sup></i> where
| <i>score</i> is its link analysis score and <i>scorePower</i> is the |
| value of this parameter. This is compiled into indexes, so, when |
| this is changed, pages must be re-indexed for it to take |
| effect.</description> |
| </property> |
| |
| <!-- BasicIndexingfilter plugin properties --> |
| |
| <property> |
| <name>indexer.max.title.length</name> |
| <value>100</value> |
| <description>The maximum number of characters of a title that are indexed. A value of -1 disables this check. |
| Used by index-basic. |
| </description> |
| </property> |
| |
| <!-- moreindexingfilter plugin properties --> |
| |
| <property> |
| <name>moreIndexingFilter.indexMimeTypeParts</name> |
| <value>true</value> |
| <description>Determines whether the index-more plugin will split the mime-type |
into sub parts; this requires the type field to be multi-valued. Set to true for backward
| compatibility. False will not split the mime-type. |
| </description> |
| </property> |
| |
| <!-- AnchorIndexing filter plugin properties --> |
| |
| <property> |
| <name>anchorIndexingFilter.deduplicate</name> |
| <value>false</value> |
<description>With this enabled the indexer will deduplicate anchors case-insensitively
before indexing. This prevents possibly hundreds or thousands of identical anchors for
a given page from being indexed but will affect the search scoring (i.e. tf=1.0f).
| </description> |
| </property> |
| |
| <!-- URL normalizer properties --> |
| |
| <property> |
| <name>urlnormalizer.order</name> |
| <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
| <description>Order in which normalizers will run. If any of these isn't |
| activated it will be silently skipped. If other normalizers not on the |
| list are activated, they will run in random order after the ones |
| specified here are run. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.regex.file</name> |
| <value>regex-normalize.xml</value> |
| <description>Name of the config file used by the RegexUrlNormalizer class. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.loop.count</name> |
| <value>1</value> |
| <description>Optionally loop through normalizers several times, to make |
| sure that all transformations have been performed. |
| </description> |
| </property> |
| |
| <!-- mime properties --> |
| |
| <!-- |
| <property> |
| <name>mime.types.file</name> |
| <value>tika-mimetypes.xml</value> |
| <description>Name of file in CLASSPATH containing filename extension and |
| magic sequence to mime types mapping information. Overrides the default Tika config |
| if specified. |
| </description> |
| </property> |
| --> |
| |
| <property> |
| <name>mime.type.magic</name> |
| <value>true</value> |
| <description>Defines if the mime content type detector uses magic resolution. |
| </description> |
| </property> |
| |
| <!-- plugin properties --> |
| |
| <property> |
| <name>plugin.folders</name> |
| <value>plugins</value> |
| <description>Directories where nutch plugins are located. Each |
| element may be a relative or absolute path. If absolute, it is used |
| as is. If relative, it is searched for on the classpath.</description> |
| </property> |
| |
| <property> |
| <name>plugin.auto-activation</name> |
| <value>true</value> |
<description>Defines whether plugins that are not activated by the
plugin.includes and plugin.excludes properties must be automatically
activated if they are needed by some activated plugins.
| </description> |
| </property> |
| |
| <property> |
| <name>plugin.includes</name> |
| <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic</value> |
| <description>Regular expression naming plugin directory names to |
| include. Any plugin not matching this expression is excluded. |
In any case you need to at least include the nutch-extensionpoints plugin. By
| default Nutch includes crawling just HTML and plain text via HTTP, |
| and basic indexing and search plugins. In order to use HTTPS please enable |
| protocol-httpclient, but be aware of possible intermittent problems with the |
| underlying commons-httpclient library. |
| </description> |
| </property> |
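<!-- Example (illustrative): a nutch-site.xml override that adds the
     protocol-httpclient and index-more plugins to the default set. The regex
     below simply extends the default value; adjust it to the plugins you
     actually need.

     <property>
       <name>plugin.includes</name>
       <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor|more)|urlnormalizer-(pass|regex|basic)|scoring-opic</value>
     </property>
-->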
| |
| <property> |
| <name>plugin.excludes</name> |
| <value></value> |
| <description>Regular expression naming plugin directory names to exclude. |
| </description> |
| </property> |
| |
| <!-- parser properties --> |
| |
| <property> |
| <name>parse.plugin.file</name> |
| <value>parse-plugins.xml</value> |
| <description>The name of the file that defines the associations between |
| content-types and parsers.</description> |
| </property> |
| |
| <property> |
| <name>parser.character.encoding.default</name> |
| <value>windows-1252</value> |
| <description>The character encoding to fall back to when no other information |
is available.</description>
| </property> |
| |
| <property> |
| <name>encodingdetector.charset.min.confidence</name> |
| <value>-1</value> |
<description>An integer between 0-100 indicating the minimum confidence value
| for charset auto-detection. Any negative value disables auto-detection. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.caching.forbidden.policy</name> |
| <value>content</value> |
| <description>If a site (or a page) requests through its robot metatags |
| that it should not be shown as cached content, apply this policy. Currently |
| three keywords are recognized: "none" ignores any "noarchive" directives. |
| "content" doesn't show the content, but shows summaries (snippets). |
| "all" doesn't show either content or summaries.</description> |
| </property> |
| |
| |
| <property> |
| <name>parser.html.impl</name> |
| <value>neko</value> |
| <description>HTML Parser implementation. Currently the following keywords |
| are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.html.form.use_action</name> |
| <value>false</value> |
| <description>If true, HTML parser will collect URLs from form action |
| attributes. This may lead to undesirable behavior (submitting empty |
| forms during next fetch cycle). If false, form action attribute will |
| be ignored.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.outlinks.ignore_tags</name> |
| <value></value> |
| <description>Comma separated list of HTML tags, from which outlinks |
| shouldn't be extracted. Nutch takes links from: a, area, form, frame, |
| iframe, script, link, img. If you add any of those tags here, it |
| won't be taken. Default is empty list. Probably reasonable value |
| for most people would be "img,script,link".</description> |
| </property> |
| |
| <property> |
| <name>htmlparsefilter.order</name> |
| <value></value> |
| <description>The order by which HTMLParse filters are applied. |
| If empty, all available HTMLParse filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. |
| HTMLParse filter ordering MAY have an impact |
| on end result, as some filters could rely on the metadata generated by a previous filter. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.timeout</name> |
| <value>30</value> |
| <description>Timeout in seconds for the parsing of a document, otherwise treats it as an exception and |
moves on to the following documents. This parameter is applied to any Parser implementation.
| Set to -1 to deactivate, bearing in mind that this could cause |
| the parsing to crash because of a very long or corrupted document. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.skip.truncated</name> |
| <value>true</value> |
| <description>Boolean value for whether we should skip parsing for truncated documents. By default this |
property is activated due to the extremely high CPU load that parsing can sometimes incur.
| </description> |
| </property> |
| |
| <!-- |
| <property> |
| <name>tika.htmlmapper.classname</name> |
| <value>org.apache.tika.parser.html.IdentityHtmlMapper</value> |
| <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence |
| the behaviour of the HTMLParseFilters. |
| </description> |
| </property> |
| --> |
| |
| <!-- urlfilter plugin properties --> |
| |
| <property> |
| <name>urlfilter.tld.length</name> |
| <value></value> |
<description>Maximum character length of the top-level domain.</description>
| </property> |
| |
| <property> |
| <name>urlfilter.domain.file</name> |
| <value>domain-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing either top level domains or |
| hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.regex.file</name> |
| <value>regex-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-regex (RegexURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.automaton.file</name> |
| <value>automaton-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.prefix.file</name> |
| <value>prefix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url prefixes |
| used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.suffix.file</name> |
| <value>suffix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url suffixes |
| used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.order</name> |
| <value></value> |
| <description>The order by which url filters are applied. |
| If empty, all available url filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
| then RegexURLFilter is applied first, and PrefixURLFilter second. |
| Since all filters are AND'ed, filter ordering does not have impact |
on the end result, but it may have performance implications, depending
on the relative expensiveness of filters.
| </description> |
| </property> |
| |
| <!-- scoring filters properties --> |
| |
| <property> |
| <name>scoring.filter.order</name> |
| <value></value> |
| <description>The order in which scoring filters are applied. |
| This may be left empty (in which case all available scoring |
| filters will be applied in the order defined in plugin-includes |
| and plugin-excludes), or a space separated list of implementation |
| classes. |
| </description> |
| </property> |
| |
| <!-- language-identifier plugin properties --> |
| |
| <property> |
| <name>lang.ngram.min.length</name> |
| <value>1</value> |
<description> The minimum size of n-grams used to identify the
language (must be between 1 and lang.ngram.max.length).
The larger the range between lang.ngram.min.length and
lang.ngram.max.length, the better the identification, but
the slower it is.
| </description> |
| </property> |
| |
| <property> |
| <name>lang.ngram.max.length</name> |
| <value>4</value> |
<description> The maximum size of n-grams used to identify the
language (must be between lang.ngram.min.length and 4).
The larger the range between lang.ngram.min.length and
lang.ngram.max.length, the better the identification, but
the slower it is.
| </description> |
| </property> |
| |
| <property> |
| <name>lang.analyze.max.length</name> |
| <value>2048</value> |
<description> The maximum number of bytes of data used to identify
the language (0 means full content analysis).
The larger this value, the better the analysis, but the
slower it is.
| </description> |
| </property> |
| |
| <property> |
| <name>lang.extraction.policy</name> |
| <value>detect,identify</value> |
| <description>This determines when the plugin uses detection and |
| statistical identification mechanisms. The order in which the |
| detect and identify are written will determine the extraction |
| policy. Default case (detect,identify) means the plugin will |
| first try to extract language info from page headers and metadata, |
| if this is not successful it will try using tika language |
| identification. Possible values are: |
| detect |
| identify |
| detect,identify |
| identify,detect |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.identification.only.certain</name> |
| <value>false</value> |
| <description>If set to true with lang.extraction.policy containing identify, |
| the language code returned by Tika will be assigned to the document ONLY |
| if it is deemed certain by Tika. |
| </description> |
| </property> |
| |
| <!-- index-metadata plugin properties --> |
| |
| <property> |
| <name>index.metadata</name> |
| <value>description,keywords</value> |
| <description> |
| Comma-separated list of keys to be taken from the metadata to generate fields. |
| Can be used e.g. for 'description' or 'keywords' provided that these values are generated |
| by a parser (see parse-metatags plugin), and property 'metatags.names'. |
| </description> |
| </property> |
| |
| <!-- parse-metatags plugin properties --> |
| <property> |
| <name>metatags.names</name> |
| <value>*</value> |
| <description>Names of the metatags to extract, separated by ','. |
| Use '*' to extract all metatags. Prefixes the names with 'meta_' in |
| the parse-metadata. For instance, to index description and keywords, |
| you need to activate the plugins parse-metadata and index-metadata |
| and set the value of the properties 'metatags.names' and |
| 'index.metadata' to 'description,keywords'. |
| </description> |
| </property> |
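<!-- Example (illustrative): following the recipe above, extracting and
     indexing the description and keywords metatags would combine the
     parse-metatags and index-metadata plugins like this in nutch-site.xml
     (plugin.includes must also list both plugins):

     <property>
       <name>metatags.names</name>
       <value>description,keywords</value>
     </property>
     <property>
       <name>index.metadata</name>
       <value>description,keywords</value>
     </property>
-->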
| |
| <!-- Temporary Hadoop 0.17.x workaround. --> |
| |
| <property> |
| <name>hadoop.job.history.user.location</name> |
| <value>${hadoop.log.dir}/history/user</value> |
| <description>Hadoop 0.17.x comes with a default setting to create |
| user logs inside the output path of the job. This breaks some |
| Hadoop classes, which expect the output to contain only |
| part-XXXXX files. This setting changes the output to a |
| subdirectory of the regular log directory. |
| </description> |
| </property> |
| |
| <property> |
| <name>io.serializations</name> |
| <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value> |
| <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, |
| org.apache.hadoop.io.serializer.avro.AvroReflectSerialization, |
| org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, --> |
| <description>A list of serialization classes that can be used for |
| obtaining serializers and deserializers.</description> |
| </property> |
| |
| <!-- solr index properties --> |
| |
| <property> |
| <name>solr.mapping.file</name> |
| <value>solrindex-mapping.xml</value> |
| <description> |
| Defines the name of the file that will be used in the mapping of internal |
| nutch field names to solr index fields as specified in the target Solr schema. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.commit.size</name> |
| <value>250</value> |
| <description> |
| Defines the number of documents to send to Solr in a single update batch. |
| Decrease when handling very large documents to prevent Nutch from running |
| out of memory. NOTE: It does not explicitly trigger a server side commit. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.commit.index</name> |
| <value>true</value> |
| <description> |
| When closing the indexer, trigger a commit to the Solr server. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.auth</name> |
| <value>false</value> |
| <description> |
| Whether to enable HTTP basic authentication for communicating with Solr. |
| Use the solr.auth.username and solr.auth.password properties to configure |
| your credentials. |
| </description> |
| </property> |
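<!-- Example (illustrative): enabling HTTP basic authentication towards Solr
     in nutch-site.xml, using the credential properties mentioned above. The
     values shown are placeholders; replace them with real credentials.

     <property>
       <name>solr.auth</name>
       <value>true</value>
     </property>
     <property>
       <name>solr.auth.username</name>
       <value>solradmin</value>
     </property>
     <property>
       <name>solr.auth.password</name>
       <value>changeme</value>
     </property>
-->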
| |
| <!-- elasticsearch index properties --> |
| <property> |
| <name>elastic.host</name> |
| <value></value> |
| <description>The hostname to send documents to using TransportClient. |
| Either host and port must be defined or cluster. |
| </description> |
| </property> |
| |
| <property> |
| <name>elastic.port</name> |
| <value>9300</value> |
| <description> |
| The port to connect to using TransportClient. |
| </description> |
| </property> |
| |
| <property> |
| <name>elastic.cluster</name> |
| <value></value> |
<description>The cluster name to discover. Either host and port must
| be defined or cluster. |
| </description> |
| </property> |
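<!-- Example (illustrative): as noted above, either elastic.host and
     elastic.port or elastic.cluster must be set. A host-based setup in
     nutch-site.xml might look like this (the hostname is a placeholder):

     <property>
       <name>elastic.host</name>
       <value>es1.example.com</value>
     </property>
     <property>
       <name>elastic.port</name>
       <value>9300</value>
     </property>
-->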
| |
| <property> |
| <name>elastic.index</name> |
| <value>nutch</value> |
| <description> |
| The name of the elasticsearch index. Will normally be autocreated if it |
| doesn't exist. |
| </description> |
| </property> |
| |
| <property> |
| <name>elastic.max.bulk.docs</name> |
| <value>250</value> |
| <description> |
| The number of docs in the batch that will trigger a flush to |
| elasticsearch. |
| </description> |
| </property> |
| |
| <property> |
| <name>elastic.max.bulk.size</name> |
| <value>2500500</value> |
| <description> |
| The total length of all indexed text in a batch that will trigger a |
| flush to elasticsearch, by checking after every document for excess |
| of this amount. |
| </description> |
| </property> |
| |
| <!-- storage properties --> |
| |
| <property> |
| <name>storage.data.store.class</name> |
| <value>org.apache.gora.memory.store.MemStore</value> |
| <description>The Gora DataStore class for storing and retrieving data. |
| Currently the following stores are available: |
| |
| org.apache.gora.sql.store.SqlStore |
A DataStore implementation for RDBMS with a SQL interface.
SqlStore uses JDBC drivers to communicate with the DB. As explained in
ivy.xml, gora-core 0.3 and later is currently not backwards compatible with
SqlStore.
| |
| org.apache.gora.cassandra.store.CassandraStore |
| Gora class for storing data in Apache Cassandra. |
| |
| org.apache.gora.hbase.store.HBaseStore |
| Gora class for storing data in Apache HBase. |
| |
| org.apache.gora.accumulo.store.AccumuloStore |
| Gora class for storing data in Apache Accumulo. |
| |
| org.apache.gora.avro.store.AvroStore |
| Gora class for storing data in Apache Avro. |
| |
| org.apache.gora.avro.store.DataFileAvroStore |
| Gora class for storing data in Apache Avro. DataFileAvroStore is |
| a file based store which uses Avro's DataFile{Writer,Reader}'s as a backend. |
| This datastore supports mapreduce. |
| |
| org.apache.gora.memory.store.MemStore |
| Gora class for storing data in a Memory based implementation for tests. |
| |
| org.apache.gora.mongodb.store.MongoStore |
| Gora class for storing data in MongoDB. |
| |
| org.apache.gora.solr.store.SolrStore |
| Gora class for storing data in Apache Solr. |
| </description> |
| </property> |
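<!-- Example (illustrative): switching the backend to Apache HBase in
     nutch-site.xml. The corresponding Gora backend module must be enabled at
     build time and configured separately (e.g. in gora.properties); this
     snippet only selects the store class.

     <property>
       <name>storage.data.store.class</name>
       <value>org.apache.gora.hbase.store.HBaseStore</value>
     </property>
-->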
| |
| <property> |
| <name>storage.schema.webpage</name> |
| <value>webpage</value> |
| <description>This value holds the schema name used for Nutch web db. |
| Note that Nutch ignores the value in the gora mapping files, and uses |
| this as the webpage schema name. |
| </description> |
| </property> |
| |
| <property> |
| <name>storage.schema.host</name> |
| <value>host</value> |
| <description>This value holds the schema name used for Nutch host db. |
| Note that Nutch ignores the value in the gora mapping files, and uses |
| this as the host schema name. |
| </description> |
| </property> |
| |
| <property> |
| <name>storage.crawl.id</name> |
| <value></value> |
| <description>This value helps differentiate between the datasets that |
| the jobs in the crawl cycle generate and operate on. The value will |
| be input to all the jobs which then will use it as a prefix when |
accessing the schemas. The default configuration uses no id to prefix
| the schemas. The value could also be given as a command line argument |
| to each job. |
| </description> |
| </property> |
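<!-- Example (illustrative): with storage.crawl.id set to 'testcrawl' and the
     default webpage schema name, the crawl jobs would, as far as the
     prefixing described above goes, read and write a schema/table named
     roughly 'testcrawl_webpage' instead of plain 'webpage'.

     <property>
       <name>storage.crawl.id</name>
       <value>testcrawl</value>
     </property>
-->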
| |
| <property> |
| <name>gora.buffer.read.limit</name> |
| <value>10000</value> |
| <description>The maximum number of buffered Records we wish to |
| read in one batch. @see org.apache.gora.mapreduce.GoraRecordReader |
| </description> |
| </property> |
| |
| <property> |
| <name>gora.buffer.write.limit</name> |
| <value>10000</value> |
| <description>Configures (for the Hadoop record writer) the maximum number of |
| buffered Records we wish to regularly flush to the Gora datastore. |
| @see org.apache.gora.mapreduce.GoraRecordWriter. |
| </description> |
| </property> |
| |
| </configuration> |