| <?xml version="1.0"?> |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <!-- Do not modify this file directly. Instead, copy entries that you --> |
| <!-- wish to modify from this file into nutch-site.xml and change them --> |
| <!-- there. If nutch-site.xml does not already exist, create it. --> |
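<!-- For example, a minimal nutch-site.xml that overrides a single property
     might look like the sketch below (the agent name value is only a
     placeholder):

<configuration>
  <property>
    <name>http.agent.name</name>
    <value>MyTestCrawler</value>
  </property>
</configuration>
-->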
| |
| <configuration> |
| |
| <!-- file properties --> |
| |
| <property> |
| <name>file.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content using the file:// |
| protocol, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the http.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.crawl.parent</name> |
| <value>true</value> |
<description>If true, the crawler is not restricted to the directories that you
specified in the URLs file; it also jumps into the parent directories. For your
own crawls you can change this behavior (set to false) so that only directories
beneath the directories that you specify get crawled.</description>
| </property> |
| |
| <property> |
| <name>file.content.ignored</name> |
| <value>true</value> |
<description>If true, no file content will be saved during fetch.
This is probably what we want to set most of the time, since file:// URLs
are meant to be local and we can always use them directly at the parsing
and indexing stages. Otherwise file contents will be saved.
!! NOT IMPLEMENTED YET !!
| </description> |
| </property> |
| |
| <!-- HTTP properties --> |
| |
| <property> |
| <name>http.agent.name</name> |
| <value></value> |
| <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
| please set this to a single word uniquely related to your organization. |
| |
| NOTE: You should also check other related properties: |
| |
| http.robots.agents |
| http.agent.description |
| http.agent.url |
| http.agent.email |
| http.agent.version |
| |
| and set their values appropriately. |
| |
| </description> |
| </property> |
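<!-- A sketch of the agent-related overrides one might place in nutch-site.xml;
     every value below is a placeholder (reusing the "Blurfl" example from
     http.robots.agents), not a recommended default:

<property>
  <name>http.agent.name</name>
  <value>Blurfl</value>
</property>
<property>
  <name>http.robots.agents</name>
  <value>Blurfl,*</value>
</property>
<property>
  <name>http.agent.description</name>
  <value>Blurfl research crawler</value>
</property>
<property>
  <name>http.agent.url</name>
  <value>http://www.example.com/bot.html</value>
</property>
<property>
  <name>http.agent.email</name>
  <value>info at example dot com</value>
</property>
-->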
| |
| <property> |
| <name>http.robots.agents</name> |
| <value>*</value> |
| <description>The agent strings we'll look for in robots.txt files, |
| comma-separated, in decreasing order of precedence. You should |
| put the value of http.agent.name as the first agent name, and keep the |
| default * at the end of the list. E.g.: BlurflDev,Blurfl,* |
| </description> |
| </property> |
| |
| <property> |
| <name>http.robots.403.allow</name> |
| <value>true</value> |
| <description>Some servers return HTTP status 403 (Forbidden) if |
| /robots.txt doesn't exist. This should probably mean that we are |
| allowed to crawl the site nonetheless. If this is set to false, |
| then such sites will be treated as forbidden.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.description</name> |
| <value></value> |
<description>Further description of our bot. This text is used in
the User-Agent header. It appears in parentheses after the agent name.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.url</name> |
| <value></value> |
<description>A URL to advertise in the User-Agent header. This will
appear in parentheses after the agent name. Custom dictates that this
should be a URL of a page explaining the purpose and behavior of this
crawler.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.email</name> |
| <value></value> |
<description>An email address to advertise in the HTTP 'From' request
header and User-Agent header. A good practice is to mangle this
address (e.g. 'info at example dot com') to avoid harvesting by spammers.
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.version</name> |
| <value>Nutch-1.7</value> |
| <description>A version string to advertise in the User-Agent |
| header.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.host</name> |
| <value></value> |
<description>Name or IP address of the host on which the Nutch crawler
is running. Currently this is used by the 'protocol-httpclient'
plugin.
| </description> |
| </property> |
| |
| <property> |
| <name>http.timeout</name> |
| <value>10000</value> |
| <description>The default network timeout, in milliseconds.</description> |
| </property> |
| |
| <property> |
| <name>http.max.delays</name> |
| <value>100</value> |
<description>The number of times a thread will delay when trying to
fetch a page. Each time it finds that a host is busy, it will wait
fetcher.server.delay. After http.max.delays attempts, it will give
up on the page for now.</description>
| </property> |
| |
| <property> |
| <name>http.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content using the http:// |
| protocol, in bytes. If this value is nonnegative (>=0), content longer |
| than it will be truncated; otherwise, no truncation at all. Do not |
| confuse this setting with the file.content.limit setting. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.host</name> |
| <value></value> |
| <description>The proxy hostname. If empty, no proxy is used.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.port</name> |
| <value></value> |
| <description>The proxy port.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.username</name> |
| <value></value> |
| <description>Username for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| NOTE: For NTLM authentication, do not prefix the username with the |
| domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.password</name> |
| <value></value> |
| <description>Password for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.realm</name> |
| <value></value> |
| <description>Authentication realm for proxy. Do not define a value |
| if realm is not required or authentication should take place for any |
| realm. NTLM does not use the notion of realms. Specify the domain name |
| of NTLM authentication as the value for this property. To use this, |
| 'protocol-httpclient' must be present in the value of |
| 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.auth.file</name> |
| <value>httpclient-auth.xml</value> |
| <description>Authentication configuration file for |
| 'protocol-httpclient' plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.verbose</name> |
| <value>false</value> |
| <description>If true, HTTP will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>http.redirect.max</name> |
| <value>0</value> |
<description>The maximum number of redirects the fetcher will follow when
trying to fetch a page. If set to negative or 0, the fetcher won't immediately
follow redirected URLs; instead it will record them for later fetching.
| </description> |
| </property> |
| |
| <property> |
| <name>http.useHttp11</name> |
| <value>false</value> |
<description>NOTE: at the moment this works only for protocol-httpclient.
If true, use HTTP 1.1; if false, use HTTP 1.0.
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept.language</name> |
| <value>en-us,en-gb,en;q=0.7,*;q=0.3</value> |
| <description>Value of the "Accept-Language" request header field. |
| This allows selecting non-English language as default one to retrieve. |
| It is a useful setting for search engines build for certain national group. |
| </description> |
| </property> |
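<!-- In the default value above, en-us and en-gb carry an implicit quality
     factor of q=1.0, any other English variant is accepted at q=0.7, and any
     remaining language at q=0.3; servers use these weights to choose the best
     available representation. A crawl focused on German, for instance, might
     use a value like the following (illustrative only):

<property>
  <name>http.accept.language</name>
  <value>de-de,de;q=0.8,en;q=0.5,*;q=0.3</value>
</property>
-->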
| |
| <property> |
| <name>http.accept</name> |
| <value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value> |
| <description>Value of the "Accept" request header field. |
| </description> |
| </property> |
| |
| <!-- FTP properties --> |
| |
| <property> |
| <name>ftp.username</name> |
| <value>anonymous</value> |
| <description>ftp login username.</description> |
| </property> |
| |
| <property> |
| <name>ftp.password</name> |
| <value>anonymous@example.com</value> |
| <description>ftp login password.</description> |
| </property> |
| |
| <property> |
| <name>ftp.content.limit</name> |
| <value>65536</value> |
<description>The length limit for downloaded content, in bytes.
If this value is nonnegative (>=0), content longer than it will be truncated;
otherwise, no truncation at all.
Caution: the classic FTP RFCs never define partial transfer and, in fact,
some FTP servers out there do not handle client-side forced close-down very
well. Our implementation tries its best to handle such situations smoothly.
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.timeout</name> |
| <value>60000</value> |
<description>Default timeout for the ftp client socket, in milliseconds.
| Please also see ftp.keep.connection below.</description> |
| </property> |
| |
| <property> |
| <name>ftp.server.timeout</name> |
| <value>100000</value> |
<description>An estimate of the ftp server idle time, in milliseconds.
Typically it is 120000 milliseconds for many ftp servers out there.
Better to be conservative here. Together with ftp.timeout, it is used to
decide if we need to delete (annihilate) the current ftp.client instance and
force a new ftp.client instance to start. This is necessary because
a fetcher thread may not be able to obtain the next request from the queue in
time (due to idleness) before our ftp client times out or the remote server
disconnects. Used only when ftp.keep.connection is true (please see below).
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.keep.connection</name> |
| <value>false</value> |
<description>Whether to keep the ftp connection. Useful if crawling the same
host again and again. When set to true, it avoids connection, login and dir
list parser setup for subsequent URLs. If it is set to true, however, you must
make sure (roughly) that:
(1) ftp.timeout is less than ftp.server.timeout
(2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
Otherwise there will be too many "delete client because idled too long"
messages in the thread logs. A sanity check with the default values follows
below.</description>
| </property> |
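<!-- A sanity check of the two constraints above against the defaults in this
     file: (1) ftp.timeout (60000 ms) is less than ftp.server.timeout
     (100000 ms); (2) fetcher.threads.fetch * fetcher.server.delay
     = 10 * 5.0 s = 50 s = 50000 ms, which is less than ftp.timeout (60000 ms).
     Both constraints hold, so the shipped defaults are compatible with
     ftp.keep.connection=true. -->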
| |
| <property> |
| <name>ftp.follow.talk</name> |
| <value>false</value> |
| <description>Whether to log dialogue between our client and remote |
| server. Useful for debugging.</description> |
| </property> |
| |
| <!-- web db properties --> |
| |
| <property> |
| <name>db.fetch.interval.default</name> |
| <value>2592000</value> |
| <description>The default number of seconds between re-fetches of a page (30 days). |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.interval.max</name> |
| <value>7776000</value> |
<description>The maximum number of seconds between re-fetches of a page
(90 days). After this period every page in the db will be re-tried, no
matter what its status is.
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.class</name> |
| <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
| <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
| adds the original fetchInterval to the last fetch time, regardless of |
| page changes.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.inc_rate</name> |
| <value>0.4</value> |
| <description>If a page is unmodified, its fetchInterval will be |
| increased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.dec_rate</name> |
| <value>0.2</value> |
| <description>If a page is modified, its fetchInterval will be |
| decreased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
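<!-- A worked example, assuming the adaptive schedule applies these rates
     multiplicatively (interval * (1 + inc_rate) after an unmodified fetch,
     interval * (1 - dec_rate) after a modified one; a sketch of the
     behaviour, not a statement of the exact implementation): starting from
     db.fetch.interval.default = 2592000 s (30 days), an unmodified page moves
     to 2592000 * 1.4 = 3628800 s (42 days), while a modified page moves to
     2592000 * 0.8 = 2073600 s (24 days), always clamped between the
     min_interval and max_interval values below. -->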
| |
| <property> |
| <name>db.fetch.schedule.adaptive.min_interval</name> |
| <value>60.0</value> |
| <description>Minimum fetchInterval, in seconds.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.max_interval</name> |
| <value>31536000.0</value> |
| <description>Maximum fetchInterval, in seconds (365 days). |
| NOTE: this is limited by db.fetch.interval.max. Pages with |
| fetchInterval larger than db.fetch.interval.max |
| will be fetched anyway.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta</name> |
| <value>true</value> |
<description>If true, try to synchronize with the time of page change
by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
between the last modification time and the last fetch time.</description>
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
| <value>0.3</value> |
| <description>See sync_delta for description. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.mime.file</name> |
| <value>adaptive-mimetypes.txt</value> |
| <description>The configuration file for the MimeAdaptiveFetchSchedule. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.additions.allowed</name> |
| <value>true</value> |
<description>If true, updatedb will add newly discovered URLs; if false,
only already existing URLs in the CrawlDb will be updated and no new
URLs will be added.
| </description> |
| </property> |
| |
| <property> |
| <name>db.preserve.backup</name> |
| <value>true</value> |
| <description>If true, updatedb will keep a backup of the previous CrawlDB |
| version in the old directory. In case of disaster, one can rename old to |
| current and restore the CrawlDB to its previous state. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.purge.404</name> |
| <value>false</value> |
<description>If true, updatedb will purge records with status DB_GONE
from the CrawlDB.
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of inlinks to take into account when updating |
| a URL score in the crawlDB. Only the best scoring inlinks are kept. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.internal.links</name> |
| <value>true</value> |
| <description>If true, when adding new links to a page, links from |
| the same host are ignored. This is an effective way to limit the |
| size of the link database, keeping only the highest quality |
| links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.external.links</name> |
| <value>false</value> |
| <description>If true, outlinks leading from a page to external hosts |
| will be ignored. This is an effective way to limit the crawl to include |
| only initially injected hosts, without creating complex URLFilters. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.injector.overwrite</name> |
| <value>false</value> |
| <description>Whether existing records in the CrawlDB will be overwritten |
| by injected records. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.injector.update</name> |
| <value>false</value> |
<description>If true, existing records in the CrawlDB will be updated with
| injected records. Old meta data is preserved. The db.injector.overwrite |
| parameter has precedence. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.injected</name> |
| <value>1.0</value> |
| <description>The score of new pages added by the injector. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.external</name> |
| <value>1.0</value> |
| <description>The score factor for new pages added due to a link from |
| another host relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of external links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.internal</name> |
| <value>1.0</value> |
| <description>The score factor for pages added due to a link from the |
| same host, relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of internal links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.count.filtered</name> |
| <value>false</value> |
<description>The score value passed to newly discovered pages is
calculated as a fraction of the original page score divided by the
number of outlinks. If this option is false, only the outlinks that passed
the URLFilters will count; if it's true, then all outlinks will count.
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of Inlinks per URL to be kept in LinkDb. |
| If "invertlinks" finds more inlinks than this number, only the first |
| N inlinks will be stored, and the rest will be discarded. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.outlinks.per.page</name> |
| <value>100</value> |
| <description>The maximum number of outlinks that we'll process for a page. |
| If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
| will be processed for a page; otherwise, all outlinks will be processed. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.anchor.length</name> |
| <value>100</value> |
| <description>The maximum number of characters permitted in an anchor. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.parsemeta.to.crawldb</name> |
| <value></value> |
| <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). |
| Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' |
| will copy both the key 'lang' and its value to the corresponding entry in the crawldb. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.retry.max</name> |
| <value>3</value> |
<description>The maximum number of times a URL that has encountered
recoverable errors is generated for fetch.</description>
| </property> |
| |
| <property> |
| <name>db.signature.class</name> |
| <value>org.apache.nutch.crawl.MD5Signature</value> |
| <description>The default implementation of a page signature. Signatures |
| created with this implementation will be used for duplicate detection |
| and removal.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.min_token_len</name> |
| <value>2</value> |
| <description>Minimum token length to be included in the signature. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.quant_rate</name> |
| <value>0.01</value> |
| <description>Profile frequencies will be rounded down to a multiple of |
| QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
| frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
| for longer texts tokens with frequency 1 will always be discarded. |
| </description> |
| </property> |
| |
| <!-- generate properties --> |
| |
| <property> |
| <name>generate.max.count</name> |
| <value>-1</value> |
<description>The maximum number of urls in a single
fetchlist. -1 if unlimited. The urls are counted according
to the value of the parameter generate.count.mode.
| </description> |
| </property> |
| |
| <property> |
| <name>generate.count.mode</name> |
| <value>host</value> |
<description>Determines how the URLs are counted for generate.max.count.
Default value is 'host' but can be 'domain'. Note that we do not count
per IP in the new version of the Generator.
| </description> |
| </property> |
| |
| <property> |
| <name>generate.update.crawldb</name> |
| <value>false</value> |
<description>For highly-concurrent environments, where several
generate/fetch/update cycles may overlap, setting this to true ensures
that generate will create different fetchlists even without intervening
updatedb runs, at the cost of running an additional job to update the CrawlDB.
If false, running generate twice without an intervening
updatedb will generate identical fetchlists.</description>
| </property> |
| |
| <property> |
| <name>generate.min.score</name> |
| <value>0</value> |
| <description>Select only entries with a score larger than |
| generate.min.score.</description> |
| </property> |
| |
| <property> |
| <name>generate.min.interval</name> |
| <value>-1</value> |
| <description>Select only entries with a retry interval lower than |
| generate.min.interval. A value of -1 disables this check.</description> |
| </property> |
| |
| <!-- urlpartitioner properties --> |
| |
| <property> |
| <name>partition.url.mode</name> |
| <value>byHost</value> |
| <description>Determines how to partition URLs. Default value is 'byHost', |
| also takes 'byDomain' or 'byIP'. |
| </description> |
| </property> |
| |
| <property> |
| <name>crawl.gen.delay</name> |
| <value>604800000</value> |
| <description> |
| This value, expressed in milliseconds, defines how long we should keep the lock on records |
| in CrawlDb that were just selected for fetching. If these records are not updated |
in the meantime, the lock is canceled, i.e. they become eligible for selection again.
The default value is 7 days (604800000 ms).
| </description> |
| </property> |
| |
| <!-- fetcher properties --> |
| |
| <property> |
| <name>fetcher.server.delay</name> |
| <value>5.0</value> |
| <description>The number of seconds the fetcher will delay between |
| successive requests to the same server.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.server.min.delay</name> |
| <value>0.0</value> |
| <description>The minimum number of seconds the fetcher will delay between |
| successive requests to the same server. This value is applicable ONLY |
| if fetcher.threads.per.host is greater than 1 (i.e. the host blocking |
| is turned off).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.crawl.delay</name> |
| <value>30</value> |
| <description> |
| If the Crawl-Delay in robots.txt is set to greater than this value (in |
| seconds) then the fetcher will skip this page, generating an error report. |
| If set to -1 the fetcher will never skip such pages and will wait the |
| amount of time retrieved from robots.txt Crawl-Delay, however long that |
| might be. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.fetch</name> |
| <value>10</value> |
| <description>The number of FetcherThreads the fetcher should use. |
This also determines the maximum number of requests that are
made at once (each FetcherThread handles one connection). The total
number of threads running in distributed mode will be the number of
fetcher threads * number of nodes, as the fetcher has one map task per node.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.per.queue</name> |
| <value>1</value> |
| <description>This number is the maximum number of threads that |
| should be allowed to access a queue at one time. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.mode</name> |
| <value>byHost</value> |
| <description>Determines how to put URLs into queues. Default value is 'byHost', |
| also takes 'byDomain' or 'byIP'. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.verbose</name> |
| <value>false</value> |
| <description>If true, fetcher will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.parse</name> |
| <value>false</value> |
| <description>If true, fetcher will parse content. Default is false, which means |
| that a separate parsing step is required after fetching is finished.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.store.content</name> |
| <value>true</value> |
| <description>If true, fetcher will store content.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.timelimit.mins</name> |
| <value>-1</value> |
| <description>This is the number of minutes allocated to the fetching. |
| Once this value is reached, any remaining entry from the input URL list is skipped |
| and all active queues are emptied. The default value of -1 deactivates the time limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.exceptions.per.queue</name> |
| <value>-1</value> |
| <description>The maximum number of protocol-level exceptions (e.g. timeouts) per |
| host (or IP) queue. Once this value is reached, any remaining entries from this |
| queue are purged, effectively stopping the fetching from this host/IP. The default |
| value of -1 deactivates this limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.pages</name> |
| <value>-1</value> |
<description>The threshold of minimum pages per second. If the fetcher downloads fewer
pages per second than the configured threshold, the fetcher stops, preventing slow queues
from stalling the throughput. This threshold must be an integer. This can be useful when
fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.retries</name> |
| <value>5</value> |
| <description>The number of times the fetcher.throughput.threshold is allowed to be exceeded. |
This setting prevents accidental slowdowns from immediately killing the fetcher thread.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.throughput.threshold.check.after</name> |
| <value>5</value> |
| <description>The number of minutes after which the throughput check is enabled.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.timeout.divisor</name> |
| <value>2</value> |
<description>(EXPERT) The thread time-out divisor to use. By default threads have a time-out
value of mapred.task.timeout / 2. Increase this setting if the fetcher waits too
long before killing hung threads. Be careful: a setting that is too high (8 or more) will
most likely kill the fetcher threads prematurely.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.queue.depth.multiplier</name> |
| <value>50</value> |
<description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
(see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
is not optimal.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.depth</name> |
| <value>-1</value> |
<description>(EXPERT) When fetcher.parse is true and this value is greater than 0, the fetcher will extract outlinks
and follow them until the desired depth is reached. A value of 1 means all generated pages are fetched and their first-degree
outlinks are fetched and parsed too. Be careful: this feature is in itself agnostic of the state of the CrawlDB and does not
know about already-fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle.
It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same
domain. When that is disabled (false) the feature is likely to follow duplicates even when depth=1.
A value of -1 or 0 disables this feature.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.num.links</name> |
| <value>4</value> |
<description>(EXPERT) The number of outlinks to follow when fetcher.follow.outlinks.depth is enabled. Be careful, this can multiply
the total number of pages to fetch. This works together with fetcher.follow.outlinks.depth.divisor; with the default settings the
number of followed outlinks at depth 1 is 8, not 4.
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.follow.outlinks.depth.divisor</name> |
| <value>2</value> |
<description>(EXPERT) The divisor of fetcher.follow.outlinks.num.links per fetcher.follow.outlinks.depth. This decreases the number
of outlinks to follow with increasing depth. The formula used is: outlinks = floor(divisor / depth * num.links); a few worked
values are shown below. This prevents exponential growth of the fetch list.
| </description> |
| </property> |
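<!-- Worked values for the formula above with the defaults (num.links = 4,
     divisor = 2):
       depth 1: floor(2/1 * 4) = 8
       depth 2: floor(2/2 * 4) = 4
       depth 3: floor(2/3 * 4) = 2
       depth 4: floor(2/4 * 4) = 2
     which matches the note under fetcher.follow.outlinks.num.links that
     depth 1 follows 8 outlinks, not 4. -->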
| |
| <property> |
| <name>fetcher.follow.outlinks.ignore.external</name> |
| <value>true</value> |
| <description>Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks |
| in the output but not follow them. If db.ignore.external.links is true this directive is ignored. |
| </description> |
| </property> |
| |
| <!-- moreindexingfilter plugin properties --> |
| |
| <property> |
| <name>moreIndexingFilter.indexMimeTypeParts</name> |
| <value>true</value> |
<description>Determines whether the index-more plugin will split the mime-type
into sub parts; this requires the type field to be multi-valued. Set to true for backward
compatibility. If false, the mime-type will not be split.
| </description> |
| </property> |
| |
| <property> |
| <name>moreIndexingFilter.mapMimeTypes</name> |
| <value>false</value> |
<description>Determines whether MIME-type mapping is enabled. It takes a
plain text file with mapped MIME-types. With it the user can map both
application/xhtml+xml and text/html to the same target MIME-type so they
can be treated equally in an index. See conf/contenttype-mapping.txt.
| </description> |
| </property> |
| |
| <!-- AnchorIndexing filter plugin properties --> |
| |
| <property> |
| <name>anchorIndexingFilter.deduplicate</name> |
| <value>false</value> |
<description>With this enabled the indexer will deduplicate anchors
case-insensitively before indexing. This prevents possibly hundreds or thousands
of identical anchors for a given page from being indexed, but will affect the
search scoring (i.e. tf=1.0f).
| </description> |
| </property> |
| |
| <!-- indexingfilter plugin properties --> |
| |
| <property> |
| <name>indexingfilter.order</name> |
| <value></value> |
| <description>The order by which index filters are applied. |
| If empty, all available index filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
| then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
| |
Filter ordering might have an impact on the result if one filter depends on the
output of another filter.
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.score.power</name> |
| <value>0.5</value> |
<description>Determines the power of link analysis scores. Each
page's boost is set to <i>score<sup>scorePower</sup></i> where
| <i>score</i> is its link analysis score and <i>scorePower</i> is the |
| value of this parameter. This is compiled into indexes, so, when |
| this is changed, pages must be re-indexed for it to take |
| effect.</description> |
| </property> |
| |
| <property> |
| <name>indexer.max.title.length</name> |
| <value>100</value> |
| <description>The maximum number of characters of a title that are indexed. A value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.max.content.length</name> |
| <value>-1</value> |
<description>The maximum number of characters of content that are indexed.
| Content beyond the limit is truncated. A value of -1 disables this check. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.add.domain</name> |
| <value>false</value> |
| <description>Whether to add the domain field to a NutchDocument.</description> |
| </property> |
| |
| <property> |
| <name>indexer.skip.notmodified</name> |
| <value>false</value> |
| <description>Whether the indexer will skip records with a db_notmodified status. |
| </description> |
| </property> |
| |
| <!-- URL normalizer properties --> |
| |
| <property> |
| <name>urlnormalizer.order</name> |
| <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
| <description>Order in which normalizers will run. If any of these isn't |
| activated it will be silently skipped. If other normalizers not on the |
| list are activated, they will run in random order after the ones |
| specified here are run. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.regex.file</name> |
| <value>regex-normalize.xml</value> |
| <description>Name of the config file used by the RegexUrlNormalizer class. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.loop.count</name> |
| <value>1</value> |
| <description>Optionally loop through normalizers several times, to make |
| sure that all transformations have been performed. |
| </description> |
| </property> |
| |
| <!-- mime properties --> |
| |
| <!-- |
| <property> |
| <name>mime.types.file</name> |
| <value>tika-mimetypes.xml</value> |
| <description>Name of file in CLASSPATH containing filename extension and |
| magic sequence to mime types mapping information. Overrides the default Tika config |
| if specified. |
| </description> |
| </property> |
| --> |
| |
| <property> |
| <name>mime.type.magic</name> |
| <value>true</value> |
<description>Defines whether the mime content-type detector uses magic resolution.
| </description> |
| </property> |
| |
| <!-- plugin properties --> |
| |
| <property> |
| <name>plugin.folders</name> |
| <value>plugins</value> |
| <description>Directories where nutch plugins are located. Each |
| element may be a relative or absolute path. If absolute, it is used |
| as is. If relative, it is searched for on the classpath.</description> |
| </property> |
| |
| <property> |
| <name>plugin.auto-activation</name> |
| <value>true</value> |
<description>Defines whether plugins that are not activated according to
the plugin.includes and plugin.excludes properties must be automatically
activated if they are needed by some activated plugins.
| </description> |
| </property> |
| |
| <property> |
| <name>plugin.includes</name> |
| <value>protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value> |
| <description>Regular expression naming plugin directory names to |
| include. Any plugin not matching this expression is excluded. |
In any case you need to at least include the nutch-extensionpoints plugin. By
| default Nutch includes crawling just HTML and plain text via HTTP, |
| and basic indexing and search plugins. In order to use HTTPS please enable |
| protocol-httpclient, but be aware of possible intermittent problems with the |
| underlying commons-httpclient library. |
| </description> |
| </property> |
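<!-- For example, to fetch HTTPS URLs one might swap protocol-http for
     protocol-httpclient in nutch-site.xml, keeping the rest of the default
     list (a sketch; tailor the list to the plugins you actually need):

<property>
  <name>plugin.includes</name>
  <value>protocol-httpclient|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
</property>
-->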
| |
| <property> |
| <name>plugin.excludes</name> |
| <value></value> |
| <description>Regular expression naming plugin directory names to exclude. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlmeta.tags</name> |
| <value></value> |
| <description> |
To be used in conjunction with features introduced in NUTCH-655, which allow
custom metatags to be injected alongside your crawl URLs. Specifying those
custom tags here will allow for their propagation into a page's outlinks, as
well as allow for them to be included as part of an index.
Values should be comma-delimited ("tag1,tag2,tag3"). Do not pad the tags with
white-space at their boundaries if you are using anything earlier than Hadoop-0.21.
| </description> |
| </property> |
| |
| <!-- parser properties --> |
| |
| <property> |
| <name>parse.plugin.file</name> |
| <value>parse-plugins.xml</value> |
| <description>The name of the file that defines the associations between |
| content-types and parsers.</description> |
| </property> |
| |
| <property> |
| <name>parser.character.encoding.default</name> |
| <value>windows-1252</value> |
<description>The character encoding to fall back to when no other information
is available.</description>
| </property> |
| |
| <property> |
| <name>encodingdetector.charset.min.confidence</name> |
| <value>-1</value> |
<description>An integer between 0-100 indicating the minimum confidence value
for charset auto-detection. Any negative value disables auto-detection.
| </description> |
| </property> |
| |
| <property> |
| <name>parser.caching.forbidden.policy</name> |
| <value>content</value> |
| <description>If a site (or a page) requests through its robot metatags |
| that it should not be shown as cached content, apply this policy. Currently |
| three keywords are recognized: "none" ignores any "noarchive" directives. |
| "content" doesn't show the content, but shows summaries (snippets). |
| "all" doesn't show either content or summaries.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.impl</name> |
| <value>neko</value> |
| <description>HTML Parser implementation. Currently the following keywords |
| are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.html.form.use_action</name> |
| <value>false</value> |
| <description>If true, HTML parser will collect URLs from form action |
| attributes. This may lead to undesirable behavior (submitting empty |
| forms during next fetch cycle). If false, form action attribute will |
| be ignored.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.outlinks.ignore_tags</name> |
| <value></value> |
<description>Comma-separated list of HTML tags from which outlinks
shouldn't be extracted. Nutch takes links from: a, area, form, frame,
iframe, script, link, img. If you add any of those tags here, links from
it won't be taken. Default is an empty list. A reasonable value
for most people would be "img,script,link".</description>
| </property> |
| |
| <property> |
| <name>parser.fix.embeddedparams</name> |
| <value>true</value> |
| <description>Whether to fix URL embedded params using semi-colons. |
| See NUTCH-436 and NUTCH-1115</description> |
| </property> |
| |
| <property> |
| <name>htmlparsefilter.order</name> |
| <value></value> |
| <description>The order by which HTMLParse filters are applied. |
| If empty, all available HTMLParse filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. |
| HTMLParse filter ordering MAY have an impact |
| on end result, as some filters could rely on the metadata generated by a previous filter. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.timeout</name> |
| <value>30</value> |
<description>Timeout in seconds for the parsing of a document; otherwise it is treated
as an exception and the parser moves on to the following documents. This parameter is
applied to any Parser implementation.
| Set to -1 to deactivate, bearing in mind that this could cause |
| the parsing to crash because of a very long or corrupted document. |
| </description> |
| </property> |
| |
| <property> |
| <name>parse.filter.urls</name> |
| <value>true</value> |
| <description>Whether the parser will filter URLs (with the configured URL filters).</description> |
| </property> |
| |
| <property> |
| <name>parse.normalize.urls</name> |
| <value>true</value> |
| <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description> |
| </property> |
| |
| <property> |
| <name>parser.skip.truncated</name> |
| <value>true</value> |
<description>Boolean value for whether we should skip parsing of truncated documents. By default this
property is activated due to the extremely high CPU load that parsing can sometimes incur.
| </description> |
| </property> |
| |
| <!-- urlfilter plugin properties --> |
| |
| <property> |
| <name>urlfilter.domain.file</name> |
| <value>domain-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing either top level domains or |
| hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.regex.file</name> |
| <value>regex-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-regex (RegexURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.automaton.file</name> |
| <value>automaton-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.prefix.file</name> |
| <value>prefix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url prefixes |
| used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.suffix.file</name> |
| <value>suffix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url suffixes |
| used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.order</name> |
| <value></value> |
| <description>The order by which url filters are applied. |
| If empty, all available url filters (as dictated by properties |
| plugin-includes and plugin-excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
| then RegexURLFilter is applied first, and PrefixURLFilter second. |
| Since all filters are AND'ed, filter ordering does not have impact |
| on end result, but it may have performance implication, depending |
| on relative expensiveness of filters. |
| </description> |
| </property> |
| |
| <!-- scoring filters properties --> |
| |
| <property> |
| <name>scoring.filter.order</name> |
| <value></value> |
| <description>The order in which scoring filters are applied. |
| This may be left empty (in which case all available scoring |
| filters will be applied in the order defined in plugin-includes |
| and plugin-excludes), or a space separated list of implementation |
| classes. |
| </description> |
| </property> |
| |
| <!-- scoring-depth properties |
| Add 'scoring-depth' to the list of active plugins |
| in the parameter 'plugin.includes' in order to use it. |
| --> |
| |
| <property> |
| <name>scoring.depth.max</name> |
| <value>1000</value> |
| <description>Max depth value from seed allowed by default. |
Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE"
as seed metadata. This plugin adds a "_depth_" metadatum to the pages
to track the distance from the seed where they were found.
| The depth is used to prioritise URLs in the generation step so that |
| shallower pages are fetched first. |
| </description> |
| </property> |
| |
| <!-- language-identifier plugin properties --> |
| |
| <property> |
| <name>lang.analyze.max.length</name> |
| <value>2048</value> |
<description>The maximum number of bytes of data to use to identify
the language (0 means full content analysis).
The larger this value is, the better the analysis, but the
slower it is.
| </description> |
| </property> |
| |
| <property> |
| <name>lang.extraction.policy</name> |
| <value>detect,identify</value> |
| <description>This determines when the plugin uses detection and |
| statistical identification mechanisms. The order in which the |
| detect and identify are written will determine the extraction |
policy. The default case (detect,identify) means the plugin will
first try to extract language info from page headers and metadata;
if this is not successful it will try using the Tika language
identification. Possible values are:
| detect |
| identify |
| detect,identify |
| identify,detect |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.identification.only.certain</name> |
| <value>false</value> |
| <description>If set to true with lang.extraction.policy containing identify, |
| the language code returned by Tika will be assigned to the document ONLY |
| if it is deemed certain by Tika. |
| </description> |
| </property> |
| |
| <!-- index-static plugin properties --> |
| |
| <property> |
| <name>index.static</name> |
| <value></value> |
| <description> |
A simple plugin, called at indexing time, that adds fields with static data.
You can specify a list of fieldname:fieldcontent pairs per Nutch job.
It can be useful when collections can't be created by URL patterns,
like in subcollection, but only on a per-job basis. A sketch of a
possible value follows below.
| </description> |
| </property> |
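<!-- Assuming comma-separated fieldname:fieldcontent pairs, a possible value
     might look as follows; the field names and contents are placeholders:

<property>
  <name>index.static</name>
  <value>source:nutch,collection:testdata</value>
</property>
-->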
| |
| <!-- index-metadata plugin properties --> |
| |
| <property> |
| <name>index.parse.md</name> |
| <value>metatag.description,metatag.keywords</value> |
| <description> |
| Comma-separated list of keys to be taken from the parse metadata to generate fields. |
Can be used e.g. for 'description' or 'keywords', provided that these values are generated
by a parser (see the parse-metatags plugin).
| </description> |
| </property> |
| |
| <property> |
| <name>index.content.md</name> |
| <value></value> |
| <description> |
| Comma-separated list of keys to be taken from the content metadata to generate fields. |
| </description> |
| </property> |
| |
| <property> |
| <name>index.db.md</name> |
| <value></value> |
| <description> |
| Comma-separated list of keys to be taken from the crawldb metadata to generate fields. |
| Can be used to index values propagated from the seeds with the plugin urlmeta |
| </description> |
| </property> |
| |
| <!-- parse-metatags plugin properties --> |
| <property> |
| <name>metatags.names</name> |
| <value>description;keywords</value> |
<description>Names of the metatags to extract, separated by ';'.
Use '*' to extract all metatags. The names are prefixed with 'metatag.'
in the parse-metadata. For instance, to index description and keywords,
you need to activate the plugin index-metadata and set the value of the
parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
A combined example follows below.
| </description> |
| </property> |
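<!-- The recipe from the description above, put into nutch-site.xml form
     (a sketch; it assumes the parse-metatags and index-metadata plugins are
     both listed in plugin.includes):

<property>
  <name>metatags.names</name>
  <value>description;keywords</value>
</property>
<property>
  <name>index.parse.md</name>
  <value>metatag.description,metatag.keywords</value>
</property>
-->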
| |
| <!-- Temporary Hadoop 0.17.x workaround. --> |
| |
| <property> |
| <name>hadoop.job.history.user.location</name> |
| <value>${hadoop.log.dir}/history/user</value> |
| <description>Hadoop 0.17.x comes with a default setting to create |
| user logs inside the output path of the job. This breaks some |
| Hadoop classes, which expect the output to contain only |
| part-XXXXX files. This setting changes the output to a |
| subdirectory of the regular log directory. |
| </description> |
| </property> |
| |
| <!-- linkrank scoring properties --> |
| |
| <property> |
| <name>link.ignore.internal.host</name> |
| <value>true</value> |
| <description>Ignore outlinks to the same hostname.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.internal.domain</name> |
| <value>true</value> |
| <description>Ignore outlinks to the same domain.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.limit.page</name> |
| <value>true</value> |
| <description>Limit to only a single outlink to the same page.</description> |
| </property> |
| |
| <property> |
| <name>link.ignore.limit.domain</name> |
| <value>true</value> |
| <description>Limit to only a single outlink to the same domain.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.num.iterations</name> |
| <value>10</value> |
| <description>The number of LinkRank iterations to run.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.initial.score</name> |
| <value>1.0f</value> |
| <description>The initial score.</description> |
| </property> |
| |
| <property> |
| <name>link.analyze.damping.factor</name> |
| <value>0.85f</value> |
| <description>The damping factor.</description> |
| </property> |
| |
| <property> |
| <name>link.delete.gone</name> |
| <value>false</value> |
| <description>Whether to delete gone pages from the web graph.</description> |
| </property> |
| |
| <property> |
| <name>link.loops.depth</name> |
| <value>2</value> |
| <description>The depth for the loops algorithm.</description> |
| </property> |
| |
| <property> |
| <name>link.score.updater.clear.score</name> |
| <value>0.0f</value> |
<description>The default score for URLs that are not in the web graph.</description>
| </property> |
| |
| <property> |
| <name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name> |
| <value>false</value> |
| <description>Hadoop >= 0.21 generates SUCCESS files in the output which can crash |
| the readers. This should not be an issue once Nutch is ported to the new MapReduce API |
| but for now this parameter should prevent such cases. |
| </description> |
| </property> |
| |
| <!-- solr index properties --> |
| |
| <property> |
| <name>solr.mapping.file</name> |
| <value>solrindex-mapping.xml</value> |
| <description> |
| Defines the name of the file that will be used in the mapping of internal |
| nutch field names to solr index fields as specified in the target Solr schema. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.commit.size</name> |
| <value>250</value> |
| <description> |
| Defines the number of documents to send to Solr in a single update batch. |
| Decrease when handling very large documents to prevent Nutch from running |
| out of memory. NOTE: It does not explicitly trigger a server side commit. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.commit.index</name> |
| <value>true</value> |
| <description> |
| When closing the indexer, trigger a commit to the Solr server. |
| </description> |
| </property> |
| |
| <property> |
| <name>solr.auth</name> |
| <value>false</value> |
| <description> |
| Whether to enable HTTP basic authentication for communicating with Solr. |
| Use the solr.auth.username and solr.auth.password properties to configure |
| your credentials. |
| </description> |
| </property> |
| |
| <!-- Elasticsearch properties --> |
| |
| <property> |
| <name>elastic.host</name> |
| <value></value> |
<description>The hostname to send documents to using TransportClient. Either host
and port, or cluster, must be defined.</description>
| </property> |
| |
| <property> |
| <name>elastic.port</name> |
<value>9300</value>
<description>The port to connect to using TransportClient.</description>
| </property> |
| |
| <property> |
| <name>elastic.cluster</name> |
| <value></value> |
<description>The cluster name to discover. Either host and port, or cluster,
must be defined.</description>
| </property> |
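<!-- A minimal Elasticsearch indexer sketch for nutch-site.xml: define either
     host plus port, or cluster, not both (the hostname below is a
     placeholder):

<property>
  <name>elastic.host</name>
  <value>localhost</value>
</property>
<property>
  <name>elastic.port</name>
  <value>9300</value>
</property>
-->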
| |
| <property> |
| <name>elastic.index</name> |
| <value>nutch</value> |
| <description>Default index to send documents to.</description> |
| </property> |
| |
| <property> |
| <name>elastic.max.bulk.docs</name> |
| <value>250</value> |
| <description>Maximum size of the bulk in number of documents.</description> |
| </property> |
| |
| <property> |
| <name>elastic.max.bulk.size</name> |
| <value>2500500</value> |
| <description>Maximum size of the bulk in bytes.</description> |
| </property> |
| |
| <!-- subcollection properties --> |
| |
| <property> |
| <name>subcollection.default.field</name> |
| <value>subcollection</value> |
| <description> |
| The default field name for the subcollections. |
| </description> |
| </property> |
| |
| <!-- Headings plugin properties --> |
| |
| <property> |
| <name>headings</name> |
| <value>h1,h2</value> |
<description>Comma-separated list of headings to retrieve from the document.</description>
| </property> |
| |
| <property> |
| <name>headings.multivalued</name> |
| <value>false</value> |
| <description>Whether to support multivalued headings.</description> |
| </property> |
| |
| </configuration> |