| <?xml version="1.0"?> |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <!-- Do not modify this file directly. Instead, copy entries that you --> |
| <!-- wish to modify from this file into nutch-site.xml and change them --> |
| <!-- there. If nutch-site.xml does not already exist, create it. --> |
| |
| <configuration> |
| |
| <!-- file properties --> |
| |
| <property> |
| <name>file.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content, in bytes. |
| If this value is nonnegative (>=0), content longer than it will be truncated; |
| otherwise, no truncation at all. |
| </description> |
| </property> |
| |
| <property> |
| <name>file.content.ignored</name> |
| <value>true</value> |
| <description>If true, no file content will be saved during fetch. |
| And it is probably what we want to set most of time, since file:// URLs |
| are meant to be local and we can always use them directly at parsing |
| and indexing stages. Otherwise file contents will be saved. |
| !! NOT IMPLEMENTED YET !! |
| </description> |
| </property> |
| |
| <!-- HTTP properties --> |
| |
| <property> |
| <name>http.agent.name</name> |
| <value></value> |
| <description>HTTP 'User-Agent' request header. MUST NOT be empty - |
| please set this to a single word uniquely related to your organization. |
| |
| NOTE: You should also check other related properties: |
| |
| http.robots.agents |
| http.agent.description |
| http.agent.url |
| http.agent.email |
| http.agent.version |
| |
| and set their values appropriately. |
| |
| </description> |
| </property> |
| |
| <property> |
| <name>http.robots.agents</name> |
| <value>*</value> |
| <description>The agent strings we'll look for in robots.txt files, |
| comma-separated, in decreasing order of precedence. You should |
| put the value of http.agent.name as the first agent name, and keep the |
| default * at the end of the list. E.g.: BlurflDev,Blurfl,* |
| </description> |
| </property> |
| |
| <property> |
| <name>http.robots.403.allow</name> |
| <value>true</value> |
| <description>Some servers return HTTP status 403 (Forbidden) if |
| /robots.txt doesn't exist. This should probably mean that we are |
| allowed to crawl the site nonetheless. If this is set to false, |
| then such sites will be treated as forbidden.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.description</name> |
| <value></value> |
| <description>Further description of our bot- this text is used in |
| the User-Agent header. It appears in parentheses after the agent name. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.url</name> |
| <value></value> |
| <description>A URL to advertise in the User-Agent header. This will |
| appear in parentheses after the agent name. Custom dictates that this |
| should be a URL of a page explaining the purpose and behavior of this |
| crawler. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.email</name> |
| <value></value> |
| <description>An email address to advertise in the HTTP 'From' request |
| header and User-Agent header. A good practice is to mangle this |
| address (e.g. 'info at example dot com') to avoid spamming. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.agent.version</name> |
| <value>Nutch-1.1</value> |
| <description>A version string to advertise in the User-Agent |
| header.</description> |
| </property> |
| |
| <property> |
| <name>http.agent.host</name> |
| <value></value> |
| <description>Name or IP address of the host on which the Nutch crawler |
| would be running. Currently this is used by 'protocol-httpclient' |
| plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.timeout</name> |
| <value>10000</value> |
| <description>The default network timeout, in milliseconds.</description> |
| </property> |
| |
| <property> |
| <name>http.max.delays</name> |
| <value>100</value> |
| <description>The number of times a thread will delay when trying to |
| fetch a page. Each time it finds that a host is busy, it will wait |
| fetcher.server.delay. After http.max.delays attempts, it will give |
| up on the page for now.</description> |
| </property> |
| |
| <property> |
| <name>http.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content, in bytes. |
| If this value is nonnegative (>=0), content longer than it will be truncated; |
| otherwise, no truncation at all. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.host</name> |
| <value></value> |
| <description>The proxy hostname. If empty, no proxy is used.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.port</name> |
| <value></value> |
| <description>The proxy port.</description> |
| </property> |
| |
| <property> |
| <name>http.proxy.username</name> |
| <value></value> |
| <description>Username for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| NOTE: For NTLM authentication, do not prefix the username with the |
| domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.password</name> |
| <value></value> |
| <description>Password for proxy. This will be used by |
| 'protocol-httpclient', if the proxy server requests basic, digest |
| and/or NTLM authentication. To use this, 'protocol-httpclient' must |
| be present in the value of 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.proxy.realm</name> |
| <value></value> |
| <description>Authentication realm for proxy. Do not define a value |
| if realm is not required or authentication should take place for any |
| realm. NTLM does not use the notion of realms. Specify the domain name |
| of NTLM authentication as the value for this property. To use this, |
| 'protocol-httpclient' must be present in the value of |
| 'plugin.includes' property. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.auth.file</name> |
| <value>httpclient-auth.xml</value> |
| <description>Authentication configuration file for |
| 'protocol-httpclient' plugin. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.verbose</name> |
| <value>false</value> |
| <description>If true, HTTP will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>http.redirect.max</name> |
| <value>0</value> |
| <description>The maximum number of redirects the fetcher will follow when |
| trying to fetch a page. If set to negative or 0, fetcher won't immediately |
| follow redirected URLs, instead it will record them for later fetching. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.useHttp11</name> |
| <value>false</value> |
| <description>NOTE: at the moment this works only for protocol-httpclient. |
| If true, use HTTP 1.1; if false, use HTTP 1.0. |
| </description> |
| </property> |
| |
| <property> |
| <name>http.accept.language</name> |
| <value>en-us,en-gb,en;q=0.7,*;q=0.3</value> |
| <description>Value of the "Accept-Language" request header field. |
| This allows selecting a non-English language as the default one to retrieve. |
| It is a useful setting for search engines built for a certain national group. |
| </description> |
| </property> |
| |
| <!-- FTP properties --> |
| |
| <property> |
| <name>ftp.username</name> |
| <value>anonymous</value> |
| <description>ftp login username.</description> |
| </property> |
| |
| <property> |
| <name>ftp.password</name> |
| <value>anonymous@example.com</value> |
| <description>ftp login password.</description> |
| </property> |
| |
| <property> |
| <name>ftp.content.limit</name> |
| <value>65536</value> |
| <description>The length limit for downloaded content, in bytes. |
| If this value is nonnegative (>=0), content longer than it will be truncated; |
| otherwise, no truncation at all. |
| Caution: classical ftp RFCs never define partial transfer and, in fact, |
| some ftp servers out there do not handle client side forced close-down very |
| well. Our implementation tries its best to handle such situations smoothly. |
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.timeout</name> |
| <value>60000</value> |
| <description>Default timeout for ftp client socket, in millisec. |
| Please also see ftp.keep.connection below.</description> |
| </property> |
| |
| <property> |
| <name>ftp.server.timeout</name> |
| <value>100000</value> |
| <description>An estimation of ftp server idle time, in millisec. |
| Typically it is 120000 millisec for many ftp servers out there. |
| Better be conservative here. Together with ftp.timeout, it is used to |
| decide if we need to delete (annihilate) current ftp.client instance and |
| force to start another ftp.client instance anew. This is necessary because |
| a fetcher thread may not be able to obtain next request from queue in time |
| (due to idleness) before our ftp client times out or remote server |
| disconnects. Used only when ftp.keep.connection is true (please see below). |
| </description> |
| </property> |
| |
| <property> |
| <name>ftp.keep.connection</name> |
| <value>false</value> |
| <description>Whether to keep ftp connection. Useful if crawling same host |
| again and again. When set to true, it avoids connection, login and dir list |
| parser setup for subsequent urls. If it is set to true, however, you must |
| make sure (roughly): |
| (1) ftp.timeout is less than ftp.server.timeout |
| (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay) |
| Otherwise there will be too many "delete client because idled too long" |
| messages in thread logs.</description> |
| </property> |
| |
| <property> |
| <name>ftp.follow.talk</name> |
| <value>false</value> |
| <description>Whether to log dialogue between our client and remote |
| server. Useful for debugging.</description> |
| </property> |
| |
| <!-- web db properties --> |
| |
| <property> |
| <name>db.default.fetch.interval</name> |
| <value>30</value> |
| <description>(DEPRECATED) The default number of days between re-fetches of a page. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.interval.default</name> |
| <value>2592000</value> |
| <description>The default number of seconds between re-fetches of a page (30 days). |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.interval.max</name> |
| <value>7776000</value> |
| <description>The maximum number of seconds between re-fetches of a page |
| (90 days). After this period every page in the db will be re-tried, no |
| matter what its status is. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.class</name> |
| <value>org.apache.nutch.crawl.DefaultFetchSchedule</value> |
| <description>The implementation of fetch schedule. DefaultFetchSchedule simply |
| adds the original fetchInterval to the last fetch time, regardless of |
| page changes.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.inc_rate</name> |
| <value>0.4</value> |
| <description>If a page is unmodified, its fetchInterval will be |
| increased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.dec_rate</name> |
| <value>0.2</value> |
| <description>If a page is modified, its fetchInterval will be |
| decreased by this rate. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.min_interval</name> |
| <value>60.0</value> |
| <description>Minimum fetchInterval, in seconds.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.max_interval</name> |
| <value>31536000.0</value> |
| <description>Maximum fetchInterval, in seconds (365 days). |
| NOTE: this is limited by db.fetch.interval.max. Pages with |
| fetchInterval larger than db.fetch.interval.max |
| will be fetched anyway.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta</name> |
| <value>true</value> |
| <description>If true, try to synchronize with the time of page change |
| by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference |
| between the last modification time and the last fetch time.</description> |
| </property> |
| |
| <property> |
| <name>db.fetch.schedule.adaptive.sync_delta_rate</name> |
| <value>0.3</value> |
| <description>See sync_delta for description. This value should not |
| exceed 0.5, otherwise the algorithm becomes unstable.</description> |
| </property> |
| |
| <property> |
| <name>db.update.additions.allowed</name> |
| <value>true</value> |
| <description>If true, updatedb will add newly discovered URLs, if false |
| only already existing URLs in the CrawlDb will be updated and no new |
| URLs will be added. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.update.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of inlinks to take into account when updating |
| a URL score in the crawlDB. Only the best scoring inlinks are kept. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.internal.links</name> |
| <value>true</value> |
| <description>If true, when adding new links to a page, links from |
| the same host are ignored. This is an effective way to limit the |
| size of the link database, keeping only the highest quality |
| links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.ignore.external.links</name> |
| <value>false</value> |
| <description>If true, outlinks leading from a page to external hosts |
| will be ignored. This is an effective way to limit the crawl to include |
| only initially injected hosts, without creating complex URLFilters. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.injected</name> |
| <value>1.0</value> |
| <description>The score of new pages added by the injector. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.external</name> |
| <value>1.0</value> |
| <description>The score factor for new pages added due to a link from |
| another host relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of external links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.link.internal</name> |
| <value>1.0</value> |
| <description>The score factor for pages added due to a link from the |
| same host, relative to the referencing page's score. Scoring plugins |
| may use this value to affect initial scores of internal links. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.score.count.filtered</name> |
| <value>false</value> |
| <description>The score value passed to newly discovered pages is |
| calculated as a fraction of the original page score divided by the |
| number of outlinks. If this option is false, only the outlinks that passed |
| URLFilters will count, if it's true then all outlinks will count. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.inlinks</name> |
| <value>10000</value> |
| <description>Maximum number of Inlinks per URL to be kept in LinkDb. |
| If "invertlinks" finds more inlinks than this number, only the first |
| N inlinks will be stored, and the rest will be discarded. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.outlinks.per.page</name> |
| <value>100</value> |
| <description>The maximum number of outlinks that we'll process for a page. |
| If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks |
| will be processed for a page; otherwise, all outlinks will be processed. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.max.anchor.length</name> |
| <value>100</value> |
| <description>The maximum number of characters permitted in an anchor. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.parsemeta.to.crawldb</name> |
| <value></value> |
| <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779). |
| Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang' |
| will copy both the key 'lang' and its value to the corresponding entry in the crawldb. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.fetch.retry.max</name> |
| <value>3</value> |
| <description>The maximum number of times a url that has encountered |
| recoverable errors is generated for fetch.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.class</name> |
| <value>org.apache.nutch.crawl.MD5Signature</value> |
| <description>The default implementation of a page signature. Signatures |
| created with this implementation will be used for duplicate detection |
| and removal.</description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.min_token_len</name> |
| <value>2</value> |
| <description>Minimum token length to be included in the signature. |
| </description> |
| </property> |
| |
| <property> |
| <name>db.signature.text_profile.quant_rate</name> |
| <value>0.01</value> |
| <description>Profile frequencies will be rounded down to a multiple of |
| QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token |
| frequency. If maxFreq > 1 then QUANT will be at least 2, which means that |
| for longer texts tokens with frequency 1 will always be discarded. |
| </description> |
| </property> |
| |
| <!-- generate properties --> |
| |
| <property> |
| <name>generate.max.count</name> |
| <value>-1</value> |
| <description>The maximum number of urls in a single |
| fetchlist. -1 if unlimited. The urls are counted according |
| to the value of the parameter generate.count.mode. |
| </description> |
| </property> |
| |
| <property> |
| <name>generate.count.mode</name> |
| <value>host</value> |
| <description>Determines how the URLs are counted for generate.max.count. |
| Default value is 'host' but can be 'domain'. Note that we do not count |
| per IP in the new version of the Generator. |
| </description> |
| </property> |
| |
| <property> |
| <name>generate.update.crawldb</name> |
| <value>false</value> |
| <description>For highly-concurrent environments, where several |
| generate/fetch/update cycles may overlap, setting this to true ensures |
| that generate will create different fetchlists even without intervening |
| updatedb-s, at the cost of running an additional job to update CrawlDB. |
| If false, running generate twice without intervening |
| updatedb will generate identical fetchlists.</description> |
| </property> |
| |
| <property> |
| <name>generate.max.per.host</name> |
| <value>-1</value> |
| <description>(Deprecated). Use generate.max.count and generate.count.mode instead. |
| The maximum number of urls per host in a single |
| fetchlist. -1 if unlimited.</description> |
| </property> |
| |
| <!-- urlpartitioner properties --> |
| <property> |
| <name>partition.url.mode</name> |
| <value>byHost</value> |
| <description>Determines how to partition URLs. Default value is 'byHost', |
| also takes 'byDomain' or 'byIP'. |
| </description> |
| </property> |
| |
| <property> |
| <name>crawl.gen.delay</name> |
| <value>604800000</value> |
| <description> |
| This value, expressed in milliseconds, defines how long we should keep the lock on records |
| in CrawlDb that were just selected for fetching. If these records are not updated |
| in the meantime, the lock is canceled, i.e. they become eligible for selecting. |
| Default value of this is 604800000 milliseconds (7 days). |
| </description> |
| </property> |
| |
| <!-- fetcher properties --> |
| |
| <property> |
| <name>fetcher.server.delay</name> |
| <value>5.0</value> |
| <description>The number of seconds the fetcher will delay between |
| successive requests to the same server.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.server.min.delay</name> |
| <value>0.0</value> |
| <description>The minimum number of seconds the fetcher will delay between |
| successive requests to the same server. This value is applicable ONLY |
| if fetcher.threads.per.host is greater than 1 (i.e. the host blocking |
| is turned off).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.crawl.delay</name> |
| <value>30</value> |
| <description> |
| If the Crawl-Delay in robots.txt is set to greater than this value (in |
| seconds) then the fetcher will skip this page, generating an error report. |
| If set to -1 the fetcher will never skip such pages and will wait the |
| amount of time retrieved from robots.txt Crawl-Delay, however long that |
| might be. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.fetch</name> |
| <value>10</value> |
| <description>The number of FetcherThreads the fetcher should use. |
| This also determines the maximum number of requests that are |
| made at once (each FetcherThread handles one connection).</description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.per.host</name> |
| <value>1</value> |
| <description>This number is the maximum number of threads that |
| should be allowed to access a host at one time.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.threads.per.host.by.ip</name> |
| <value>false</value> |
| <description>If true, then fetcher will count threads by IP address, |
| to which the URL's host name resolves. If false, only host name will be |
| used. NOTE: this should be set to the same value as |
| "generate.max.per.host.by.ip" - default settings are different only for |
| reasons of backward-compatibility.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.verbose</name> |
| <value>false</value> |
| <description>If true, fetcher will log more verbosely.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.parse</name> |
| <value>true</value> |
| <description>If true, fetcher will parse content.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.store.content</name> |
| <value>true</value> |
| <description>If true, fetcher will store content.</description> |
| </property> |
| |
| <property> |
| <name>fetcher.timelimit.mins</name> |
| <value>-1</value> |
| <description>This is the number of minutes allocated to the fetching. |
| Once this value is reached, any remaining entry from the input URL list is skipped |
| and all active queues are emptied. The default value of -1 deactivates the time limit. |
| </description> |
| </property> |
| |
| <property> |
| <name>fetcher.max.exceptions.per.queue</name> |
| <value>-1</value> |
| <description>The maximum number of protocol-level exceptions (e.g. timeouts) per |
| host (or IP) queue. Once this value is reached, any remaining entries from this |
| queue are purged, effectively stopping the fetching from this host/IP. The default |
| value of -1 deactivates this limit. |
| </description> |
| </property> |
| |
| <!-- indexer properties --> |
| |
| <property> |
| <name>indexer.score.power</name> |
| <value>0.5</value> |
| <description>Determines the power of link analysis scores. Each |
| page's boost is set to <i>score<sup>scorePower</sup></i> where |
| <i>score</i> is its link analysis score and <i>scorePower</i> is the |
| value of this parameter. This is compiled into indexes, so, when |
| this is changed, pages must be re-indexed for it to take |
| effect.</description> |
| </property> |
| |
| <property> |
| <name>indexer.max.title.length</name> |
| <value>100</value> |
| <description>The maximum number of characters of a title that are indexed. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.max.tokens</name> |
| <value>10000</value> |
| <description> |
| The maximum number of tokens that will be indexed for a single field |
| in a document. This limits the amount of memory required for |
| indexing, so that collections with very large files will not crash |
| the indexing process by running out of memory. |
| |
| Note that this effectively truncates large documents, excluding |
| from the index tokens that occur further in the document. If you |
| know your source documents are large, be sure to set this value |
| high enough to accommodate the expected size. If you set it to |
| -1, then the only limit is your memory, but you should anticipate |
| an OutOfMemoryError. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.mergeFactor</name> |
| <value>50</value> |
| <description>The factor that determines the frequency of Lucene segment |
| merges. This must not be less than 2, higher values increase indexing |
| speed but lead to increased RAM usage, and increase the number of |
| open file handles (which may lead to "Too many open files" errors). |
| NOTE: the "segments" here have nothing to do with Nutch segments, they |
| are a low-level data unit used by Lucene. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.minMergeDocs</name> |
| <value>50</value> |
| <description>This number determines the minimum number of Lucene |
| Documents buffered in memory between Lucene segment merges. Larger |
| values increase indexing speed and increase RAM usage. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.maxMergeDocs</name> |
| <value>2147483647</value> |
| <description>This number determines the maximum number of Lucene |
| Documents to be merged into a new Lucene segment. Larger values |
| increase batch indexing speed and reduce the number of Lucene segments, |
| which reduces the number of open file handles; however, this also |
| decreases incremental indexing performance. |
| </description> |
| </property> |
| |
| <property> |
| <name>indexer.termIndexInterval</name> |
| <value>128</value> |
| <description>Determines the fraction of terms which Lucene keeps in |
| RAM when searching, to facilitate random-access. Smaller values use |
| more memory but make searches somewhat faster. Larger values use |
| less memory but make searches somewhat slower. |
| </description> |
| </property> |
| |
| <!-- indexingfilter plugin properties --> |
| |
| <property> |
| <name>indexingfilter.order</name> |
| <value></value> |
| <description>The order by which index filters are applied. |
| If empty, all available index filters (as dictated by properties |
| plugin.includes and plugin.excludes) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter |
| then BasicIndexingFilter is applied first, and MoreIndexingFilter second. |
| |
| Filter ordering might have impact on result if one filter depends on output of |
| another filter. |
| </description> |
| </property> |
| |
| |
| <!-- analysis properties --> |
| |
| <property> |
| <name>analysis.common.terms.file</name> |
| <value>common-terms.utf8</value> |
| <description>The name of a file containing a list of common terms |
| that should be indexed in n-grams.</description> |
| </property> |
| |
| <!-- searcher properties --> |
| |
| <property> |
| <name>searcher.dir</name> |
| <value>crawl</value> |
| <description> |
| Path to root of crawl. This directory is searched (in |
| order) for either the file search-servers.txt, containing a list of |
| distributed search servers, or the directory "index" containing |
| merged indexes, or the directory "segments" containing segment |
| indexes. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.filter.cache.size</name> |
| <value>16</value> |
| <description> |
| Maximum number of filters to cache. Filters can accelerate certain |
| field-based queries, like language, document format, etc. Each |
| filter requires one bit of RAM per page. So, with a 10 million page |
| index, a cache size of 16 consumes two bytes per page, or 20MB. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.filter.cache.threshold</name> |
| <value>0.05</value> |
| <description> |
| Filters are cached when their term is matched by more than this |
| fraction of pages. For example, with a threshold of 0.05, and 10 |
| million pages, the term must match more than 1/20, or 500,000 pages. |
| So, if out of 10 million pages, 50% of pages are in English, and 2% |
| are in Finnish, then, with a threshold of 0.05, searches for |
| "lang:en" will use a cached filter, while searches for "lang:fi" |
| will score all 200,000 Finnish documents. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.hostgrouping.rawhits.factor</name> |
| <value>2.0</value> |
| <description> |
| A factor that is used to determine the number of raw hits |
| initially fetched, before host grouping is done. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.summary.context</name> |
| <value>5</value> |
| <description> |
| The number of context terms to display preceding and following |
| matching terms in a hit summary. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.summary.length</name> |
| <value>20</value> |
| <description> |
| The total number of terms to display in a hit summary. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.max.hits</name> |
| <value>-1</value> |
| <description>If positive, search stops after this many hits are |
| found. Setting this to small, positive values (e.g., 1000) can make |
| searches much faster. With a sorted index, the quality of the hits |
| suffers little.</description> |
| </property> |
| |
| <property> |
| <name>searcher.max.time.tick_count</name> |
| <value>-1</value> |
| <description>If positive value is defined here, limit search time for |
| every request to this number of elapsed ticks (see the tick_length |
| property below). The total maximum time for any search request will be |
| then limited to tick_count * tick_length milliseconds. When search time |
| is exceeded, partial results will be returned, and the total number of |
| hits will be estimated. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.max.time.tick_length</name> |
| <value>200</value> |
| <description>The number of milliseconds between ticks. Larger values |
| reduce the timer granularity (precision). Smaller values bring more |
| overhead. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.num.handlers</name> |
| <value>10</value> |
| <description>The number of handlers for the distributed search server. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.max.hits.per.page</name> |
| <value>1000</value> |
| <description> The maximum number of hits to show per page. -1 if |
| unlimited. If the number of hits requested by the user (via |
| hitsPerPage parameter in the query string) is more than the value |
| specified in this property, then this value is assumed as the number |
| of hits per page. |
| </description> |
| </property> |
| |
| <!-- URL normalizer properties --> |
| |
| <property> |
| <name>urlnormalizer.order</name> |
| <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value> |
| <description>Order in which normalizers will run. If any of these isn't |
| activated it will be silently skipped. If other normalizers not on the |
| list are activated, they will run in random order after the ones |
| specified here are run. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.regex.file</name> |
| <value>regex-normalize.xml</value> |
| <description>Name of the config file used by the RegexURLNormalizer class. |
| </description> |
| </property> |
| |
| <property> |
| <name>urlnormalizer.loop.count</name> |
| <value>1</value> |
| <description>Optionally loop through normalizers several times, to make |
| sure that all transformations have been performed. |
| </description> |
| </property> |
| |
| <!-- mime properties --> |
| |
| <property> |
| <name>mime.types.file</name> |
| <value>tika-mimetypes.xml</value> |
| <description>Name of file in CLASSPATH containing filename extension and |
| magic sequence to mime types mapping information</description> |
| </property> |
| |
| <property> |
| <name>mime.type.magic</name> |
| <value>true</value> |
| <description>Defines if the mime content type detector uses magic resolution. |
| </description> |
| </property> |
| |
| <!-- plugin properties --> |
| |
| <property> |
| <name>plugin.folders</name> |
| <value>plugins</value> |
| <description>Directories where nutch plugins are located. Each |
| element may be a relative or absolute path. If absolute, it is used |
| as is. If relative, it is searched for on the classpath.</description> |
| </property> |
| |
| <property> |
| <name>plugin.auto-activation</name> |
| <value>true</value> |
| <description>Defines whether plugins that are not activated by |
| the plugin.includes and plugin.excludes properties must be automatically |
| activated if they are needed by some activated plugins. |
| </description> |
| </property> |
| |
| <property> |
| <name>plugin.includes</name> |
| <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika)|index-(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value> |
| <description>Regular expression naming plugin directory names to |
| include. Any plugin not matching this expression is excluded. |
| In any case you need to at least include the nutch-extensionpoints plugin. By |
| default Nutch includes crawling just HTML and plain text via HTTP, |
| and basic indexing and search plugins. In order to use HTTPS please enable |
| protocol-httpclient, but be aware of possible intermittent problems with the |
| underlying commons-httpclient library. Nutch now also includes integration with Tika |
| to leverage Tika's parsing capabilities for multiple content types. The existing Nutch |
| parser implementations will likely be phased out in the next release or so, as such, it is |
| a good idea to begin migrating away from anything not provided by parse-tika. |
| </description> |
| </property> |
| |
| <property> |
| <name>plugin.excludes</name> |
| <value></value> |
| <description>Regular expression naming plugin directory names to exclude. |
| </description> |
| </property> |
| |
| <!-- parser properties --> |
| |
| <property> |
| <name>parse.plugin.file</name> |
| <value>parse-plugins.xml</value> |
| <description>The name of the file that defines the associations between |
| content-types and parsers.</description> |
| </property> |
| |
| <property> |
| <name>parser.character.encoding.default</name> |
| <value>windows-1252</value> |
| <description>The character encoding to fall back to when no other information |
| is available</description> |
| </property> |
| |
| <property> |
| <name>encodingdetector.charset.min.confidence</name> |
| <value>-1</value> |
| <description>An integer between 0 and 100 indicating minimum confidence value |
| for charset auto-detection. Any negative value disables auto-detection. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.caching.forbidden.policy</name> |
| <value>content</value> |
| <description>If a site (or a page) requests through its robot metatags |
| that it should not be shown as cached content, apply this policy. Currently |
| three keywords are recognized: "none" ignores any "noarchive" directives. |
| "content" doesn't show the content, but shows summaries (snippets). |
| "all" doesn't show either content or summaries.</description> |
| </property> |
| |
| |
| <property> |
| <name>parser.html.impl</name> |
| <value>neko</value> |
| <description>HTML Parser implementation. Currently the following keywords |
| are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup. |
| </description> |
| </property> |
| |
| <property> |
| <name>parser.html.form.use_action</name> |
| <value>false</value> |
| <description>If true, HTML parser will collect URLs from form action |
| attributes. This may lead to undesirable behavior (submitting empty |
| forms during next fetch cycle). If false, form action attribute will |
| be ignored.</description> |
| </property> |
| |
| <property> |
| <name>parser.html.outlinks.ignore_tags</name> |
| <value></value> |
| <description>Comma separated list of HTML tags, from which outlinks |
| shouldn't be extracted. Nutch takes links from: a, area, form, frame, |
| iframe, script, link, img. If you add any of those tags here, outlinks |
| won't be extracted from it. Default is empty list. A reasonable value |
| for most people would probably be "img,script,link".</description> |
| </property> |
| |
| <property> |
| <name>htmlparsefilter.order</name> |
| <value></value> |
| <description>The order by which HTMLParse filters are applied. |
| If empty, all available HTMLParse filters (as dictated by properties |
| plugin.includes and plugin.excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. |
| HTMLParse filter ordering MAY have an impact |
| on end result, as some filters could rely on the metadata generated by a previous filter. |
| </description> |
| </property> |
| |
| <!-- urlfilter plugin properties --> |
| |
| <property> |
| <name>urlfilter.domain.file</name> |
| <value>domain-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing either top level domains or |
| hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.regex.file</name> |
| <value>regex-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-regex (RegexURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.automaton.file</name> |
| <value>automaton-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing regular expressions |
| used by urlfilter-automaton (AutomatonURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.prefix.file</name> |
| <value>prefix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url prefixes |
| used by urlfilter-prefix (PrefixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.suffix.file</name> |
| <value>suffix-urlfilter.txt</value> |
| <description>Name of file on CLASSPATH containing url suffixes |
| used by urlfilter-suffix (SuffixURLFilter) plugin.</description> |
| </property> |
| |
| <property> |
| <name>urlfilter.order</name> |
| <value></value> |
| <description>The order by which url filters are applied. |
| If empty, all available url filters (as dictated by properties |
| plugin.includes and plugin.excludes above) are loaded and applied in system |
| defined order. If not empty, only named filters are loaded and applied |
| in given order. For example, if this property has value: |
| org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter |
| then RegexURLFilter is applied first, and PrefixURLFilter second. |
| Since all filters are AND'ed, filter ordering has no impact |
| on the end result, but it may have performance implications, depending |
| on the relative cost of the filters. |
| </description> |
| </property> |
| |
| <!-- scoring filters properties --> |
| |
| <property> |
| <name>scoring.filter.order</name> |
| <value></value> |
| <description>The order in which scoring filters are applied. |
| This may be left empty (in which case all available scoring |
| filters will be applied in the order defined in plugin.includes |
| and plugin.excludes), or a space separated list of implementation |
| classes. |
| </description> |
| </property> |
| |
| <!-- clustering extension properties --> |
| |
| <property> |
| <name>extension.clustering.hits-to-cluster</name> |
| <value>100</value> |
| <description>Number of snippets retrieved for the clustering extension |
| if clustering extension is available and user requested results |
| to be clustered.</description> |
| </property> |
| |
| <property> |
| <name>extension.clustering.extension-name</name> |
| <value></value> |
| <description>Use the specified online clustering extension. If empty, |
| the first available extension will be used. The "name" here refers to an 'id' |
| attribute of the 'implementation' element in the plugin descriptor XML |
| file.</description> |
| </property> |
| |
| <!-- ontology extension properties --> |
| |
| <property> |
| <name>extension.ontology.extension-name</name> |
| <value></value> |
| <description>Use the specified online ontology extension. If empty, |
| the first available extension will be used. The "name" here refers to an 'id' |
| attribute of the 'implementation' element in the plugin descriptor XML |
| file.</description> |
| </property> |
| |
| <property> |
| <name>extension.ontology.urls</name> |
| <value> |
| </value> |
| <description>Urls of owl files, separated by spaces, such as |
| http://www.example.com/ontology/time.owl |
| http://www.example.com/ontology/space.owl |
| http://www.example.com/ontology/wine.owl |
| Or |
| file:/ontology/time.owl |
| file:/ontology/space.owl |
| file:/ontology/wine.owl |
| You have to make sure each url is valid. |
| By default, there is no owl file, so query refinement based on ontology |
| is silently ignored. |
| </description> |
| </property> |
| |
| <!-- query-basic plugin properties --> |
| |
| <property> |
| <name>query.url.boost</name> |
| <value>4.0</value> |
| <description> Used as a boost for url field in Lucene query. |
| </description> |
| </property> |
| |
| <property> |
| <name>query.anchor.boost</name> |
| <value>2.0</value> |
| <description> Used as a boost for anchor field in Lucene query. |
| </description> |
| </property> |
| |
| <property> |
| <name>query.title.boost</name> |
| <value>1.5</value> |
| <description> Used as a boost for title field in Lucene query. |
| </description> |
| </property> |
| |
| <property> |
| <name>query.host.boost</name> |
| <value>2.0</value> |
| <description> Used as a boost for host field in Lucene query. |
| </description> |
| </property> |
| |
| <property> |
| <name>query.phrase.boost</name> |
| <value>1.0</value> |
| <description> Used as a boost for phrase in Lucene query. |
| Multiplied by the boost for the field the phrase is matched in. |
| </description> |
| </property> |
| |
| <!-- |
| <property> |
| <name>query.basic.description.boost</name> |
| <value>1.0</value> |
| <description> Declares a custom field and its boost to be added to the default fields of the Lucene query. |
| </description> |
| </property> |
| --> |
| |
| <!-- creative-commons plugin properties --> |
| |
| <property> |
| <name>query.cc.boost</name> |
| <value>0.0</value> |
| <description> Used as a boost for cc field in Lucene query. |
| </description> |
| </property> |
| |
| <!-- query-more plugin properties --> |
| |
| <property> |
| <name>query.type.boost</name> |
| <value>0.0</value> |
| <description> Used as a boost for type field in Lucene query. |
| </description> |
| </property> |
| |
| <!-- query-site plugin properties --> |
| |
| <property> |
| <name>query.site.boost</name> |
| <value>0.0</value> |
| <description> Used as a boost for site field in Lucene query. |
| </description> |
| </property> |
| |
| <!-- microformats-reltag plugin properties --> |
| |
| <property> |
| <name>query.tag.boost</name> |
| <value>1.0</value> |
| <description> Used as a boost for tag field in Lucene query. |
| </description> |
| </property> |
| |
| <!-- language-identifier plugin properties --> |
| |
| <property> |
| <name>lang.ngram.min.length</name> |
| <value>1</value> |
| <description> The minimum size of ngrams to use to identify |
| language (must be between 1 and lang.ngram.max.length). |
| The larger the range between lang.ngram.min.length and |
| lang.ngram.max.length, the better the identification, but |
| the slower it is. |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.ngram.max.length</name> |
| <value>4</value> |
| <description> The maximum size of ngrams to use to identify |
| language (must be between lang.ngram.min.length and 4). |
| The larger the range between lang.ngram.min.length and |
| lang.ngram.max.length, the better the identification, but |
| the slower it is. |
| </description> |
| </property> |
| |
| <property> |
| <name>lang.analyze.max.length</name> |
| <value>2048</value> |
| <description> The maximum bytes of data to use to identify |
| the language (0 means full content analysis). |
| The larger this value, the better the analysis, but the |
| slower it is. |
| </description> |
| </property> |
| |
| <property> |
| <name>query.lang.boost</name> |
| <value>0.0</value> |
| <description> Used as a boost for lang field in Lucene query. |
| </description> |
| </property> |
| |
| <!-- Temporary Hadoop 0.17.x workaround. --> |
| |
| <property> |
| <name>hadoop.job.history.user.location</name> |
| <value>${hadoop.log.dir}/history/user</value> |
| <description>Hadoop 0.17.x comes with a default setting to create |
| user logs inside the output path of the job. This breaks some |
| Hadoop classes, which expect the output to contain only |
| part-XXXXX files. This setting changes the output to a |
| subdirectory of the regular log directory. |
| </description> |
| </property> |
| |
| <!-- response writer properties --> |
| |
| <property> |
| <name>search.response.default.type</name> |
| <value>xml</value> |
| <description> |
| The default response type returned if none is specified. |
| </description> |
| </property> |
| |
| <property> |
| <name>search.response.default.lang</name> |
| <value>en</value> |
| <description> |
| The default response language if none is specified. |
| </description> |
| </property> |
| |
| <property> |
| <name>search.response.default.numrows</name> |
| <value>10</value> |
| <description> |
| The default number of rows to return if none is specified. |
| </description> |
| </property> |
| |
| <property> |
| <name>search.response.default.dedupfield</name> |
| <value>site</value> |
| <description> |
| The default dedup field if none is specified. |
| </description> |
| </property> |
| |
| <property> |
| <name>search.response.default.numdupes</name> |
| <value>1</value> |
| <description> |
| The default number of duplicates returned if none is specified. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.response.maxage</name> |
| <value>86400</value> |
| <description> |
| The maxage of a response in seconds. Used in caching headers. |
| </description> |
| </property> |
| |
| <property> |
| <name>searcher.response.prettyprint</name> |
| <value>true</value> |
| <description> |
| Whether the response output should be pretty printed. Setting to true enables better |
| debugging; false removes unneeded spaces and gives better throughput. |
| </description> |
| </property> |
| |
| <!-- solr index properties --> |
| <property> |
| <name>solrindex.mapping.file</name> |
| <value>solrindex-mapping.xml</value> |
| <description> |
| Defines the name of the file that will be used in the mapping of internal |
| nutch field names to solr index fields as specified in the target Solr schema. |
| </description> |
| </property> |
| |
| </configuration> |