conf/nutch-default.xml - nutch - Git at Google

 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>

 <!-- Do not modify this file directly.  Instead, copy entries that you -->
 <!-- wish to modify from this file into nutch-site.xml and change them -->
 <!-- there.  If nutch-site.xml does not already exist, create it.      -->

 <nutch-conf>

 <!-- HTTP properties -->

 <property>
   <name>http.agent.name</name>
   <value>NutchCVS</value>
   <description>Our HTTP 'User-Agent' request header.</description>
 </property>

 <property>
   <name>http.robots.agents</name>
   <value>NutchCVS,Nutch,*</value>
   <description>The agent strings we'll look for in robots.txt files,
   comma-separated, in decreasing order of precedence.</description>
 </property>

 <property>
   <name>http.robots.403.allow</name>
   <value>true</value>
   <description>Some servers return HTTP status 403 (Forbidden) if
   /robots.txt doesn't exist. This should probably mean that we are
   allowed to crawl the site nonetheless. If this is set to false,
   then such sites will be treated as forbidden.</description>
 </property>

 <property>
   <name>http.agent.description</name>
   <value>Nutch</value>
   <description>Further description of our bot; this text is used in
   the User-Agent header.  It appears in parentheses after the agent name.
   </description>
 </property>

 <property>
   <name>http.agent.url</name>
   <value>http://lucene.apache.org/nutch/bot.html</value>
   <description>A URL to advertise in the User-Agent header.  This will
    appear in parentheses after the agent name.
   </description>
 </property>

 <property>
   <name>http.agent.email</name>
   <value>nutch-agent@lucene.apache.org</value>
   <description>An email address to advertise in the HTTP 'From' request
    header and User-Agent header.</description>
 </property>

 <property>
   <name>http.agent.version</name>
   <value>0.7.2</value>
   <description>A version string to advertise in the User-Agent
    header.</description>
 </property>

 <property>
   <name>http.timeout</name>
   <value>10000</value>
   <description>The default network timeout, in milliseconds.</description>
 </property>

 <property>
   <name>http.max.delays</name>
   <value>3</value>
   <description>The number of times a thread will delay when trying to
   fetch a page.  Each time it finds that a host is busy, it will wait
   fetcher.server.delay.  After http.max.delays attempts, it will give
   up on the page for now.</description>
 </property>

 <property>
   <name>http.content.limit</name>
   <value>65536</value>
   <description>The length limit for downloaded content, in bytes.
   If this value is nonnegative (>=0), content longer than it will be truncated;
   otherwise, no truncation at all.
   </description>
 </property>

 <property>
   <name>http.proxy.host</name>
   <value></value>
   <description>The proxy hostname.  If empty, no proxy is used.</description>
 </property>

 <property>
   <name>http.proxy.port</name>
   <value></value>
   <description>The proxy port.</description>
 </property>

 <property>
   <name>http.verbose</name>
   <value>false</value>
   <description>If true, HTTP will log more verbosely.</description>
 </property>

 <property>
   <name>http.redirect.max</name>
   <value>3</value>
   <description>The maximum number of redirects the fetcher will follow when
     trying to fetch a page.</description>
 </property>

 <!-- FILE properties -->

 <property>
   <name>file.content.limit</name>
   <value>65536</value>
   <description>The length limit for downloaded content, in bytes.
   If this value is larger than zero, content longer than it will be
   truncated; otherwise (zero or negative), no truncation at all.
   </description>
 </property>

 <property>
   <name>file.content.ignored</name>
   <value>true</value>
   <description>If true, no file content will be saved during fetch.
   And it is probably what we want to set most of the time, since file:// URLs
   are meant to be local and we can always use them directly at parsing
   and indexing stages. Otherwise file contents will be saved.
   !! NOT IMPLEMENTED YET !!
   </description>
 </property>

 <!-- FTP properties -->

 <property>
   <name>ftp.username</name>
   <value>anonymous</value>
   <description>ftp login username.</description>
 </property>

 <property>
   <name>ftp.password</name>
   <value>anonymous@example.com</value>
   <description>ftp login password.</description>
 </property>

 <property>
   <name>ftp.content.limit</name>
   <value>65536</value>
   <description>The length limit for downloaded content, in bytes.
   If this value is larger than zero, content longer than it is truncated;
   otherwise (zero or negative), no truncation at all. Caution: classical
   ftp RFCs never define partial transfer and, in fact, some ftp servers
   out there do not handle client side forced close-down very well.
   Our implementation tries its best to handle such situations smoothly.
   </description>
 </property>

 <property>
   <name>ftp.timeout</name>
   <value>60000</value>
   <description>Default timeout for ftp client socket, in millisec.
   Please also see ftp.keep.connection below.</description>
 </property>

 <property>
   <name>ftp.server.timeout</name>
   <value>100000</value>
   <description>An estimation of ftp server idle time, in millisec.
   Typically it is 120000 millisec for many ftp servers out there.
   Better be conservative here. Together with ftp.timeout, it is used to
   decide if we need to delete (annihilate) current ftp.client instance and
   force to start another ftp.client instance anew. This is necessary because
   a fetcher thread may not be able to obtain next request from queue in time
   (due to idleness) before our ftp client times out or remote server
   disconnects. Used only when ftp.keep.connection is true (please see below).
   </description>
 </property>

 <property>
   <name>ftp.keep.connection</name>
   <value>false</value>
   <description>Whether to keep ftp connection. Useful if crawling same host
   again and again. When set to true, it avoids connection, login and dir list
   parser setup for subsequent urls. If it is set to true, however, you must
   make sure (roughly):
   (1) ftp.timeout is less than ftp.server.timeout
   (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
   Otherwise there will be too many "delete client because idled too long"
   messages in thread logs.</description>
 </property>

 <property>
   <name>ftp.follow.talk</name>
   <value>false</value>
   <description>Whether to log dialogue between our client and remote
   server. Useful for debugging.</description>
 </property>

 <!-- web db properties -->

 <property>
   <name>db.default.fetch.interval</name>
   <value>30</value>
   <description>The default number of days between re-fetches of a page.
   </description>
 </property>

 <property>
   <name>db.ignore.internal.links</name>
   <value>true</value>
   <description>If true, when adding new links to a page, links from
   the same host are ignored.  This is an effective way to limit the
   size of the link database, keeping only the highest quality
   links.
   </description>
 </property>

 <property>
   <name>db.score.injected</name>
   <value>1.0</value>
   <description>The score of new pages added by the injector.
   </description>
 </property>

 <property>
   <name>db.score.link.external</name>
   <value>1.0</value>
   <description>The score factor for new pages added due to a link from
   another host relative to the referencing page's score.
   </description>
 </property>

 <property>
   <name>db.score.link.internal</name>
   <value>1.0</value>
   <description>The score factor for pages added due to a link from the
   same host, relative to the referencing page's score.
   </description>
 </property>

 <property>
   <name>db.max.outlinks.per.page</name>
   <value>100</value>
   <description>The maximum number of outlinks that we'll process for a page.
   </description>
 </property>

 <property>
   <name>db.max.anchor.length</name>
   <value>100</value>
   <description>The maximum number of characters permitted in an anchor.
   </description>
 </property>

 <property>
   <name>db.fetch.retry.max</name>
   <value>3</value>
   <description>The maximum number of times a url that has encountered
   recoverable errors is generated for fetch.</description>
 </property>

 <!-- fetchlist tool properties -->

 <property>
   <name>fetchlist.score.by.link.count</name>
   <value>true</value>
   <description>If true, set page scores on fetchlist entries based on
   log(number of anchors), instead of using original page scores. This
   results in prioritization of pages with many incoming links.
   </description>
 </property>

 <!-- fetcher properties -->

 <property>
   <name>fetcher.server.delay</name>
   <value>5.0</value>
   <description>The number of seconds the fetcher will delay between
    successive requests to the same server.</description>
 </property>

 <property>
   <name>fetcher.threads.fetch</name>
   <value>10</value>
   <description>The number of FetcherThreads the fetcher should use.
     This also determines the maximum number of requests that are
     made at once (each FetcherThread handles one connection).</description>
 </property>

 <property>
   <name>fetcher.threads.per.host</name>
   <value>1</value>
   <description>This number is the maximum number of threads that
     should be allowed to access a host at one time.</description>
 </property>

 <property>
   <name>fetcher.verbose</name>
   <value>false</value>
   <description>If true, fetcher will log more verbosely.</description>
 </property>

 <!-- parser properties -->
 <property>
   <name>parser.threads.parse</name>
   <value>10</value>
   <description>Number of ParserThreads ParseSegment should use.</description>
 </property>

 <!-- i/o properties -->

 <property>
   <name>io.sort.factor</name>
   <value>100</value>
   <description>The number of streams to merge at once while sorting
   files.  This determines the number of open file handles.</description>
 </property>

 <property>
   <name>io.sort.mb</name>
   <value>100</value>
   <description>The total amount of buffer memory to use while sorting
   files, in megabytes.  By default, gives each merge stream 1MB, which
   should minimize seeks.</description>
 </property>

 <property>
   <name>io.file.buffer.size</name>
   <value>131072</value>
   <description>The size of buffer for use in sequence files.
   The size of this buffer should probably be a multiple of hardware
   page size (4096 on Intel x86), and it determines how much data is
   buffered during read and write operations.</description>
 </property>

 <!-- file system properties -->

 <property>
   <name>fs.default.name</name>
   <value>local</value>
   <description>The name of the default file system.  Either the
   literal string "local" or a host:port for NDFS.</description>
 </property>

 <property>
   <name>ndfs.name.dir</name>
   <value>/tmp/nutch/ndfs/name</value>
   <description>Determines where on the local filesystem the NDFS name node
       should store the name table.</description>
 </property>

 <property>
   <name>ndfs.data.dir</name>
   <value>/tmp/nutch/ndfs/data</value>
   <description>Determines where on the local filesystem an NDFS data node
       should store its blocks.</description>
 </property>

 <!-- map/reduce properties -->

 <property>
   <name>mapred.job.tracker</name>
   <value>localhost:8010</value>
   <description>The host and port that the MapReduce job tracker runs at.
   </description>
 </property>

 <property>
   <name>mapred.local.dir</name>
   <value>/tmp/nutch/mapred/local</value>
   <description>The local directory where MapReduce stores temporary files
       related to tasks and jobs.
   </description>
 </property>

 <!-- indexer properties -->

 <property>
   <name>indexer.score.power</name>
   <value>0.5</value>
   <description>Determines the power of link analysis scores.  Each
   page's boost is set to <i>score<sup>scorePower</sup></i> where
   <i>score</i> is its link analysis score and <i>scorePower</i> is the
   value of this parameter.  This is compiled into indexes, so, when
   this is changed, pages must be re-indexed for it to take
   effect.</description>
 </property>

 <property>
   <name>indexer.boost.by.link.count</name>
   <value>true</value>
   <description>When true, scores for a page are multiplied by the log of
   the number of incoming links to the page.</description>
 </property>

 <property>
   <name>indexer.max.title.length</name>
   <value>100</value>
   <description>The maximum number of characters of a title that are indexed.
   </description>
 </property>

 <property>
   <name>indexer.max.tokens</name>
   <value>10000</value>
   <description>
   The maximum number of tokens that will be indexed for a single field
   in a document. This limits the amount of memory required for
   indexing, so that collections with very large files will not crash
   the indexing process by running out of memory.

   Note that this effectively truncates large documents, excluding
   from the index tokens that occur further in the document. If you
   know your source documents are large, be sure to set this value
   high enough to accommodate the expected size. If you set it to
   Integer.MAX_VALUE, then the only limit is your memory, but you
   should anticipate an OutOfMemoryError.
   </description>
 </property>

 <property>
   <name>indexer.mergeFactor</name>
   <value>50</value>
   <description>The factor that determines the frequency of Lucene segment
   merges. This must not be less than 2, higher values increase indexing
   speed but lead to increased RAM usage, and increase the number of
   open file handles (which may lead to "Too many open files" errors).
   NOTE: the "segments" here have nothing to do with Nutch segments, they
   are a low-level data unit used by Lucene.
   </description>
 </property>

 <property>
   <name>indexer.minMergeDocs</name>
   <value>50</value>
   <description>This number determines the minimum number of Lucene
   Documents buffered in memory between Lucene segment merges. Larger
   values increase indexing speed and increase RAM usage.
   </description>
 </property>

 <property>
   <name>indexer.maxMergeDocs</name>
   <value>2147483647</value>
   <description>This number determines the maximum number of Lucene
   Documents to be merged into a new Lucene segment. Larger values
   increase indexing speed and reduce the number of Lucene segments,
   which reduces the number of open file handles; however, this also
   increases RAM usage during indexing.
   </description>
 </property>

 <property>
   <name>indexer.termIndexInterval</name>
   <value>128</value>
   <description>Determines the fraction of terms which Lucene keeps in
   RAM when searching, to facilitate random-access.  Smaller values use
   more memory but make searches somewhat faster.  Larger values use
   less memory but make searches somewhat slower.
   </description>
 </property>


 <!-- analysis properties -->

 <property>
   <name>analysis.common.terms.file</name>
   <value>common-terms.utf8</value>
   <description>The name of a file containing a list of common terms
   that should be indexed in n-grams.</description>
 </property>

 <!-- searcher properties -->

 <property>
   <name>searcher.dir</name>
   <value>.</value>
   <description>
   Path to root of index directories.  This directory is searched (in
   order) for either the file search-servers.txt, containing a list of
   distributed search servers, or the directory "index" containing
   merged indexes, or the directory "segments" containing segment
   indexes.
   </description>
 </property>

 <property>
   <name>searcher.filter.cache.size</name>
   <value>16</value>
   <description>
   Maximum number of filters to cache.  Filters can accelerate certain
   field-based queries, like language, document format, etc.  Each
   filter requires one bit of RAM per page.  So, with a 10 million page
   index, a cache size of 16 consumes two bytes per page, or 20MB.
   </description>
 </property>

 <property>
   <name>searcher.filter.cache.threshold</name>
   <value>0.05</value>
   <description>
   Filters are cached when their term is matched by more than this
   fraction of pages.  For example, with a threshold of 0.05, and 10
   million pages, the term must match more than 1/20, or 500,000 pages.
   So, if out of 10 million pages, 50% of pages are in English, and 2%
   are in Finnish, then, with a threshold of 0.05, searches for
   "lang:en" will use a cached filter, while searches for "lang:fi"
   will score all 200,000 Finnish documents.
   </description>
 </property>

 <property>
   <name>searcher.hostgrouping.rawhits.factor</name>
   <value>2.0</value>
   <description>
   A factor that is used to determine the number of raw hits
   initially fetched, before host grouping is done.
   </description>
 </property>

 <property>
   <name>searcher.summary.context</name>
   <value>5</value>
   <description>
   The number of context terms to display preceding and following
   matching terms in a hit summary.
   </description>
 </property>

 <property>
   <name>searcher.summary.length</name>
   <value>20</value>
   <description>
   The total number of terms to display in a hit summary.
   </description>
 </property>

 <!-- URL normalizer properties -->

 <property>
   <name>urlnormalizer.class</name>
   <value>org.apache.nutch.net.BasicUrlNormalizer</value>
   <description>Name of the class used to normalize URLs.</description>
 </property>

 <property>
   <name>urlnormalizer.regex.file</name>
   <value>regex-normalize.xml</value>
   <description>Name of the config file used by the RegexUrlNormalizer class.</description>
 </property>

 <!-- mime properties -->

 <property>
   <name>mime.types.file</name>
   <value>mime-types.xml</value>
   <description>Name of file in CLASSPATH containing filename extension and
   magic sequence to mime types mapping information</description>
 </property>

 <property>
   <name>mime.type.magic</name>
   <value>true</value>
   <description>Defines if the mime content type detector uses magic resolution.
   </description>
 </property>

 <!-- ipc properties -->

 <property>
   <name>ipc.client.timeout</name>
   <value>10000</value>
   <description>Defines the timeout for IPC calls in milliseconds. </description>
 </property>

 <!-- plugin properties -->

 <property>
   <name>plugin.folders</name>
   <value>plugins</value>
   <description>Directories where nutch plugins are located.  Each
   element may be a relative or absolute path.  If absolute, it is used
   as is.  If relative, it is searched for on the classpath.</description>
 </property>

 <property>
   <name>plugin.includes</name>
   <value>nutch-extensionpoints|protocol-http|urlfilter-regex|parse-(text|html)|index-basic|query-(basic|site|url)</value>
   <description>Regular expression naming plugin directory names to
   include.  Any plugin not matching this expression is excluded.
   In any case you need to include at least the nutch-extensionpoints plugin. By
   default Nutch includes crawling just HTML and plain text via HTTP,
   and basic indexing and search plugins.
   </description>
 </property>

 <property>
   <name>plugin.excludes</name>
   <value></value>
   <description>Regular expression naming plugin directory names to exclude.
   </description>
 </property>

 <property>
   <name>parser.character.encoding.default</name>
   <value>windows-1252</value>
   <description>The character encoding to fall back to when no other information
   is available</description>
 </property>

 <property>
   <name>parser.html.impl</name>
   <value>neko</value>
   <description>HTML Parser implementation. Currently the following keywords
   are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
   </description>
 </property>

 <!-- urlfilter plugin properties -->

 <property>
   <name>urlfilter.regex.file</name>
   <value>regex-urlfilter.txt</value>
   <description>Name of file on CLASSPATH containing regular expressions
   used by urlfilter-regex (RegexURLFilter) plugin.</description>
 </property>

 <property>
   <name>urlfilter.prefix.file</name>
   <value>prefix-urlfilter.txt</value>
   <description>Name of file on CLASSPATH containing url prefixes
   used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
 </property>

 <property>
   <name>urlfilter.order</name>
   <value></value>
   <description>The order by which url filters are applied.
   If empty, all available url filters (as dictated by properties
   plugin.includes and plugin.excludes above) are loaded and applied in system
   defined order. If not empty, only named filters are loaded and applied
   in given order. For example, if this property has value:
   org.apache.nutch.net.RegexURLFilter org.apache.nutch.net.PrefixURLFilter
   then RegexURLFilter is applied first, and PrefixURLFilter second.
   Since all filters are AND'ed, filter ordering does not have impact
   on end result, but it may have performance implication, depending
   on relative expensiveness of filters.
   </description>
 </property>

 <!-- clustering extension properties -->

 <property>
   <name>extension.clustering.hits-to-cluster</name>
   <value>100</value>
   <description>Number of snippets retrieved for the clustering extension
   if clustering extension is available and user requested results
   to be clustered.</description>
 </property>

 <property>
   <name>extension.clustering.extension-name</name>
   <value></value>
   <description>Use the specified online clustering extension. If empty,
   the first available extension will be used. The "name" here refers to an 'id'
   attribute of the 'implementation' element in the plugin descriptor XML
   file.</description>
 </property>

 <!-- ontology extension properties -->

 <property>
   <name>extension.ontology.extension-name</name>
   <value></value>
   <description>Use the specified online ontology extension. If empty,
   the first available extension will be used. The "name" here refers to an 'id'
   attribute of the 'implementation' element in the plugin descriptor XML
   file.</description>
 </property>

 <property>
   <name>extension.ontology.urls</name>
   <value>
   </value>
   <description>Urls of owl files, separated by spaces, such as
   http://www.example.com/ontology/time.owl
   http://www.example.com/ontology/space.owl
   http://www.example.com/ontology/wine.owl
   Or
   file:/ontology/time.owl
   file:/ontology/space.owl
   file:/ontology/wine.owl
   You have to make sure each url is valid.
   By default, there is no owl file, so query refinement based on ontology
   is silently ignored.
   </description>
 </property>

 <!-- query-basic plugin properties -->

 <property>
   <name>query.url.boost</name>
   <value>4.0</value>
   <description> Used as a boost for url field in Lucene query.
   </description>
 </property>

 <property>
   <name>query.anchor.boost</name>
   <value>2.0</value>
   <description> Used as a boost for anchor field in Lucene query.
   </description>
 </property>


 <property>
   <name>query.title.boost</name>
   <value>1.5</value>
   <description> Used as a boost for title field in Lucene query.
   </description>
 </property>

 <property>
   <name>query.host.boost</name>
   <value>2.0</value>
   <description> Used as a boost for host field in Lucene query.
   </description>
 </property>

 <property>
   <name>query.phrase.boost</name>
   <value>1.0</value>
   <description> Used as a boost for phrase in Lucene query.
   Multiplied by boost for field phrase is matched in.
   </description>
 </property>

 <!-- language-identifier plugin properties -->

 <property>
   <name>lang.ngram.min.length</name>
   <value>1</value>
   <description> The minimum size of ngrams to use to identify
   language (must be between 1 and lang.ngram.max.length).
   The larger the range between lang.ngram.min.length and
   lang.ngram.max.length, the better the identification, but
   the slower it is.
   </description>
 </property>

 <property>
   <name>lang.ngram.max.length</name>
   <value>4</value>
   <description> The maximum size of ngrams to use to identify
   language (must be between lang.ngram.min.length and 4).
   The larger the range between lang.ngram.min.length and
   lang.ngram.max.length, the better the identification, but
   the slower it is.
   </description>
 </property>

 <property>
   <name>lang.analyze.max.length</name>
   <value>2048</value>
   <description> The maximum number of bytes of data to use to identify
   the language (0 means full content analysis).
   The larger this value, the better the analysis, but the
   slower it is.
   </description>
 </property>

 </nutch-conf>