blob: 8ee93b3ddc55d99e384e407c95781d2a2a4f24bd [file] [log] [blame]
<?xml version="1.0" ?>
<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
<!-- Overridden defaults for intranet use. -->
<!-- Do not modify this file directly. Instead, copy entries that you -->
<!-- wish to modify from this file into nutch-site.xml and change them -->
<!-- there. If nutch-site.xml does not already exist, create it. -->
<nutch-conf>
<!-- Sets urlfilter.regex.file to crawl-urlfilter.txt.
     NOTE(review): presumably this is the regex URL-filter rules file used by
     the crawl tool; confirm against the URL filter plugin documentation. -->
<property>
  <name>urlfilter.regex.file</name>
  <value>crawl-urlfilter.txt</value>
</property>
<!-- Turns on link-count boosting of page scores (value is true). -->
<property>
  <name>indexer.boost.by.link.count</name>
  <value>true</value>
  <description>When true, scores for a page are multiplied by the log of
the number of incoming links to the page.</description>
</property>
<!-- Value is false, so links from the same host are NOT ignored here. -->
<property>
  <name>db.ignore.internal.links</name>
  <value>false</value>
  <description>If true, when adding new links to a page, links from
the same host are ignored. This is an effective way to limit the
size of the link database, keeping only the highest quality
links.
</description>
</property>
<!-- Fetcher waits 1.0 second between successive requests to one server. -->
<property>
  <name>fetcher.server.delay</name>
  <value>1.0</value>
  <description>The number of seconds the fetcher will delay between
successive requests to the same server.</description>
</property>
<!-- Lets a thread delay up to 100 times while trying to fetch a page. -->
<property>
  <name>http.max.delays</name>
  <value>100</value>
  <description>The number of times a thread will delay when trying to
fetch a page. When using the crawl tool there are likely to be very
few different hosts, so we need to be willing to wait longer for
each.</description>
</property>
</nutch-conf>