conf/crawl-tool.xml - nutch - Git at Google

 <?xml version="1.0" ?>
 <?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>

 <!-- Overridden defaults for intranet use. -->

 <!-- Do not modify this file directly.  Instead, copy entries that you -->
 <!-- wish to modify from this file into nutch-site.xml and change them -->
 <!-- there.  If nutch-site.xml does not already exist, create it.      -->

 <nutch-conf>

 <property>
   <name>urlfilter.regex.file</name>
   <value>crawl-urlfilter.txt</value>
 </property>

 <property>
   <name>indexer.boost.by.link.count</name>
   <value>true</value>
   <description>When true, scores for a page are multiplied by the log of
   the number of incoming links to the page.</description>
 </property>

 <property>
   <name>db.ignore.internal.links</name>
   <value>false</value>
   <description>If true, when adding new links to a page, links from
   the same host are ignored.  This is an effective way to limit the
   size of the link database, keeping only the highest quality
   links.
   </description>
 </property>

 <property>
   <name>fetcher.server.delay</name>
   <value>1.0</value>
   <description>The number of seconds the fetcher will delay between
    successive requests to the same server.</description>
 </property>

 <property>
   <name>http.max.delays</name>
   <value>100</value>
   <description>The number of times a thread will delay when trying to
   fetch a page.  When using the crawl tool there are likely to be very
   few different hosts, so we need to be willing to wait longer for
   each.</description>
 </property>

 </nutch-conf>
	<?xml version="1.0" ?>
	<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>

	<!-- Overridden defaults for intranet use. -->

	<!-- Do not modify this file directly. Instead, copy entries that you -->
	<!-- wish to modify from this file into nutch-site.xml and change them -->
	<!-- there. If nutch-site.xml does not already exist, create it. -->

	<nutch-conf>

	<property>
	<name>urlfilter.regex.file</name>
	<value>crawl-urlfilter.txt</value>
	</property>

	<property>
	<name>indexer.boost.by.link.count</name>
	<value>true</value>
	<description>When true, scores for a page are multiplied by the log of
	the number of incoming links to the page.</description>
	</property>

	<property>
	<name>db.ignore.internal.links</name>
	<value>false</value>
	<description>If true, when adding new links to a page, links from
	the same host are ignored. This is an effective way to limit the
	size of the link database, keeping only the highest quality
	links.
	</description>
	</property>

	<property>
	<name>fetcher.server.delay</name>
	<value>1.0</value>
	<description>The number of seconds the fetcher will delay between
	successive requests to the same server.</description>
	</property>

	<property>
	<name>http.max.delays</name>
	<value>100</value>
	<description>The number of times a thread will delay when trying to
	fetch a page. When using the crawl tool there are likely to be very
	few different hosts, so we need to be willing to wait longer for
	each.</description>
	</property>

	</nutch-conf>