| <?xml version="1.0" ?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| <?xml-stylesheet type="text/xsl" href="configuration.xsl"?> |
| |
| <!-- Overridden defaults for intranet use. --> |
| |
| <!-- Do not modify this file directly. Instead, copy entries that you --> |
| <!-- wish to modify from this file into nutch-site.xml and change them --> |
| <!-- there. If nutch-site.xml does not already exist, create it. --> |
| |
| <configuration> |
| |
| <!-- Sets urlfilter.regex.file to crawl-urlfilter.txt. Presumably this is the |
| rules file read by the regex URL filter; confirm against the default |
| configuration. --> |
| <property> |
| <name>urlfilter.regex.file</name> |
| <value>crawl-urlfilter.txt</value> |
| </property> |
| |
| <!-- Configured as false here, so links from the same host are NOT ignored |
| when adding new links to a page. --> |
| <property> |
| <name>db.ignore.internal.links</name> |
| <value>false</value> |
| <description>If true, when adding new links to a page, links from |
| the same host are ignored. This is an effective way to limit the |
| size of the link database, keeping only the highest quality |
| links. |
| </description> |
| </property> |
| |
| <!-- Delay of 1.0 second between successive requests to the same server. --> |
| <property> |
| <name>fetcher.server.delay</name> |
| <value>1.0</value> |
| <description>The number of seconds the fetcher will delay between |
| successive requests to the same server.</description> |
| </property> |
| |
| <!-- Allows a fetch thread to delay up to 1000 times per page. The description |
| gives the rationale: the crawl tool usually sees very few distinct hosts, so |
| waiting longer for each one is expected. --> |
| <property> |
| <name>http.max.delays</name> |
| <value>1000</value> |
| <description>The number of times a thread will delay when trying to |
| fetch a page. When using the crawl tool there are likely to be very |
| few different hosts, so we need to be willing to wait longer for |
| each.</description> |
| </property> |
| |
| </configuration> |