conf/crawl-urlfilter.txt.template - nutch - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.


 # The url filter file used by the crawl command.

 # Better for intranet crawling.
 # Be sure to change MY.DOMAIN.NAME to your domain name.

 # Each non-comment, non-blank line contains a regular expression
 # prefixed by '+' or '-'.  The first matching pattern in the file
 # determines whether a URL is included or ignored.  If no pattern
 # matches, the URL is ignored.

 # Rules are tried from top to bottom and the first matching pattern
 # decides, so the order of the rules below matters.

 # skip URLs whose scheme is file:, ftp: or mailto:
 -^(file|ftp|mailto):

 # skip image and other suffixes we can't yet parse; the suffix must be
 # the very end of the URL, and only the spellings listed are matched
 -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$

 # skip URLs containing any of the characters ? * ! @ = as probable
 # queries, etc.
 -[?*!@=]

 # skip URLs in which the same slash-delimited segment occurs three times,
 # each separated by one other segment (e.g. /a/x/a/y/a/), to break loops
 -.*(/[^/]+)/[^/]+\1/[^/]+\1/

 # accept http:// URLs whose host is MY.DOMAIN.NAME or any subdomain of it
 # NOTE(review): only the http scheme is matched (not https), subdomain
 # labels are limited to [a-z0-9] (no hyphens), and the dots in the
 # placeholder are unescaped, so each matches any character. Confirm this
 # is intended when substituting a real domain name.
 +^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/

 # skip everything else
 -.
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.


	# The url filter file used by the crawl command.

	# Better for intranet crawling.
	# Be sure to change MY.DOMAIN.NAME to your domain name.

	# Each non-comment, non-blank line contains a regular expression
	# prefixed by '+' or '-'. The first matching pattern in the file
	# determines whether a URL is included or ignored. If no pattern
	# matches, the URL is ignored.

	# Rules are tried from top to bottom and the first matching pattern
	# decides, so the order of the rules below matters.

	# skip URLs whose scheme is file:, ftp: or mailto:
	# ('|' must be unescaped here: '\|' would match a literal pipe character)
	-^(file|ftp|mailto):

	# skip image and other suffixes we can't yet parse; the suffix must be
	# the very end of the URL, and only the spellings listed are matched
	-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|jpeg|JPEG|bmp|BMP)$

	# skip URLs containing any of the characters ? * ! @ = as probable
	# queries, etc.
	-[?*!@=]

	# skip URLs in which the same slash-delimited segment occurs three times,
	# each separated by one other segment (e.g. /a/x/a/y/a/), to break loops
	-.*(/[^/]+)/[^/]+\1/[^/]+\1/

	# accept http:// URLs whose host is MY.DOMAIN.NAME or any subdomain of it;
	# the '*' quantifiers allow zero or more subdomain labels of any length
	+^http://([a-z0-9]*\.)*MY.DOMAIN.NAME/

	# skip everything else
	-.