#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds>
# -i|--index Indexes crawl results into a configured indexer
# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
# are scheduled for fetching. Suffix can be: s for second,
# m for minute, h for hour and d for day. If no suffix is
# specified, seconds are assumed.
# -D A Java property to pass to Nutch calls
# -s Path to seeds file(s)
# Crawl Dir Directory where the crawl/link/segments dirs are saved
# Num Rounds The number of rounds to run this crawl for
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH SEGMENT
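#
# Example invocation (the seed directory, crawl directory, round count and the
# indexer property are illustrative placeholders; adapt them to your setup):
#
#   bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -s urls/ crawl/ 2
#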
INDEXFLAG=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
function __to_seconds() {
# split the argument into its numeric part and optional unit suffix
NUMBER=$(echo "$1" | tr -dc '0-9')
MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')
# use a plain variable: SECONDS is a special bash variable and must not be overwritten
case $MODIFIER in
m|M)
WAIT_SECS=$(expr $NUMBER \* 60)
;;
h|H)
WAIT_SECS=$(expr $NUMBER \* 3600)
;;
d|D)
WAIT_SECS=$(expr $NUMBER \* 86400)
;;
s|S|*)
WAIT_SECS=$NUMBER
;;
esac
echo $WAIT_SECS
}
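# e.g. __to_seconds 30m prints 1800 and __to_seconds 2h prints 7200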
SEEDDIR=""
while [[ $# -gt 0 ]]
do
case $1 in
-i|--index)
INDEXFLAG=true
shift
;;
-D)
JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
shift 2
;;
-s)
SEEDDIR="${2}"
shift 2
;;
-w|--wait)
WAIT="${2}"
shift 2
;;
*)
break
;;
esac
done
if [[ $# -ne 2 ]]; then
echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds>"
echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
echo -e "\t-D\t\tA Java property to pass to Nutch calls"
echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
echo -e "\t\t\tspecified second is used by default."
echo -e "\t-s Seed Dir\tPath to seeds file(s)"
echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
exit 1
fi
CRAWL_PATH="$1"
LIMIT="$2"
# convert the wait time to seconds, since plain 'sleep' may not understand unit suffixes on all platforms
if [ "$WAIT" != "-1" ]; then
WAIT=$( __to_seconds "$WAIT" )
echo "Time to wait (--wait) = $WAIT sec."
fi
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################
# set the number of slave nodes
numSlaves=1
# and the total number of available tasks
# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`
# number of URLs to fetch in one iteration
# (50K per slave node with the multiplier below)
sizeFetchlist=`expr $numSlaves \* 50000`
# time limit for fetching, in minutes (passed as fetcher.timelimit.mins)
timeLimitFetch=180
# num threads for fetching
numThreads=50
#############################################
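# with the defaults above (numSlaves=1) this yields numTasks=2 reduce tasks and a
# fetch list of 50000 URLs per round; these are only starting points, tune them per cluster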
bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"
# determine the run mode (local or distributed) based on the presence of a Nutch job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
if [ $(which hadoop | wc -l ) -eq 0 ]; then
echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
exit 1;
fi
fi
function __bin_nutch {
# run $bin/nutch, exit if exit value indicates error
echo "$bin/nutch $@" ;# echo command and arguments
"$bin/nutch" "$@"
RETCODE=$?
if [ $RETCODE -ne 0 ]
then
echo "Error running:"
echo " $bin/nutch $@"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
}
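# usage example (with a hypothetical seed directory):
#   __bin_nutch inject "$CRAWL_PATH"/crawldb urls/
# the wrapper echoes the full command and aborts the script on a non-zero exit code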
# initial injection
if [[ -n "$SEEDDIR" ]]
then
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
if [ -e ".STOP" ]
then
echo "STOP file found - escaping loop"
break
fi
if [ $LIMIT -ne -1 ]; then
if [ $a -gt $LIMIT ]; then
echo `date` ": Finished loop with $LIMIT iterations"
break
fi
echo `date` ": Iteration $a of $LIMIT"
else
echo `date` ": Iteration $a"
fi
echo "Generating a new segment"
generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
echo "$bin/nutch generate ${generate_args[@]}"
$bin/nutch generate "${generate_args[@]}"
RETCODE=$?
if [ $RETCODE -eq 0 ]; then
: # ok: no error
elif [ $RETCODE -eq 1 ]; then
echo "Generate returned 1 (no new segments created)"
if [ "$WAIT" -ne -1 ]; then
echo "Waiting for $WAIT sec. ..."
sleep $WAIT
continue
else
echo "Escaping loop: no more URLs to fetch now"
break
fi
else
echo "Error running:"
echo " $bin/nutch generate ${generate_args[@]}"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
# capture the name of the newly generated segment:
# list it with 'hadoop fs -ls' in distributed mode or with plain 'ls' in local mode
if [ $mode = "local" ]; then
SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
else
SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
fi
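# segment directories are named after their generation timestamp (yyyyMMddHHmmss),
# so a numeric sort puts the newest segment last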
echo "Operating on segment : $SEGMENT"
# fetching the segment
echo "Fetching : $SEGMENT"
__bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
# parsing the segment
echo "Parsing : $SEGMENT"
# enable record skipping during parsing so that a single dodgy document
# does not fail the whole task
skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
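# roughly: skip mode kicks in after two failed attempts of a task, and at most one
# record around each bad record is dropped; see the Hadoop docs for the exact semantics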
__bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
# updatedb with this segment
echo "CrawlDB update"
__bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
# note that the link inversion and indexing routines can be done within the main loop
# on a per-segment basis
echo "Link inversion"
__bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Dedup on crawldb"
__bin_nutch dedup "$CRAWL_PATH"/crawldb
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"
__bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Cleaning up index if possible"
__bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
else
echo "Skipping indexing ..."
fi
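# the indexer backend itself is configured through Nutch indexer plugin properties,
# e.g. a Solr URL such as -D solr.server.url=http://localhost:8983/solr/nutch
# (placeholder URL) passed via this script's -D option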
#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
done
exit 0