src/bin/crawl - nutch - Git at Google

 #!/bin/bash
 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # The Crawl command script : crawl <seedDir> <crawlId> [<solrURL>] <numberOfRounds>
 #
 #
 # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
 # INDEXING FOR EACH BATCH

 # Positional arguments: <seedDir> <crawlID> [<solrURL>] <numberOfRounds>.
 # With three arguments the third one is the round count; with four, the
 # third is the Solr URL and the fourth is the round count.
 # Usage errors are written to stderr and terminate the script with status
 # 255 (the status bash reports for the historical "exit -1").
 SEEDDIR="$1"
 CRAWL_ID="$2"
 # NOTE(review): SOLRURL is not cleared in the three-argument form, so a
 # value inherited from the environment would still be picked up -- confirm
 # whether that is intended.
 case "$#" in
     3)
         LIMIT="$3"
         ;;
     4)
         SOLRURL="$3"
         LIMIT="$4"
         ;;
     *)
         echo "Unknown # of arguments $#" >&2
         echo "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>" >&2
         exit 255
         ;;
 esac

 if [ -z "$SEEDDIR" ]; then
     echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
     exit 255
 fi

 if [ -z "$CRAWL_ID" ]; then
     echo "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
     exit 255
 fi

 # Informational only: a crawl without a Solr URL is a valid invocation.
 if [ -z "$SOLRURL" ]; then
     echo "No SOLRURL specified. Skipping indexing."
 fi

 if [ -z "$LIMIT" ]; then
     echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
     exit 255
 fi

 # The round count is used as an arithmetic loop bound further down, where a
 # non-numeric value would be evaluated as an expression instead of being
 # rejected. Accept decimal digits only.
 case "$LIMIT" in
     *[!0-9]*)
         echo "Invalid numberOfRounds '$LIMIT' : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
         exit 255
         ;;
 esac
 # Force base 10 so a zero-padded count such as 08 is not parsed as octal.
 LIMIT=$((10#$LIMIT))

 #############################################
 # MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
 #############################################

 # set the number of slave nodes
 numSlaves=1

 # and the total number of available tasks
 # sets Hadoop parameter "mapred.reduce.tasks"
 numTasks=$((numSlaves * 2))

 # number of urls to fetch in one iteration
 # 250K per task?
 sizeFetchlist=$((numSlaves * 50000))

 # time limit for fetching
 timeLimitFetch=180

 # Adds <days> to the current time to facilitate
 # crawling urls already fetched sooner than
 # db.default.fetch.interval.
 addDays=0
 #############################################

 # absolute path of the directory holding this script
 bin="$(dirname "$0")"
 bin="$(cd "$bin"; pwd)"

 # Succeeds when directory $1 contains at least one regular file matching
 # *nutch*.job. Every match is tested on its own, so several job files do
 # not break the check the way a multi-word glob inside "[ -f ... ]" does.
 __has_nutch_job() {
     local candidate
     for candidate in "$1"/*nutch*.job; do
         if [ -f "$candidate" ]; then
             return 0
         fi
     done
     return 1
 }

 # Succeeds when a command named hadoop can be resolved by the shell.
 __hadoop_on_path() {
     command -v hadoop >/dev/null 2>&1
 }

 # determines the mode based on the presence of a job file
 mode=local
 if __has_nutch_job "${bin}/.."; then
     mode=distributed
 fi

 # note that some of the options listed here could be set in the
 # corresponding hadoop site xml param file
 commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

 # check that hadoop can be found on the path
 # (255 is the status bash reports for the historical "exit -1")
 if [ "$mode" = "distributed" ] && ! __hadoop_on_path; then
     echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode." >&2
     exit 255
 fi


 # Run "$bin/nutch" with the given arguments, echoing the command line first.
 # Globals:   bin (read) - directory that contains the nutch launcher
 # Arguments: passed through unchanged to "$bin/nutch"
 # Outputs:   the command line on stdout; a failure report on stderr
 # Returns:   0 when nutch succeeds; on failure the whole script exits with
 #            nutch's exit value
 __bin_nutch() {
     local rc

     echo "$bin/nutch $*" # echo command and arguments
     "$bin/nutch" "$@"
     rc=$?

     if [ "$rc" -ne 0 ]; then
         # Diagnostics go to stderr so they stay visible when stdout is
         # redirected or captured.
         {
             echo "Error running:"
             echo "  $bin/nutch $*"
             echo "Failed with exit value $rc."
         } >&2
         exit "$rc"
     fi
 }


 # initial injection
 echo "Injecting seed URLs"
 __bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"

 # Split the shared Hadoop options into words once. "read -a" splits on
 # whitespace only, without the pathname expansion that an unquoted
 # $commonOptions would also undergo.
 read -r -a common_opts <<< "$commonOptions"

 # enable the skipping of records for the parsing so that a dodgy document
 # does not fail the full task (loop-invariant, so built once)
 skip_records_opts=(
     -D mapred.skip.attempts.to.start.skipping=2
     -D mapred.skip.map.max.skip.records=1
 )

 # main loop : rounds of generate - fetch - parse - update
 for ((a = 1; a <= LIMIT; a++)); do
     # a file named .STOP in the working directory ends the crawl early
     if [ -e ".STOP" ]; then
         echo "STOP file found - escaping loop"
         break
     fi

     echo "$(date) : Iteration $a of $LIMIT"

     echo "Generating batchId"
     batchId="$(date +%s)-$RANDOM"

     echo "Generating a new fetchlist"
     generate_args=("${common_opts[@]}" -topN "$sizeFetchlist" -noNorm -noFilter -adddays "$addDays" -crawlId "$CRAWL_ID" -batchId "$batchId")
     echo "$bin/nutch generate ${generate_args[*]}"
     # generate is run directly rather than through __bin_nutch because
     # exit value 1 is not treated as an error here
     "$bin/nutch" generate "${generate_args[@]}"
     RETCODE=$?
     case "$RETCODE" in
         0)
             : # ok: no error
             ;;
         1)
             echo "Generate returned 1 (no new segments created)"
             echo "Escaping loop: no more URLs to fetch now"
             break
             ;;
         *)
             {
                 echo "Error running:"
                 echo "  $bin/nutch generate ${generate_args[*]}"
                 echo "Failed with exit value $RETCODE."
             } >&2
             exit "$RETCODE"
             ;;
     esac

     echo "Fetching : "
     __bin_nutch fetch "${common_opts[@]}" -D fetcher.timelimit.mins="$timeLimitFetch" "$batchId" -crawlId "$CRAWL_ID" -threads 50

     # parsing the batch
     echo "Parsing : "
     __bin_nutch parse "${common_opts[@]}" "${skip_records_opts[@]}" "$batchId" -crawlId "$CRAWL_ID"

     # updatedb with this batch
     echo "CrawlDB update for $CRAWL_ID"
     __bin_nutch updatedb "${common_opts[@]}" "$batchId" -crawlId "$CRAWL_ID"

     if [ -n "$SOLRURL" ]; then
         echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
         # the URL is quoted so characters such as ? or * are passed literally
         __bin_nutch index "${common_opts[@]}" -D solr.server.url="$SOLRURL" -all -crawlId "$CRAWL_ID"

         echo "SOLR dedup -> $SOLRURL"
         __bin_nutch solrdedup "${common_opts[@]}" "$SOLRURL"
     else
         echo "Skipping indexing tasks: no SOLR url provided."
     fi

 done

 exit 0
	#!/bin/bash
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# The Crawl command script : crawl <seedDir> <crawlId> [<solrURL>] <numberOfRounds>
	#
	#
	# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
	# INDEXING FOR EACH BATCH

	# Positional arguments: <seedDir> <crawlID> [<solrURL>] <numberOfRounds>.
	# With three arguments the third one is the round count; with four, the
	# third is the Solr URL and the fourth is the round count.
	# Usage errors are written to stderr and terminate the script with status
	# 255 (the status bash reports for the historical "exit -1").
	SEEDDIR="$1"
	CRAWL_ID="$2"
	# NOTE(review): SOLRURL is not cleared in the three-argument form, so a
	# value inherited from the environment would still be picked up -- confirm
	# whether that is intended.
	case "$#" in
	    3)
	        LIMIT="$3"
	        ;;
	    4)
	        SOLRURL="$3"
	        LIMIT="$4"
	        ;;
	    *)
	        echo "Unknown # of arguments $#" >&2
	        echo "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>" >&2
	        exit 255
	        ;;
	esac

	if [ -z "$SEEDDIR" ]; then
	    echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
	    exit 255
	fi

	if [ -z "$CRAWL_ID" ]; then
	    echo "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
	    exit 255
	fi

	# Informational only: a crawl without a Solr URL is a valid invocation.
	if [ -z "$SOLRURL" ]; then
	    echo "No SOLRURL specified. Skipping indexing."
	fi

	if [ -z "$LIMIT" ]; then
	    echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
	    exit 255
	fi

	# The round count is used as an arithmetic loop bound further down, where a
	# non-numeric value would be evaluated as an expression instead of being
	# rejected. Accept decimal digits only.
	case "$LIMIT" in
	    *[!0-9]*)
	        echo "Invalid numberOfRounds '$LIMIT' : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>" >&2
	        exit 255
	        ;;
	esac
	# Force base 10 so a zero-padded count such as 08 is not parsed as octal.
	LIMIT=$((10#$LIMIT))

	#############################################
	# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
	#############################################

	# set the number of slave nodes
	numSlaves=1

	# and the total number of available tasks
	# sets Hadoop parameter "mapred.reduce.tasks"
	numTasks=$((numSlaves * 2))

	# number of urls to fetch in one iteration
	# 250K per task?
	sizeFetchlist=$((numSlaves * 50000))

	# time limit for fetching
	timeLimitFetch=180

	# Adds <days> to the current time to facilitate
	# crawling urls already fetched sooner than
	# db.default.fetch.interval.
	addDays=0
	#############################################

	# absolute path of the directory holding this script
	bin="$(dirname "$0")"
	bin="$(cd "$bin"; pwd)"

	# Succeeds when a command named hadoop can be resolved by the shell.
	# Replaces a "which hadoop \| wc -l" pipeline whose escaped pipe handed
	# "|", "wc" and "-l" to which as extra arguments.
	__hadoop_on_path() {
	    command -v hadoop >/dev/null 2>&1
	}

	# determines the mode based on the presence of the job file
	mode=local
	if [ -f "${bin}/../nutch.job" ]; then
	    mode=distributed
	fi

	# note that some of the options listed here could be set in the
	# corresponding hadoop site xml param file
	commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"

	# check that hadoop can be found on the path
	# (255 is the status bash reports for the historical "exit -1")
	if [ "$mode" = "distributed" ] && ! __hadoop_on_path; then
	    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode." >&2
	    exit 255
	fi


	# Run "$bin/nutch" with the given arguments, echoing the command line first.
	# Globals:   bin (read) - directory that contains the nutch launcher
	# Arguments: passed through unchanged to "$bin/nutch"
	# Outputs:   the command line on stdout; a failure report on stderr
	# Returns:   0 when nutch succeeds; on failure the whole script exits with
	#            nutch's exit value
	__bin_nutch() {
	    local rc

	    echo "$bin/nutch $*" # echo command and arguments
	    "$bin/nutch" "$@"
	    rc=$?

	    if [ "$rc" -ne 0 ]; then
	        # Diagnostics go to stderr so they stay visible when stdout is
	        # redirected or captured.
	        {
	            echo "Error running:"
	            echo " $bin/nutch $*"
	            echo "Failed with exit value $rc."
	        } >&2
	        exit "$rc"
	    fi
	}



	# initial injection
	echo "Injecting seed URLs"
	__bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"

	# Split the shared Hadoop options into words once. "read -a" splits on
	# whitespace only, without the pathname expansion that an unquoted
	# $commonOptions would also undergo.
	read -r -a common_opts <<< "$commonOptions"

	# enable the skipping of records for the parsing so that a dodgy document
	# does not fail the full task (loop-invariant, so built once)
	skip_records_opts=(
	    -D mapred.skip.attempts.to.start.skipping=2
	    -D mapred.skip.map.max.skip.records=1
	)

	# main loop : rounds of generate - fetch - parse - update
	for ((a = 1; a <= LIMIT; a++)); do
	    # a file named .STOP in the working directory ends the crawl early
	    if [ -e ".STOP" ]; then
	        echo "STOP file found - escaping loop"
	        break
	    fi

	    echo "$(date) : Iteration $a of $LIMIT"

	    echo "Generating batchId"
	    batchId="$(date +%s)-$RANDOM"

	    echo "Generating a new fetchlist"
	    generate_args=("${common_opts[@]}" -topN "$sizeFetchlist" -noNorm -noFilter -adddays "$addDays" -crawlId "$CRAWL_ID" -batchId "$batchId")
	    echo "$bin/nutch generate ${generate_args[*]}"
	    # generate is run directly rather than through __bin_nutch because
	    # exit value 1 is not treated as an error here
	    "$bin/nutch" generate "${generate_args[@]}"
	    RETCODE=$?
	    case "$RETCODE" in
	        0)
	            : # ok: no error
	            ;;
	        1)
	            echo "Generate returned 1 (no new segments created)"
	            echo "Escaping loop: no more URLs to fetch now"
	            break
	            ;;
	        *)
	            {
	                echo "Error running:"
	                echo " $bin/nutch generate ${generate_args[*]}"
	                echo "Failed with exit value $RETCODE."
	            } >&2
	            exit "$RETCODE"
	            ;;
	    esac

	    echo "Fetching : "
	    __bin_nutch fetch "${common_opts[@]}" -D fetcher.timelimit.mins="$timeLimitFetch" "$batchId" -crawlId "$CRAWL_ID" -threads 50

	    # parsing the batch
	    echo "Parsing : "
	    __bin_nutch parse "${common_opts[@]}" "${skip_records_opts[@]}" "$batchId" -crawlId "$CRAWL_ID"

	    # updatedb with this batch
	    echo "CrawlDB update for $CRAWL_ID"
	    __bin_nutch updatedb "${common_opts[@]}" "$batchId" -crawlId "$CRAWL_ID"

	    if [ -n "$SOLRURL" ]; then
	        echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
	        # the URL is quoted so characters such as ? or * are passed literally
	        __bin_nutch index "${common_opts[@]}" -D solr.server.url="$SOLRURL" -all -crawlId "$CRAWL_ID"

	        echo "SOLR dedup -> $SOLRURL"
	        __bin_nutch solrdedup "${common_opts[@]}" "$SOLRURL"
	    else
	        echo "Skipping indexing tasks: no SOLR url provided."
	    fi

	done

	exit 0