#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
# <crawl_dir> Directory where the crawl/host/link/segments dirs are saved
# <num_rounds> The number of rounds to run this crawl for
#
# Options:
# -i|--index Indexes crawl results into a configured indexer
# -D <property>=<value> A Nutch or Hadoop property to pass to Nutch calls overwriting
# properties defined in configuration files, e.g.
# increase content limit to 2MB:
# -D http.content.limit=2097152
# (in distributed mode) configure memory of map and reduce tasks:
# -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m
# -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
# -w|--wait <NUMBER[SUFFIX]> Time to wait before generating a new segment when no URLs
# are scheduled for fetching. Suffix can be: s for second,
# m for minute, h for hour and d for day. If no suffix is
# specified, seconds are assumed. [default: -1]
# -s <seed_dir> Path to seeds file(s)
# -sm <sitemap_dir> Path to sitemap URL file(s)
#
# --hostdbupdate Whether to update the HostDB in every crawl round
# --hostdbgenerate Whether to use the HostDB when generating fetch lists
#
# --num-fetchers <num_fetchers> Number of tasks used for fetching (fetcher map tasks) [default: 1]
# Note: This can only be set when running in distributed mode and
# should correspond to the number of worker nodes in the cluster.
# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
# --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000]
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
# --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50]
#
# --sitemaps-from-hostdb <frequency> Whether and how often to process sitemaps based on HostDB.
# Supported values are: never [default], always (processing takes place in
# every iteration) and once (processing only takes place in the first iteration)
#
# --dedup-group <none|host|domain> Deduplication group method [default: none]
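#
# Example (paths are hypothetical, for illustration only):
#   crawl -i -s urls/ -w 5m --num-threads 50 crawl/ 3
# injects the seed URLs in urls/, runs 3 generate-fetch-parse-update rounds,
# indexes each segment and, when no URLs are scheduled for fetching, waits
# 5 minutes before generating the next segment.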
#
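# convert a wait time given as NUMBER[SUFFIX] (s, m, h or d) into seconds,
# e.g. "30m" -> 1800, "2h" -> 7200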
function __to_seconds() {
NUMBER=$(echo $1 | tr -dc '0-9')
MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')
case $MODIFIER in
m|M)
SECONDS=`expr $NUMBER \* 60`
;;
h|H)
SECONDS=`expr $NUMBER \* 3600`
;;
d|D)
SECONDS=`expr $NUMBER \* 86400`
;;
s|S|*)
SECONDS=$NUMBER
;;
esac
echo $SECONDS
}
function __print_usage {
echo "Usage: crawl [options] <crawl_dir> <num_rounds>"
echo -e ""
echo -e "Arguments:"
echo -e " <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved"
echo -e " <num_rounds>\t\t\t\tThe number of rounds to run this crawl for"
echo -e ""
echo -e "Options:"
echo -e " -i|--index\t\t\t\tIndexes crawl results into a configured indexer"
echo -e " -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting"
echo -e " \t\t\t\t\tproperties defined in configuration files, e.g."
echo -e " \t\t\t\t\tincrease content limit to 2MB:"
echo -e " \t\t\t\t\t -D http.content.limit=2097152"
echo -e " \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:"
echo -e " \t\t\t\t\t -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m"
echo -e " \t\t\t\t\t -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m"
echo -e " -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs"
echo -e " \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second,"
echo -e " \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is"
echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]"
echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
echo -e " --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
echo -e " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
echo -e " --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]"
echo -e " \t\t\t\t\tNote: This can only be set when running in distributed mode and"
echo -e " \t\t\t\t\t should correspond to the number of worker nodes in the cluster."
echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
echo -e " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
echo -e " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
echo -e " --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]"
echo -e " --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB."
echo -e " \t\t\t\t\tSupported values are:"
echo -e " \t\t\t\t\t - never [default]"
echo -e " \t\t\t\t\t - always (processing takes place in every iteration)"
echo -e " \t\t\t\t\t - once (processing only takes place in the first iteration)"
echo -e " -dedup-group <none|host|domain>\tDeduplication group method [default: none]"
exit 1
}
# default values
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
HADOOP_PROPERTIES=()
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
NUM_FETCHERS=1
NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
DEDUP_GROUP=none
while [[ $# -gt 0 ]]
do
case $1 in
-i|--index)
INDEXFLAG=true
shift
;;
-D)
HADOOP_PROPERTIES=("${HADOOP_PROPERTIES[@]}" -D"${2}")
shift 2
;;
-s)
SEEDDIR="${2}"
shift 2
;;
-sm)
SITEMAPDIR="${2}"
shift 2
;;
-w|--wait)
WAIT="${2}"
shift 2
;;
--num-slaves)
# backward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
NUM_FETCHERS="${2}"
shift 2
;;
--num-fetchers)
NUM_FETCHERS="${2}"
shift 2
;;
--num-tasks)
NUM_TASKS="${2}"
shift 2
;;
--size-fetchlist)
SIZE_FETCHLIST="${2}"
shift 2
;;
--time-limit-fetch)
TIME_LIMIT_FETCH="${2}"
shift 2
;;
--num-threads)
NUM_THREADS="${2}"
shift 2
;;
--sitemaps-from-hostdb)
SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
shift 2
;;
--dedup-group)
DEDUP_GROUP="${2}"
shift 2
;;
--hostdbupdate)
HOSTDBUPDATE=true
shift
;;
--hostdbgenerate)
HOSTDBGENERATE=true
shift
;;
*)
break
;;
esac
done
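# validate option values before checking the remaining positional arguments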
if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
echo -e ""
__print_usage
fi
if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
echo "Error: --dedup-group <mode> has to be one of none, host, domain."
echo -e ""
__print_usage
fi
if [[ $# != 2 ]]; then
__print_usage
fi
CRAWL_PATH="$1"
LIMIT="$2"
# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
WAIT=$( __to_seconds "$WAIT" )
echo "Time to wait (--wait) = $WAIT sec."
fi
bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"
# determine the mode (local or distributed) based on the presence of a job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
if [[ "$mode" = "local" ]]; then
if [[ "$NUM_FETCHERS" -ne 1 ]]; then
echo "Ignoring configured number of fetchers (--num_fetchers): a single fetcher task is used when running in local mode."
fi
NUM_FETCHERS=1
fi
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
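# speculative execution is disabled so that map and reduce tasks are not
# launched twice, and map output compression reduces the shuffle volume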
commonOptions=("${HADOOP_PROPERTIES[@]}" -Dmapreduce.job.reduces=$NUM_TASKS -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true)
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
if [ $(which hadoop | wc -l ) -eq 0 ]; then
echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
exit -1;
fi
fi
function __bin_nutch {
# run $bin/nutch, exit if exit value indicates error
echo "$bin/nutch $@" ;# echo command and arguments
"$bin/nutch" "$@"
RETCODE=$?
if [ $RETCODE -ne 0 ]
then
echo "Error running:"
echo " $bin/nutch $@"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
}
# check if directory exists locally or on hdfs
function __directory_exists {
if [[ "$mode" == local && -d "$1" ]]; then
return 0
elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
return 0
else
return 1
fi
}
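# create or update the HostDB from the CrawlDB (skipped if the CrawlDB does not exist yet)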
function __update_hostdb {
if __directory_exists "$CRAWL_PATH"/crawldb; then
echo "Updating HostDB"
__bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
fi
}
# initial injection
if [[ ! -z $SEEDDIR ]]; then
echo "Injecting seed URLs"
__bin_nutch inject "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
# sitemap processing based on sitemap definition file(s)
if [[ ! -z $SITEMAPDIR ]]; then
echo "Processing sitemaps defined in $SITEMAPDIR"
__bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
fi
# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
if [ -e ".STOP" ]; then
echo "STOP file found - escaping loop"
break
fi
if [ $LIMIT -ne -1 ]; then
if [ $a -gt $LIMIT ]; then
echo `date` ": Finished loop with $LIMIT iterations"
break
fi
echo `date` ": Iteration $a of $LIMIT"
else
echo `date` ": Iteration $a"
fi
if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
# create / update HostDB on first run
[[ $a -eq 1 ]] && __update_hostdb
# sitemap processing based on HostDB
if __directory_exists "$CRAWL_PATH"/hostdb; then
echo "Processing sitemaps based on hosts in HostDB"
__bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
fi
fi
echo "Generating a new segment"
if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
else
generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
fi
echo "$bin/nutch generate ${generate_args[@]}"
$bin/nutch generate "${generate_args[@]}"
RETCODE=$?
if [ $RETCODE -eq 0 ]; then
: # ok: no error
elif [ $RETCODE -eq 1 ]; then
echo "Generate returned 1 (no new segments created)"
if [ "$WAIT" -ne -1 ]; then
echo "Waiting for $WAIT sec. ..."
sleep $WAIT
continue
else
echo "Escaping loop: no more URLs to fetch now"
break
fi
else
echo "Error running:"
echo " $bin/nutch generate ${generate_args[@]}"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
# capture the name of the new segment:
# use 'hadoop fs -ls' in distributed mode, plain 'ls' in local mode
if [ $mode = "local" ]; then
SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
else
SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
fi
echo "Operating on segment : $SEGMENT"
# fetching the segment
echo "Fetching : $SEGMENT"
__bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS
# parsing the segment
echo "Parsing : $SEGMENT"
# enable the skipping of records for the parsing so that a dodgy document
# does not fail the whole task
skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
__bin_nutch parse "${commonOptions[@]}" $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
# updatedb with this segment
echo "CrawlDB update"
__bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
echo "HostDB update"
if $HOSTDBUPDATE; then
__update_hostdb
fi
# Note that all steps below in this loop (link inversion, deduplication, indexing)
# can be done
# - either inside the loop on a per segment basis
# - or after the loop over all segments created in all loop iterations
# (both invertlinks and index accept multiple segments as input)
# The latter is more efficient but the index is then updated later.
echo "Link inversion"
__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter
echo "Dedup on crawldb"
__bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"
__bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
else
echo "Skipping indexing ..."
fi
done
#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
exit 0