| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| # Usage: crawl [options] <crawl_dir> <num_rounds> |
| # |
| # Arguments: |
| # <crawl_dir> Directory where the crawl/host/link/segments dirs are saved |
| # <num_rounds> The number of rounds to run this crawl for |
| # |
| # Options: |
| # -i|--index Indexes crawl results into a configured indexer |
| # -D A Java property to pass to Nutch calls |
| # -w|--wait <NUMBER[SUFFIX]> Time to wait before generating a new segment when no URLs |
| # are scheduled for fetching. Suffix can be: s for second, |
| # m for minute, h for hour and d for day. If no suffix is |
| # specified second is used by default. [default: -1] |
| # -s <seed_dir> Path to seeds file(s) |
| # -sm <sitemap_dir> Path to sitemap URL file(s) |
| # |
| # --hostdbupdate Boolean indicator if we call hostdbupdate or not |
| # --hostdbgenerate Boolean indicator if we use hostdb in generate or not |
| # |
| # --num-slaves <num_slaves> Number of slave nodes [default: 1] |
| # Note: This can only be set when running in distribution mode |
| # --num-tasks <num_tasks> Number of reducer tasks [default: 2] |
| # --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000] |
| # --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180] |
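| # --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50] |
| # --sitemaps-from-hostdb <frequency> Whether and how often to process sitemaps based on HostDB. |
| # Supported values are: never [default], always (processing takes |
| # place in every iteration), once (processing only takes place in |
| # the first iteration) |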
| # |
| # |
| # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND |
| # INDEXING FOR EACH SEGMENT |
| |
#######################################
# Convert a wait-time specification into a number of seconds.
# Arguments:
#   $1 - NUMBER[SUFFIX]; SUFFIX is s (seconds), m (minutes), h (hours) or
#        d (days), in either case. Without a suffix seconds are assumed.
# Outputs:
#   The number of seconds on stdout; an error message on stderr if $1
#   contains no digits.
# Returns:
#   0 on success, 1 if $1 contains no digits.
#######################################
function __to_seconds() {
  local spec="$1"
  # Keep only the digits for the amount and only the unit letters for the
  # suffix; everything else in the argument is ignored.
  local number="${spec//[^0-9]/}"
  local modifier="${spec//[^sSmMhHdD]/}"
  local factor

  if [[ -z "$number" ]]; then
    echo "Error: invalid wait time '$spec' (expected NUMBER[s|m|h|d])" >&2
    return 1
  fi

  case "$modifier" in
    m|M) factor=60 ;;
    h|H) factor=3600 ;;
    d|D) factor=86400 ;;
    *) factor=1 ;; # "s", "S", no suffix or an unrecognised combination
  esac

  # The result is deliberately not stored in SECONDS: that is a special bash
  # variable which keeps counting up after being assigned.
  # 10# forces base 10 so that a value such as "08" is not rejected as octal.
  echo $(( 10#$number * factor ))
}
| |
# Print the command line help to stdout and terminate the script with
# exit status 1. Never returns to the caller.
function __print_usage {
  # Every entry is written with `echo -e`, so "\t" becomes a tab character.
  local -a help_lines=(
    ""
    "Arguments:"
    " <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved"
    " <num_rounds>\t\t\t\tThe number of rounds to run this crawl for"
    ""
    "Options:"
    " -i|--index\t\t\t\tIndexes crawl results into a configured indexer"
    " -D\t\t\t\t\tA Java property to pass to Nutch calls"
    " -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs"
    " \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second,"
    " \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is"
    " \t\t\t\t\tspecified second is used by default. [default: -1]"
    " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
    " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
    " --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
    " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
    " --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
    " \t\t\t\t\tNote: This can only be set when running in distribution mode"
    " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
    " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
    " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
    " --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]"
    " --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB."
    " \t\t\t\t\tSupported values are:"
    " \t\t\t\t\t - never [default]"
    " \t\t\t\t\t - always (processing takes place in every iteration)"
    " \t\t\t\t\t - once (processing only takes place in the first iteration)"
  )
  local help_line

  echo "Usage: crawl [options] <crawl_dir> <num_rounds>"
  for help_line in "${help_lines[@]}"; do
    echo -e "$help_line"
  done

  exit 1
}
| |
# default values
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
SITEMAPDIR="" # reset explicitly so a value is never inherited from the environment
NUM_SLAVES=1
NUM_TASKS=2 # 2 x NUM_SLAVES
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never

# Abort with the usage text unless the option in $1 is followed by a value.
# Called with the remaining command line ("$@"). Without this check
# `shift 2` would fail *without shifting* when the value is missing and the
# parsing loop below would never terminate.
function __require_value {
  if (( $# < 2 )); then
    echo "Error: option $1 requires an argument."
    echo -e ""
    __print_usage
  fi
}

# Consume leading options; stop at the first argument that is not a known
# option so that <crawl_dir> and <num_rounds> remain in "$@".
while (( $# > 0 )); do
  case "$1" in
    -i|--index)
      INDEXFLAG=true
      shift
      ;;
    -D)
      __require_value "$@"
      # -D may be repeated; later properties are prepended to earlier ones
      JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
      shift 2
      ;;
    -s)
      __require_value "$@"
      SEEDDIR="${2}"
      shift 2
      ;;
    -sm)
      __require_value "$@"
      SITEMAPDIR="${2}"
      shift 2
      ;;
    -w|--wait)
      __require_value "$@"
      WAIT="${2}"
      shift 2
      ;;
    --num-slaves)
      __require_value "$@"
      NUM_SLAVES="${2}"
      shift 2
      ;;
    --num-tasks)
      __require_value "$@"
      NUM_TASKS="${2}"
      shift 2
      ;;
    --size-fetchlist)
      __require_value "$@"
      SIZE_FETCHLIST="${2}"
      shift 2
      ;;
    --time-limit-fetch)
      __require_value "$@"
      TIME_LIMIT_FETCH="${2}"
      shift 2
      ;;
    --num-threads)
      __require_value "$@"
      NUM_THREADS="${2}"
      shift 2
      ;;
    --sitemaps-from-hostdb)
      __require_value "$@"
      SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
      shift 2
      ;;
    --hostdbupdate)
      HOSTDBUPDATE=true
      shift
      ;;
    --hostdbgenerate)
      HOSTDBGENERATE=true
      shift
      ;;
    *)
      break
      ;;
  esac
done

if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

# exactly two positional arguments must be left: <crawl_dir> <num_rounds>
if (( $# != 2 )); then
  __print_usage
fi

CRAWL_PATH="$1"
LIMIT="$2"

# The main loop compares LIMIT numerically and treats -1 as "no limit"; a
# non-integer value would make that comparison fail and the crawl would run
# without any limit, so reject it here.
if [[ ! "$LIMIT" =~ ^-?[0-9]+$ ]]; then
  echo "Error: <num_rounds> has to be an integer (-1 for an unlimited number of rounds)."
  echo -e ""
  __print_usage
fi

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  # WAIT is later used in a numeric test and passed to sleep
  if [[ ! "$WAIT" =~ ^[0-9]+$ ]]; then
    echo "Error: --wait <NUMBER[SUFFIX]> has to be a number, optionally followed by s, m, h or d."
    echo -e ""
    __print_usage
  fi
  echo "Time to wait (--wait) = $WAIT sec."
fi
| |
# absolute path of the directory containing this script
bin="$(dirname "$0")"
bin="$(cd "$bin" && pwd)"

# Determine the mode from the presence of a Nutch job file next to bin/:
# "distributed" if at least one regular file matches, "local" otherwise.
# A loop is used instead of `[ -f <glob> ]` because that test breaks with
# "too many arguments" as soon as the glob matches more than one file.
mode=local
for job_file in "${bin}"/../*nutch*.job; do
  if [ -f "$job_file" ]; then
    mode=distributed
    break
  fi
done
if [[ "$mode" = "local" ]]; then
  # --num-slaves is only honoured in distributed mode
  NUM_SLAVES=1
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$NUM_TASKS -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"

# check that hadoop can be found on the path
if [ "$mode" = "distributed" ]; then
  if ! command -v hadoop > /dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    # same status the former `exit -1` produced in bash, spelled portably
    exit 255
  fi
fi
| |
| |
# Echo and run "$bin/nutch" with the given arguments.
# Globals:   bin (read), RETCODE (written: exit status of the nutch call)
# Arguments: passed through unchanged to "$bin/nutch"
# Returns:   0 when nutch succeeds; otherwise reports the failing command
#            and exits the whole script with nutch's exit status.
function __bin_nutch {
  # log the command line before running it
  echo "$bin/nutch $@"

  "$bin/nutch" "$@"
  RETCODE=$?

  # success: hand control back to the caller
  [ "$RETCODE" -eq 0 ] && return 0

  echo "Error running:"
  echo " $bin/nutch $@"
  echo "Failed with exit value $RETCODE."
  exit "$RETCODE"
}
| |
# Check whether a directory exists: on the local file system in local mode,
# via `hadoop fs -test -d` in distributed mode.
# Globals:   mode (read)
# Arguments: $1 - directory path
# Returns:   0 if the directory exists, 1 otherwise (including any other mode)
function __directory_exists {
  local dir="$1"

  case "$mode" in
    local)
      [[ -d "$dir" ]] && return 0
      ;;
    distributed)
      hadoop fs -test -d "$dir" && return 0
      ;;
  esac

  return 1
}
| |
# Run `nutch updatehostdb` from $CRAWL_PATH/crawldb into $CRAWL_PATH/hostdb.
# Does nothing (and returns 0) when the crawldb directory does not exist.
# Globals: CRAWL_PATH (read)
function __update_hostdb {
  # skip silently when there is no crawldb directory
  __directory_exists "$CRAWL_PATH"/crawldb || return 0

  echo "Updating HostDB"
  __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
}
| |
# initial injection: only when a seed directory was given with -s
[[ -z $SEEDDIR ]] || {
  echo "Injecting seed URLs"
  __bin_nutch inject "$CRAWL_PATH/crawldb" "$SEEDDIR"
}

# sitemap processing based on sitemap definition file(s): only when a
# sitemap directory was given with -sm
[[ -z $SITEMAPDIR ]] || {
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "$CRAWL_PATH"/crawldb -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
}
| |
# main loop : rounds of generate - fetch - parse - update
# Runs until LIMIT rounds are done (LIMIT = -1: no limit), a file named
# .STOP exists in the current working directory, or generate reports that
# there is nothing to fetch and no wait time was configured.
for ((a=1; ; a++)); do
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ "$LIMIT" -ne -1 ]; then
    if [ "$a" -gt "$LIMIT" ]; then
      echo $(date) ": Finished loop with $LIMIT iterations"
      break
    fi
    echo $(date) ": Iteration $a of $LIMIT"
  else
    echo $(date) ": Iteration $a"
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    if [[ $a -eq 1 ]]; then
      __update_hostdb
    fi

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
    fi
  fi

  echo "Generating a new segment"
  # $commonOptions is intentionally unquoted: it holds several words that
  # must be passed as separate arguments.
  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
    generate_args+=(-hostdb "$CRAWL_PATH"/hostdb)
  fi

  # generate is not run through __bin_nutch because exit value 1 is not an
  # error here: it means that no new segment was created.
  echo "$bin/nutch generate ${generate_args[@]}"
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep "$WAIT"
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo " $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment
  # call hadoop in distributed mode
  # or use ls
  # In both cases the entry that sorts last numerically is taken.
  if [ "$mode" = "local" ]; then
    SEGMENT=$(ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1)
  else
    SEGMENT=$(hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | grep -E '20[0-9]+' | sort -n | tail -n 1)
  fi

  # Without a segment name the steps below would operate on the segments
  # directory itself, so stop here instead.
  if [[ -z "$SEGMENT" ]]; then
    echo "Error: could not determine the name of the generated segment in $CRAWL_PATH/segments"
    exit 1
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/"$SEGMENT" -threads $NUM_THREADS

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable the skipping of records for the parsing so that a dodgy document
  # does not fail the full task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/"$SEGMENT"

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/"$SEGMENT"

  # note that the link inversion - indexing routine can be done within the main loop
  # on a per segment basis
  echo "Link inversion"
  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT"

  echo "Dedup on crawldb"
  __bin_nutch dedup "$CRAWL_PATH"/crawldb

  if [[ "$INDEXFLAG" == "true" ]]; then
    echo "Indexing $SEGMENT to index"
    # $JAVA_PROPERTIES is intentionally unquoted: it may hold several -D options
    __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/"$SEGMENT"

    echo "Cleaning up index if possible"
    __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
  else
    echo "Skipping indexing ..."
  fi

  # only announce the HostDB update when it was requested with --hostdbupdate
  if [[ "$HOSTDBUPDATE" == "true" ]]; then
    echo "HostDB update"
    __update_hostdb
  fi

  #######################################################
  # The following commands fall into WebGraph territory
  # and should be uncommented based on your requirements
  #######################################################
  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

done

exit 0