| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| # Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds> |
| # -i|--index Indexes crawl results into a configured indexer |
| # -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs |
| # are scheduled for fetching. Suffix can be: s for second, |
| # m for minute, h for hour and d for day. If no suffix is |
| # specified second is used by default. |
| # -D A Java property to pass to Nutch calls |
| # -s Path to seeds file(s) |
| # Crawl Dir Directory where the crawl/link/segments dirs are saved |
| # Num Rounds The number of rounds to run this crawl for |
| # |
| # |
| # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND |
| # INDEXING FOR EACH SEGMENT |
| |
# Defaults for the command-line options parsed below.
INDEXFLAG=false    # set true by -i/--index: index each segment after updatedb
JAVA_PROPERTIES="" # accumulates "-D key=value" properties forwarded to Nutch index/clean calls
WAIT=-1 # don't wait if there are no URLs to fetch
| |
# Convert a time specification NUMBER[SUFFIX] to seconds, written to stdout.
# Suffixes (upper- or lower-case): s = seconds (default), m = minutes,
# h = hours, d = days.
function __to_seconds() {
  local number modifier seconds

  number=$(echo "$1" | tr -dc '0-9')
  # Keep only recognised suffix letters. The previous pattern
  # '[^s|h|m|d]]' did not include upper-case letters, so e.g. "2H"
  # silently fell through to the seconds default.
  modifier=$(echo "$1" | tr -dc 'smhdSMHD')

  case $modifier in
    m|M)
      seconds=$((number * 60))
      ;;
    h|H)
      # was 'expr $NUMBER \* 120' — an hour is 3600 seconds, not 120
      seconds=$((number * 3600))
      ;;
    d|D)
      seconds=$((number * 86400))
      ;;
    s|S|*)
      seconds=$number
      ;;
  esac

  # Note: intentionally not using the name SECONDS, which is a special
  # auto-incrementing bash variable.
  echo $seconds
}
| |
SEEDDIR=""

# Parse optional flags; the first unrecognised token ends option parsing
# and is treated as the start of the positional <Crawl Dir> <Num Rounds>.
# Note: '[[ $# -gt 0 ]]' replaces '[[ $# > 0 ]]', which compared the
# argument count lexicographically as a string.
while [[ $# -gt 0 ]]
do
    case $1 in
        -i|--index)
            INDEXFLAG=true
            shift
            ;;
        -D)
            JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
            # 'shift 2' fails without shifting when the option value is
            # missing; break instead of looping forever on the same token
            # (the usage check below then reports the malformed call).
            shift 2 || break
            ;;
        -s)
            SEEDDIR="${2}"
            shift 2 || break
            ;;
        -w|--wait)
            WAIT="${2}"
            shift 2 || break
            ;;
        *)
            break
            ;;
    esac
done

# Exactly two positional arguments are required.
if [[ $# != 2 ]]; then
    echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds>"
    echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
    echo -e "\t-D\t\tA Java property to pass to Nutch calls"
    echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
    echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
    echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
    echo -e "\t\t\tspecified second is used by default."
    echo -e "\t-s Seed Dir\tPath to seeds file(s)"
    echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
    echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
    exit 1
fi

CRAWL_PATH="$1"
LIMIT="$2"

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
    WAIT=$( __to_seconds "$WAIT" )
    echo "Time to wait (--wait) = $WAIT sec."
fi
| |
| ############################################# |
| # MODIFY THE PARAMETERS BELOW TO YOUR NEEDS # |
| ############################################# |
| |
| # set the number of slaves nodes |
| numSlaves=1 |
| |
| # and the total number of available tasks |
| # sets Hadoop parameter "mapreduce.job.reduces" |
| numTasks=`expr $numSlaves \* 2` |
| |
| # number of urls to fetch in one iteration |
| # 250K per task? |
| sizeFetchlist=`expr $numSlaves \* 50000` |
| |
| # time limit for feching |
| timeLimitFetch=180 |
| |
| # num threads for fetching |
| numThreads=50 |
| |
| ############################################# |
| |
| bin="`dirname "$0"`" |
| bin="`cd "$bin"; pwd`" |
| |
| # determines whether mode based on presence of job file |
| mode=local |
| if [ -f "${bin}"/../*nutch*.job ]; then |
| mode=distributed |
| fi |
| |
| # note that some of the options listed here could be set in the |
| # corresponding hadoop site xml param file |
| commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true" |
| |
| # check that hadoop can be found on the path |
| if [ $mode = "distributed" ]; then |
| if [ $(which hadoop | wc -l ) -eq 0 ]; then |
| echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode." |
| exit -1; |
| fi |
| fi |
| |
| |
# Run a Nutch sub-command via "$bin/nutch", echoing the full invocation
# first. On failure, report the command and its status, then abort the
# whole crawl script with that exit value.
function __bin_nutch {
  echo "$bin/nutch $@" ;# echo command and arguments
  "$bin/nutch" "$@"
  RETCODE=$?

  # guard clause: nothing more to do on success
  [ $RETCODE -eq 0 ] && return 0

  echo "Error running:"
  echo " $bin/nutch $@"
  echo "Failed with exit value $RETCODE."
  exit $RETCODE
}
| |
# initial injection: seed the crawldb with the start URLs. Only done when
# a seed dir was supplied with -s, so a resumed crawl can skip it.
if [[ ! -z $SEEDDIR ]]
then
  echo "Injecting seed URLs"
  __bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
  # a ".STOP" file in the working directory requests a graceful shutdown
  # between iterations
  if [ -e ".STOP" ]
  then
    echo "STOP file found - escaping loop"
    break
  fi

  # LIMIT (<Num Rounds>) bounds the number of iterations; -1 means run
  # until generate produces no more URLs
  if [ $LIMIT -ne -1 ]; then
    if [ $a -gt $LIMIT ]; then
      echo `date` ": Finished loop with $LIMIT iterations"
      break
    fi
    echo `date` ": Iteration $a of $LIMIT"
  else
    echo `date` ": Iteration $a"
  fi

  echo "Generating a new segment"
  generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  # generate is invoked directly rather than through __bin_nutch because
  # exit code 1 is not fatal here: it signals that no URLs are currently
  # due for fetching
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    # with -w/--wait set, sleep and retry instead of terminating
    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep $WAIT
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo " $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment just generated (the entry that sorts
  # last under segments/):
  # call hadoop in distributed mode
  # or use ls

  if [ $mode = "local" ]; then
    SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
  else
    SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment (-noParsing: parsing runs as its own step below)
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable the skipping of records for the parsing so that a dodgy
  # document does not fail the full task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT

  # updatedb with this segment: merge the fetch results back into the crawldb
  echo "CrawlDB update"
  __bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT

  # note that the link inversion - indexing routine can be done within the main loop
  # on a per segment basis
  echo "Link inversion"
  __bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

  # remove duplicate entries from the crawldb before (optionally) indexing
  echo "Dedup on crawldb"
  __bin_nutch dedup "$CRAWL_PATH"/crawldb

  # index and clean only when -i/--index was given
  if $INDEXFLAG; then
    echo "Indexing $SEGMENT to index"
    __bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT

    echo "Cleaning up index if possible"
    __bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
  else
    echo "Skipping indexing ..."
  fi

  #######################################################
  # The following commands fall into WebGraph territory
  # and should be uncommented based on your requirements
  #######################################################
  #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
  #__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

  #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
  #__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
  #__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"

  #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
  #__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

  #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
  #__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

done

exit 0