| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
# The Crawl command script : crawl <seedDir> <crawlId> [<solrURL>] <numberOfRounds>
| # |
| # |
| # UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND |
| # INDEXING FOR EACH BATCH |
| |
# ---------------------------------------------------------------------------
# Command-line handling.
#
# Accepted forms:
#   crawl <seedDir> <crawlID> <numberOfRounds>
#   crawl <seedDir> <crawlID> <solrURL> <numberOfRounds>
#
# Sets SEEDDIR, CRAWL_ID and LIMIT. SOLRURL is only assigned in the
# four-argument form; with three arguments any value inherited from the
# environment is left untouched.
# ---------------------------------------------------------------------------

# Print each argument as one line on stderr, then abort the script.
# Status 255 is what bash produced for the original out-of-range `exit -1`.
function __usage_error {
  printf '%s\n' "$@" >&2
  exit 255
}

SEEDDIR="$1"
CRAWL_ID="$2"
case "$#" in
  3)
    LIMIT="$3"
    ;;
  4)
    SOLRURL="$3"
    LIMIT="$4"
    ;;
  *)
    __usage_error "Unknown # of arguments $#" \
      "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>"
    ;;
esac

if [[ -z "$SEEDDIR" ]]; then
  __usage_error "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
fi

if [[ -z "$CRAWL_ID" ]]; then
  __usage_error "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
fi

# Informational only: the crawl still runs, it just does not index.
if [[ -z "$SOLRURL" ]]; then
  echo "No SOLRURL specified. Skipping indexing."
fi

if [[ -z "$LIMIT" ]]; then
  __usage_error "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
fi

# LIMIT is used as the bound of an arithmetic for-loop. Inside (( )) a
# non-numeric word is evaluated as an expression (silently yielding 0 rounds
# or worse), so accept plain decimal digits only.
if ! [[ "$LIMIT" =~ ^[0-9]+$ ]]; then
  __usage_error "numberOfRounds must be a non-negative integer, got '$LIMIT' : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
fi
# Force base 10 so that values with leading zeros (e.g. 08) are not
# rejected later as invalid octal numbers.
LIMIT=$((10#$LIMIT))
| |
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################

# number of slave nodes
numSlaves=1

# total number of available tasks (two per slave node);
# sets the Hadoop parameter "mapred.reduce.tasks"
numTasks=$((numSlaves * 2))

# number of URLs to fetch in one iteration (50000 per slave node)
# NOTE(review): the previous comment asked "250K per task?", which does not
# match the 50000 multiplier used here - confirm the intended value.
sizeFetchlist=$((numSlaves * 50000))

# time limit for fetching (passed as fetcher.timelimit.mins)
timeLimitFetch=180

# Adds <days> to the current time to facilitate
# crawling URLs already fetched sooner than
# db.default.fetch.interval.
addDays=0
#############################################
| |
# Absolute path of the directory that contains this script.
# cd's stdout is discarded because cd prints the target directory when it
# resolves it through CDPATH, which would otherwise corrupt $bin.
bin="$(dirname -- "$0")"
if ! bin="$(cd -- "$bin" >/dev/null && pwd)"; then
  echo "Can't resolve the script directory from $0" >&2
  exit 255 # same status as the script's other fatal errors (`exit -1`)
fi

# Print "distributed" when at least one regular file matching *nutch*.job
# exists in the parent of directory $1, "local" otherwise.
# Each glob match is tested individually: handing several matches to a
# single `[ -f ... ]` makes the test fail with a syntax error.
function __detect_mode {
  local dir="$1"
  local job
  for job in "$dir"/../*nutch*.job; do
    if [ -f "$job" ]; then
      echo "distributed"
      return 0
    fi
  done
  echo "local"
}

# determines the run mode based on the presence of a job file
mode="$(__detect_mode "$bin")"
| |
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
# Kept as one space-separated string: the rest of the script expands it
# unquoted so that each word becomes a separate argument.
commonOptions="-D mapred.reduce.tasks=$numTasks"
commonOptions="$commonOptions -D mapred.child.java.opts=-Xmx1000m"
commonOptions="$commonOptions -D mapred.reduce.tasks.speculative.execution=false"
commonOptions="$commonOptions -D mapred.map.tasks.speculative.execution=false"
commonOptions="$commonOptions -D mapred.compress.map.output=true"

# check that hadoop can be found on the path
# `command -v` reports the result through its exit status, unlike counting
# the output lines of `which`, whose output format varies between systems.
if [ "$mode" = "distributed" ]; then
  if ! command -v hadoop >/dev/null 2>&1; then
    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode." >&2
    exit 255 # status bash produced for the original `exit -1`
  fi
fi
| |
| |
# Run "$bin/nutch" with the given arguments and abort the whole script if it
# fails.
# Globals:   bin (read) - directory containing the nutch launcher
# Arguments: passed through unchanged to "$bin/nutch"
# Outputs:   the command line on stdout before running it; a failure report
#            on stderr when the command exits non-zero
# Returns:   0 on success; on failure the script exits with nutch's status
function __bin_nutch {
  local retcode

  # echo command and arguments ($* joins them with single spaces)
  echo "$bin/nutch $*"
  "$bin/nutch" "$@"
  retcode=$?

  if [ "$retcode" -ne 0 ]; then
    {
      echo "Error running:"
      echo " $bin/nutch $*"
      echo "Failed with exit value $retcode."
    } >&2
    exit "$retcode"
  fi
}
| |
| |
| |
# initial injection
echo "Injecting seed URLs"
__bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"

# $commonOptions is a single space-separated string. Split it once into an
# array so every later expansion can be quoted (word splitting without
# accidental pathname expansion).
read -r -a common_opts <<< "$commonOptions"

# enable the skipping of records for the parsing so that a dodgy document
# does not fail the full task
skip_records_opts=(
  -D mapred.skip.attempts.to.start.skipping=2
  -D mapred.skip.map.max.skip.records=1
)

# main loop : rounds of generate - fetch - parse - update
for ((a = 1; a <= LIMIT; a++)); do
  # a file named .STOP in the current directory ends the crawl early
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  echo "$(date)" ": Iteration $a of $LIMIT"

  echo "Generating batchId"
  batchId="$(date +%s)-$RANDOM"

  echo "Generating a new fetchlist"
  generate_args=(
    "${common_opts[@]}"
    -topN "$sizeFetchlist" -noNorm -noFilter -adddays "$addDays"
    -crawlId "$CRAWL_ID" -batchId "$batchId"
  )
  echo "$bin/nutch generate ${generate_args[*]}"
  # generate is not run through __bin_nutch because exit value 1 is not an
  # error here: it means there is nothing left to fetch.
  "$bin/nutch" generate "${generate_args[@]}"
  RETCODE=$?
  if [ "$RETCODE" -eq 0 ]; then
    : # ok: no error
  elif [ "$RETCODE" -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"
    echo "Escaping loop: no more URLs to fetch now"
    break
  else
    {
      echo "Error running:"
      echo " $bin/nutch generate ${generate_args[*]}"
      echo "Failed with exit value $RETCODE."
    } >&2
    exit "$RETCODE"
  fi

  echo "Fetching : "
  __bin_nutch fetch "${common_opts[@]}" \
    -D "fetcher.timelimit.mins=$timeLimitFetch" \
    "$batchId" -crawlId "$CRAWL_ID" -threads 50

  # parsing the batch
  echo "Parsing : "
  __bin_nutch parse "${common_opts[@]}" "${skip_records_opts[@]}" \
    "$batchId" -crawlId "$CRAWL_ID"

  # updatedb with this batch
  echo "CrawlDB update for $CRAWL_ID"
  __bin_nutch updatedb "${common_opts[@]}" "$batchId" -crawlId "$CRAWL_ID"

  if [ -n "$SOLRURL" ]; then
    echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
    __bin_nutch index "${common_opts[@]}" \
      -D "solr.server.url=$SOLRURL" -all -crawlId "$CRAWL_ID"

    echo "SOLR dedup -> $SOLRURL"
    __bin_nutch solrdedup "${common_opts[@]}" "$SOLRURL"
  else
    echo "Skipping indexing tasks: no SOLR url provided."
  fi

done

exit 0
| |