#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The Crawl command script : crawl <seedDir> <crawlId> [<solrURL>] <numberOfRounds>
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH BATCH
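#
# Each round runs: generate -> fetch -> parse -> updatedb, followed by
# indexing and deduplication against Solr when a Solr URL is given.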
SEEDDIR="$1"
CRAWL_ID="$2"
if [ "$#" -eq 3 ]; then
LIMIT="$3"
elif [ "$#" -eq 4 ]; then
SOLRURL="$3"
LIMIT="$4"
else
echo "Unknown # of arguments $#"
echo "Usage: crawl <seedDir> <crawlID> [<solrUrl>] <numberOfRounds>"
exit -1;
fi
if [ "$SEEDDIR" = "" ]; then
echo "Missing seedDir : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
if [ "$CRAWL_ID" = "" ]; then
echo "Missing crawlID : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
if [ "$SOLRURL" = "" ]; then
echo "No SOLRURL specified. Skipping indexing."
fi
if [ "$LIMIT" = "" ]; then
echo "Missing numberOfRounds : crawl <seedDir> <crawlID> [<solrURL>] <numberOfRounds>"
exit -1;
fi
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################
# set the number of slave nodes
numSlaves=1
# and the total number of available tasks
# sets Hadoop parameter "mapred.reduce.tasks"
numTasks=`expr $numSlaves \* 2`
# number of URLs to fetch in one iteration
# (50K per slave node with the value above; adjust to your cluster)
sizeFetchlist=`expr $numSlaves \* 50000`
# time limit for fetching, in minutes
timeLimitFetch=180
# Adds <days> to the current time to facilitate
# crawling URLs already fetched sooner than
# db.default.fetch.interval.
addDays=0
#############################################
bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"
# determine the mode (local or distributed) based on the presence of a job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
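# (the mode is only used below to check that the hadoop executable is on the PATH;
#  bin/nutch itself picks local or distributed execution based on the job file)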
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
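# (the mapred.* names below are the old Hadoop property names; Hadoop 2.x
#  translates them to their mapreduce.* equivalents and logs deprecation warnings)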
commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
  if [ $(which hadoop | wc -l ) -eq 0 ]; then
    echo "Can't find Hadoop executable. Add HADOOP_HOME/bin to the path or run in local mode."
    exit -1;
  fi
fi
function __bin_nutch {
  # run $bin/nutch, exit if exit value indicates error
  echo "$bin/nutch $@" ;# echo command and arguments
  "$bin/nutch" "$@"

  RETCODE=$?
  if [ $RETCODE -ne 0 ]
  then
    echo "Error running:"
    echo "  $bin/nutch $@"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi
}
# initial injection
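# (inject reads the plain-text seed list(s) in $SEEDDIR and adds the URLs
#  to the web table of this crawl)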
echo "Injecting seed URLs"
__bin_nutch inject "$SEEDDIR" -crawlId "$CRAWL_ID"
# main loop : rounds of generate - fetch - parse - update
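# (touch a file named .STOP in the working directory to break out of the loop
#  before the next round starts)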
for ((a=1; a <= LIMIT ; a++))
do
  if [ -e ".STOP" ]
  then
    echo "STOP file found - escaping loop"
    break
  fi

  echo `date` ": Iteration $a of $LIMIT"

  echo "Generating batchId"
  batchId=`date +%s`-$RANDOM

  echo "Generating a new fetchlist"
  generate_args=($commonOptions -topN $sizeFetchlist -noNorm -noFilter -adddays $addDays -crawlId "$CRAWL_ID" -batchId $batchId)
  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
    : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"
    echo "Escaping loop: no more URLs to fetch now"
    break
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi
echo "Fetching : "
__bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch $batchId -crawlId "$CRAWL_ID" -threads 50
# parsing the batch
echo "Parsing : "
# enable the skipping of records for the parsing so that a dodgy document
# so that it does not fail the full task
skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
__bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId "$CRAWL_ID"
# updatedb with this batch
echo "CrawlDB update for $CRAWL_ID"
__bin_nutch updatedb $commonOptions $batchId -crawlId "$CRAWL_ID"
  if [ -n "$SOLRURL" ]; then
    echo "Indexing $CRAWL_ID on SOLR index -> $SOLRURL"
    __bin_nutch index $commonOptions -D solr.server.url=$SOLRURL -all -crawlId "$CRAWL_ID"

    echo "SOLR dedup -> $SOLRURL"
    __bin_nutch solrdedup $commonOptions $SOLRURL
  else
    echo "Skipping indexing tasks: no SOLR url provided."
  fi
done
exit 0