#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [-i|--index] [-D "key=value"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds>
# -i|--index Indexes crawl results into a configured indexer
# -w|--wait NUMBER[SUFFIX] Time to wait before generating a new segment when no URLs
# are scheduled for fetching. Suffix can be: s for second,
# m for minute, h for hour and d for day. If no suffix is
# specified, seconds are assumed.
# -D A Java property to pass to Nutch calls
# -s Path to seeds file(s)
# Crawl Dir Directory where the crawl/link/segments dirs are saved
# Num Rounds The number of rounds to run this crawl for
#
#
# UNLIKE THE NUTCH ALL-IN-ONE-CRAWL COMMAND THIS SCRIPT DOES THE LINK INVERSION AND
# INDEXING FOR EACH SEGMENT
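#
# Example invocation (the seed directory, crawl directory, round count and the
# indexer property are illustrative placeholders; adapt them to your setup):
#
#   bin/crawl -i -D solr.server.url=http://localhost:8983/solr/nutch -s urls/ crawl/ 2
#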
INDEXFLAG=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
function __to_seconds() {
# split the argument into its numeric part and optional unit suffix
NUMBER=$(echo "$1" | tr -dc '0-9')
MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')
# use a plain variable: SECONDS is a special bash variable and must not be overwritten
case $MODIFIER in
m|M)
WAIT_SECS=$(expr $NUMBER \* 60)
;;
h|H)
WAIT_SECS=$(expr $NUMBER \* 3600)
;;
d|D)
WAIT_SECS=$(expr $NUMBER \* 86400)
;;
s|S|*)
WAIT_SECS=$NUMBER
;;
esac
echo $WAIT_SECS
}
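# e.g. __to_seconds 30m prints 1800 and __to_seconds 2h prints 7200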
SEEDDIR=""
while [[ $# -gt 0 ]]
do
case $1 in
-i|--index)
INDEXFLAG=true
shift
;;
-D)
JAVA_PROPERTIES="-D${2} ${JAVA_PROPERTIES}"
shift 2
;;
-s)
SEEDDIR="${2}"
shift 2
;;
-w|--wait)
WAIT="${2}"
shift 2
;;
*)
break
;;
esac
done
if [[ $# -ne 2 ]]; then
echo "Usage: crawl [-i|--index] [-D \"key=value\"] [-w|--wait] [-s <Seed Dir>] <Crawl Dir> <Num Rounds>"
echo -e "\t-i|--index\tIndexes crawl results into a configured indexer"
echo -e "\t-D\t\tA Java property to pass to Nutch calls"
echo -e "\t-w|--wait\tNUMBER[SUFFIX] Time to wait before generating a new segment when no URLs"
echo -e "\t\t\tare scheduled for fetching. Suffix can be: s for second,"
echo -e "\t\t\tm for minute, h for hour and d for day. If no suffix is"
echo -e "\t\t\tspecified second is used by default."
echo -e "\t-s Seed Dir\tPath to seeds file(s)"
echo -e "\tCrawl Dir\tDirectory where the crawl/link/segments dirs are saved"
echo -e "\tNum Rounds\tThe number of rounds to run this crawl for"
exit 1
fi
CRAWL_PATH="$1"
LIMIT="$2"
# convert the wait time to seconds, since plain 'sleep' may not understand unit suffixes on all platforms
if [ "$WAIT" != "-1" ]; then
WAIT=$( __to_seconds "$WAIT" )
echo "Time to wait (--wait) = $WAIT sec."
fi
#############################################
# MODIFY THE PARAMETERS BELOW TO YOUR NEEDS #
#############################################
# set the number of slave nodes
numSlaves=1
# and the total number of available tasks
# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`
# number of URLs to fetch in one iteration
# (50K per slave node with the multiplier below)
sizeFetchlist=`expr $numSlaves \* 50000`
# time limit for fetching, in minutes (passed as fetcher.timelimit.mins)
timeLimitFetch=180
# num threads for fetching
numThreads=50
#############################################
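# with the defaults above (numSlaves=1) this yields numTasks=2 reduce tasks and a
# fetch list of 50000 URLs per round; these are only starting points, tune them per cluster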
bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"
# determine the run mode (local or distributed) based on the presence of a Nutch job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
if [ $(which hadoop | wc -l ) -eq 0 ]; then
echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
exit 1;
fi
fi
function __bin_nutch {
# run $bin/nutch, exit if exit value indicates error
echo "$bin/nutch $@" ;# echo command and arguments
"$bin/nutch" "$@"
RETCODE=$?
if [ $RETCODE -ne 0 ]
then
echo "Error running:"
echo " $bin/nutch $@"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
}
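# usage example (with a hypothetical seed directory):
#   __bin_nutch inject "$CRAWL_PATH"/crawldb urls/
# the wrapper echoes the full command and aborts the script on a non-zero exit code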
# initial injection
if [[ -n "$SEEDDIR" ]]
then
echo "Injecting seed URLs"
__bin_nutch inject "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi
# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
if [ -e ".STOP" ]
then
echo "STOP file found - escaping loop"
break
fi
if [ $LIMIT -ne -1 ]; then
if [ $a -gt $LIMIT ]; then
echo `date` ": Finished loop with $LIMIT iterations"
break
fi
echo `date` ": Iteration $a of $LIMIT"
else
echo `date` ": Iteration $a"
fi
echo "Generating a new segment"
generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $sizeFetchlist -numFetchers $numSlaves -noFilter)
echo "$bin/nutch generate ${generate_args[@]}"
$bin/nutch generate "${generate_args[@]}"
RETCODE=$?
if [ $RETCODE -eq 0 ]; then
: # ok: no error
elif [ $RETCODE -eq 1 ]; then
echo "Generate returned 1 (no new segments created)"
if [ "$WAIT" -ne -1 ]; then
echo "Waiting for $WAIT sec. ..."
sleep $WAIT
continue
else
echo "Escaping loop: no more URLs to fetch now"
break
fi
else
echo "Error running:"
echo " $bin/nutch generate ${generate_args[@]}"
echo "Failed with exit value $RETCODE."
exit $RETCODE
fi
# capture the name of the newly generated segment:
# list it with 'hadoop fs -ls' in distributed mode or with plain 'ls' in local mode
if [ $mode = "local" ]; then
SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
else
SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
fi
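# segment directories are named after their generation timestamp (yyyyMMddHHmmss),
# so a numeric sort puts the newest segment last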
echo "Operating on segment : $SEGMENT"
# fetching the segment
echo "Fetching : $SEGMENT"
__bin_nutch fetch $commonOptions -D fetcher.timelimit.mins=$timeLimitFetch "$CRAWL_PATH"/segments/$SEGMENT -noParsing -threads $numThreads
# parsing the segment
echo "Parsing : $SEGMENT"
# enable record skipping during parsing so that a single dodgy document
# does not fail the whole task
skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
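# roughly: skip mode kicks in after two failed attempts of a task, and at most one
# record around each bad record is dropped; see the Hadoop docs for the exact semantics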
__bin_nutch parse $commonOptions $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT
# updatedb with this segment
echo "CrawlDB update"
__bin_nutch updatedb $commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
# note that the link inversion and indexing routines can be done within the main loop
# on a per-segment basis
echo "Link inversion"
__bin_nutch invertlinks "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Dedup on crawldb"
__bin_nutch dedup "$CRAWL_PATH"/crawldb
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"
__bin_nutch index $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
echo "Cleaning up index if possible"
__bin_nutch clean $JAVA_PROPERTIES "$CRAWL_PATH"/crawldb
else
echo "Skipping indexing ..."
fi
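# the indexer backend itself is configured through Nutch indexer plugin properties,
# e.g. a Solr URL such as -D solr.server.url=http://localhost:8983/solr/nutch
# (placeholder URL) passed via this script's -D option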
#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph $commonOptions -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops $commonOptions -webgraphdb "$CRAWL_PATH"
#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank $commonOptions -webgraphdb "$CRAWL_PATH"
#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater $commonOptions -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper $commonOptions -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
done
exit 0