| #!/bin/bash |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one or more |
| # contributor license agreements. See the NOTICE file distributed with |
| # this work for additional information regarding copyright ownership. |
| # The ASF licenses this file to You under the Apache License, Version 2.0 |
| # (the "License"); you may not use this file except in compliance with |
| # the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| # Usage: crawl [options] <crawl_dir> <num_rounds> |
| # |
| # Arguments: |
| # <crawl_dir> Directory where the crawl/host/link/segments dirs are saved |
| # <num_rounds> The number of rounds to run this crawl for |
| # |
| # Options: |
| # -i|--index Indexes crawl results into a configured indexer |
| # -D <propery>=<value> A Nutch or Hadoop property to pass to Nutch calls overwriting |
| # properties defined in configuration files, e.g. |
| # increase content limit to 2MB: |
| # -D http.content.limit=2097152 |
| # (in distributed mode) configure memory of map and reduce tasks: |
| # -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m |
| # -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m |
| # -w|--wait <NUMBER[SUFFIX]> Time to wait before generating a new segment when no URLs |
| # are scheduled for fetching. Suffix can be: s for second, |
| # m for minute, h for hour and d for day. If no suffix is |
| # specified second is used by default. [default: -1] |
| # -s <seed_dir> Path to seeds file(s) |
| # -sm <sitemap_dir> Path to sitemap URL file(s) |
| # |
| # --hostdbupdate Boolean indicator if we call hostdbupdate or not |
| # --hostdbgenerate Boolean indicator if we use hostdb in generate or not |
| # |
| # --num-fetchers <num_fetchers> Number of tasks used for fetching (fetcher map tasks) [default: 1] |
| # Note: This can only be set when running in distributed mode and |
| # should correspond to the number of worker nodes in the cluster. |
| # --num-tasks <num_tasks> Number of reducer tasks [default: 2] |
| # --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000] |
| # --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180] |
| # --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50] |
| # |
| # -dedup-group <none|host|domain> Deduplication group method [default: none] |
| # |
| |
function __to_seconds() {
  NUMBER=$(echo "$1" | tr -dc '0-9')
  MODIFIER=$(echo "$1" | tr -dc 'smhdSMHD')

  case $MODIFIER in
      m|M)
        SECONDS=$((NUMBER * 60))
        ;;
      h|H)
        SECONDS=$((NUMBER * 3600))
        ;;
      d|D)
        SECONDS=$((NUMBER * 86400))
        ;;
      s|S|*)
        SECONDS=$NUMBER
        ;;
  esac

  echo $SECONDS
}
| |
| function __print_usage { |
| echo "Usage: crawl [options] <crawl_dir> <num_rounds>" |
| echo -e "" |
| echo -e "Arguments:" |
| echo -e " <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved" |
| echo -e " <num_rounds>\t\t\t\tThe number of rounds to run this crawl for" |
| echo -e "" |
| echo -e "Options:" |
| echo -e " -i|--index\t\t\t\tIndexes crawl results into a configured indexer" |
| echo -e " -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting" |
| echo -e " \t\t\t\t\tproperties defined in configuration files, e.g." |
| echo -e " \t\t\t\t\tincrease content limit to 2MB:" |
| echo -e " \t\t\t\t\t -D http.content.limit=2097152" |
| echo -e " \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:" |
| echo -e " \t\t\t\t\t -D mapreduce.map.memory.mb=4608 -D mapreduce.map.java.opts=-Xmx4096m" |
| echo -e " \t\t\t\t\t -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m" |
| echo -e " -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs" |
| echo -e " \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second," |
| echo -e " \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is" |
| echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]" |
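  echo -e " \t\t\t\t\te.g. -w 10m waits ten minutes before the next generate attempt"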
| echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)" |
| echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)" |
  echo -e " --hostdbupdate\t\t\tUpdate the HostDB after each crawl round"
  echo -e " --hostdbgenerate\t\t\tUse the HostDB when generating new fetch lists"
| echo -e " --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]" |
| echo -e " \t\t\t\t\tNote: This can only be set when running in distributed mode and" |
| echo -e " \t\t\t\t\t should correspond to the number of worker nodes in the cluster." |
| echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]" |
| echo -e " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]" |
| echo -e " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]" |
| echo -e " --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]" |
| echo -e " --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB." |
| echo -e " \t\t\t\t\tSupported values are:" |
| echo -e " \t\t\t\t\t - never [default]" |
| echo -e " \t\t\t\t\t - always (processing takes place in every iteration)" |
| echo -e " \t\t\t\t\t - once (processing only takes place in the first iteration)" |
  echo -e " --dedup-group <none|host|domain>\tDeduplication group method [default: none]"
| |
| exit 1 |
| } |
| |
| # default values |
| INDEXFLAG=false |
| HOSTDBUPDATE=false |
| HOSTDBGENERATE=false |
| HADOOP_PROPERTIES=() |
| WAIT=-1 # don't wait if there are no URLs to fetch |
| SEEDDIR="" |
| NUM_FETCHERS=1 |
| NUM_TASKS=2 # 2 x NUM_FETCHERS |
| SIZE_FETCHLIST=50000 # 25K x NUM_TASKS |
| TIME_LIMIT_FETCH=180 |
| NUM_THREADS=50 |
| SITEMAPS_FROM_HOSTDB_FREQUENCY=never |
| DEDUP_GROUP=none |
| |
while [[ $# -gt 0 ]]
| do |
| case $1 in |
| -i|--index) |
| INDEXFLAG=true |
| shift |
| ;; |
| -D) |
| HADOOP_PROPERTIES=("${HADOOP_PROPERTIES[@]}" -D"${2}") |
| shift 2 |
| ;; |
| -s) |
| SEEDDIR="${2}" |
| shift 2 |
| ;; |
| -sm) |
| SITEMAPDIR="${2}" |
| shift 2 |
| ;; |
| -w|--wait) |
| WAIT="${2}" |
| shift 2 |
| ;; |
| --num-slaves) |
| # back-ward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers |
| NUM_FETCHERS="${2}" |
| shift 2 |
| ;; |
| --num-fetchers) |
| NUM_FETCHERS="${2}" |
| shift 2 |
| ;; |
| --num-tasks) |
| NUM_TASKS="${2}" |
| shift 2 |
| ;; |
| --size-fetchlist) |
| SIZE_FETCHLIST="${2}" |
| shift 2 |
| ;; |
| --time-limit-fetch) |
| TIME_LIMIT_FETCH="${2}" |
| shift 2 |
| ;; |
| --num-threads) |
| NUM_THREADS="${2}" |
| shift 2 |
| ;; |
| --sitemaps-from-hostdb) |
| SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}" |
| shift 2 |
| ;; |
| --dedup-group) |
| DEDUP_GROUP="${2}" |
| shift 2 |
| ;; |
| --hostdbupdate) |
| HOSTDBUPDATE=true |
| shift |
| ;; |
| --hostdbgenerate) |
| HOSTDBGENERATE=true |
| shift |
| ;; |
| *) |
| break |
| ;; |
| esac |
| done |
| |
| if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then |
| echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once." |
| echo -e "" |
| __print_usage |
| fi |
| |
| if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then |
| echo "Error: --dedup-group <mode> has to be one of none, host, domain." |
| echo -e "" |
| __print_usage |
| fi |
| |
| if [[ $# != 2 ]]; then |
| __print_usage |
| fi |
| |
| CRAWL_PATH="$1" |
| LIMIT="$2" |
| |
| # convert wait time to seconds for compatibility reasons |
| if [ "$WAIT" != "-1" ]; then |
| WAIT=$( __to_seconds "$WAIT" ) |
| echo "Time to wait (--wait) = $WAIT sec." |
| fi |
| |
| bin="`dirname "$0"`" |
| bin="`cd "$bin"; pwd`" |
| |
| # determines whether mode based on presence of job file |
| mode=local |
| if [ -f "${bin}"/../*nutch*.job ]; then |
| mode=distributed |
| fi |
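# local mode runs all jobs in a single local JVM via bin/nutch;
# distributed mode submits the *nutch*.job file to the Hadoop cluster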
| if [[ "$mode" = "local" ]]; then |
| if [[ "$NUM_FETCHERS" -ne 1 ]]; then |
    echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
| fi |
| NUM_FETCHERS=1 |
| fi |
| |
| # note that some of the options listed here could be set in the |
| # corresponding hadoop site xml param file |
| commonOptions=("${HADOOP_PROPERTIES[@]}" -Dmapreduce.job.reduces=$NUM_TASKS -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true) |
| |
| # check that hadoop can be found on the path |
| if [ $mode = "distributed" ]; then |
  if ! command -v hadoop > /dev/null; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    exit 1
  fi
| fi |
| |
| |
| function __bin_nutch { |
| # run $bin/nutch, exit if exit value indicates error |
| |
| echo "$bin/nutch $@" ;# echo command and arguments |
| "$bin/nutch" "$@" |
| |
| RETCODE=$? |
| if [ $RETCODE -ne 0 ] |
| then |
| echo "Error running:" |
| echo " $bin/nutch $@" |
| echo "Failed with exit value $RETCODE." |
| exit $RETCODE |
| fi |
| } |
| |
| # check if directory exists locally or on hdfs |
| function __directory_exists { |
| if [[ "$mode" == local && -d "$1" ]]; then |
| return 0 |
| elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then |
| return 0 |
| else |
| return 1 |
| fi |
| } |
| |
| function __update_hostdb { |
| if __directory_exists "$CRAWL_PATH"/crawldb; then |
| echo "Updating HostDB" |
| __bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb |
| fi |
| } |
| |
| # initial injection |
| if [[ ! -z $SEEDDIR ]]; then |
| echo "Injecting seed URLs" |
| __bin_nutch inject "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR" |
| fi |
| |
| # sitemap processing based on sitemap definition file(s) |
| if [[ ! -z $SITEMAPDIR ]]; then |
| echo "Processing sitemaps defined in $SITEMAPDIR" |
| __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS |
| fi |
| |
| # main loop : rounds of generate - fetch - parse - update |
| for ((a=1; ; a++)) |
| do |
| if [ -e ".STOP" ]; then |
| echo "STOP file found - escaping loop" |
| break |
| fi |
| |
| if [ $LIMIT -ne -1 ]; then |
| if [ $a -gt $LIMIT ]; then |
| echo `date` ": Finished loop with $LIMIT iterations" |
| break |
| fi |
| echo `date` ": Iteration $a of $LIMIT" |
| else |
| echo `date` ": Iteration $a" |
| fi |
| |
| if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then |
| # create / update HostDB on first run |
| [[ $a -eq 1 ]] && __update_hostdb |
| |
| # sitemap processing based on HostDB |
| if __directory_exists "$CRAWL_PATH"/hostdb; then |
| echo "Processing sitemaps based on hosts in HostDB" |
| __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS |
| fi |
| fi |
| |
| echo "Generating a new segment" |
| if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then |
| generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb) |
| else |
| generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter) |
| fi |
| |
| echo "$bin/nutch generate ${generate_args[@]}" |
| $bin/nutch generate "${generate_args[@]}" |
| RETCODE=$? |
| if [ $RETCODE -eq 0 ]; then |
| : # ok: no error |
| elif [ $RETCODE -eq 1 ]; then |
| echo "Generate returned 1 (no new segments created)" |
| |
| if [ "$WAIT" -ne -1 ]; then |
| echo "Waiting for $WAIT sec. ..." |
| sleep $WAIT |
| continue |
| else |
| echo "Escaping loop: no more URLs to fetch now" |
| break |
| fi |
| else |
| echo "Error running:" |
| echo " $bin/nutch generate ${generate_args[@]}" |
| echo "Failed with exit value $RETCODE." |
| exit $RETCODE |
| fi |
| |
| # capture the name of the segment |
| # call hadoop in distributed mode |
| # or use ls |
| |
| if [ $mode = "local" ]; then |
| SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1` |
| else |
| SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments | sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1` |
| fi |
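  # $SEGMENT now holds the name of the newest segment; segments are named by
  # their creation timestamp (e.g. 20240101123456)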
| |
| echo "Operating on segment : $SEGMENT" |
| |
| # fetching the segment |
| echo "Fetching : $SEGMENT" |
| __bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS |
| |
| # parsing the segment |
| echo "Parsing : $SEGMENT" |
| # enable the skipping of records for the parsing so that a dodgy document |
| # so that it does not fail the full task |
| skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1" |
| __bin_nutch parse "${commonOptions[@]}" $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT |
| |
| # updatedb with this segment |
| echo "CrawlDB update" |
| __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT |
| |
  if $HOSTDBUPDATE; then
    echo "HostDB update"
    __update_hostdb
  fi
| |
| # Note that all steps below in this loop (link inversion, deduplication, indexing) |
| # can be done |
| # - either inside the loop on a per segment basis |
| # - or after the loop over all segments created in all loop iterations |
| # (both invertlinks and index accept multiple segments as input) |
| # The latter is more efficient but the index is then updated later. |
| echo "Link inversion" |
| __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter |
| |
| echo "Dedup on crawldb" |
| __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP" |
| |
| if $INDEXFLAG; then |
| echo "Indexing $SEGMENT to index" |
| __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone |
| else |
| echo "Skipping indexing ..." |
| fi |
| |
| done |
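
#######################################################
# Alternative to the per-segment invertlinks / dedup / index
# calls inside the loop above: run them once over all segments
# after the loop has finished. This is only a sketch; it assumes
# the -dir option of invertlinks and index, and the per-segment
# calls inside the loop should be removed if it is enabled.
#######################################################
#echo "Link inversion on all segments"
#__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb -dir "$CRAWL_PATH"/segments -noNormalize -noFilter

#echo "Dedup on crawldb"
#__bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"

#if $INDEXFLAG; then
#  echo "Indexing all segments"
#  __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb -dir "$CRAWL_PATH"/segments -deleteGone
#fi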
| |
| ####################################################### |
| # The following commands fall into WebGraph territory |
| # and should be uncommented based on your requirements |
| ####################################################### |
| #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/" |
| #__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH" |
| |
| #echo "Running Loops Job on WebGraph within $CRAWL_PATH" |
| #__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" |
| |
| #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH" |
| #__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH" |
| |
| #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH" |
| #__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH" |
| |
| #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores" |
| #__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores |
| |
| exit 0 |