Add possibility to setup deduplication group mode in crawl script (#557)
diff --git a/src/bin/crawl b/src/bin/crawl
index 23a2940..db42218 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -48,6 +48,8 @@
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
# --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50]
#
+# --dedup-group <none|host|domain>      Deduplication group method [default: none]
+#
function __to_seconds() {
NUMBER=$(echo $1 | tr -dc '0-9')
@@ -107,6 +109,7 @@
echo -e " \t\t\t\t\t - never [default]"
echo -e " \t\t\t\t\t - always (processing takes place in every iteration)"
echo -e " \t\t\t\t\t - once (processing only takes place in the first iteration)"
+  echo -e "  --dedup-group <none|host|domain>\tDeduplication group method [default: none]"
exit 1
}
@@ -124,6 +127,7 @@
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
+DEDUP_GROUP=none
while [[ $# > 0 ]]
do
@@ -177,6 +181,10 @@
SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
shift 2
;;
+ --dedup-group)
+ DEDUP_GROUP="${2}"
+ shift 2
+ ;;
--hostdbupdate)
HOSTDBUPDATE=true
shift
@@ -197,6 +205,12 @@
__print_usage
fi
+if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
+ echo "Error: --dedup-group <mode> has to be one of none, host, domain."
+ echo -e ""
+ __print_usage
+fi
+
if [[ $# != 2 ]]; then
__print_usage
fi
@@ -385,7 +399,7 @@
__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter
echo "Dedup on crawldb"
- __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+ __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"