Merge pull request #486 from sebastian-nagel/NUTCH-2184-indexer-no-crawldb
NUTCH-2184 Enable IndexingJob to function with no crawldb
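Not part of the patch itself: a minimal sketch of how the new -nocrawldb mode can be driven from Java once this change is applied. The wrapper class name, segment path and chosen options below are illustrative, not taken from the repository; the rough command-line equivalent would be something like "bin/nutch index -nocrawldb <segment> -deleteGone".

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.util.NutchConfiguration;

public class IndexWithoutCrawlDb {            // hypothetical wrapper, for illustration only
  public static void main(String[] args) throws Exception {
    // -nocrawldb replaces the formerly mandatory CrawlDb path;
    // at least one indexable segment is still required.
    String[] indexerArgs = {
        "-nocrawldb",
        "crawl/segments/20240101000000",      // made-up segment path
        "-deleteGone"                         // optional: send deletion requests for 404s/redirects
    };
    int res = ToolRunner.run(NutchConfiguration.create(), new IndexingJob(), indexerArgs);
    System.exit(res);
  }
}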
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index fedfeb7..cfb6dea 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -50,6 +50,24 @@
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
+/**
+ * <p>
+ * This class is typically invoked from within
+ * {@link org.apache.nutch.indexer.IndexingJob} and handles all MapReduce
+ * functionality required when undertaking indexing.
+ * </p>
+ * <p>
+ * The actual indexing is performed by one or more indexing plugins which
+ * extend {@link org.apache.nutch.indexer.IndexWriter}.
+ * </p>
+ * <p>
+ * See
+ * {@link org.apache.nutch.indexer.IndexerMapReduce#initMRJob(Path, Path, Collection, Job, boolean)}
+ * for details on the specific data structures and parameters required for
+ * indexing.
+ * </p>
+ *
+ */
public class IndexerMapReduce extends Configured {
private static final Logger LOG = LoggerFactory
@@ -100,7 +118,7 @@
.normalize(url, URLNormalizers.SCOPE_INDEXER);
normalized = normalized.trim();
} catch (Exception e) {
- LOG.warn("Skipping " + url + ":" + e);
+ LOG.warn("Skipping {}: {}", url, e);
normalized = null;
}
}
@@ -418,12 +436,26 @@
public static void initMRJob(Path crawlDb, Path linkDb,
Collection<Path> segments, Job job, boolean addBinaryContent) throws IOException{
- LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
-
- if (linkDb != null)
- LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
-
Configuration conf = job.getConfiguration();
+
+ if (crawlDb != null) {
+ LOG.info("IndexerMapReduce: crawldb: {}", crawlDb);
+ Path currentCrawlDb = new Path(crawlDb, CrawlDb.CURRENT_NAME);
+ try {
+ if (currentCrawlDb.getFileSystem(conf).exists(currentCrawlDb)) {
+ FileInputFormat.addInputPath(job, currentCrawlDb);
+ } else {
+ LOG.warn(
+ "Ignoring crawlDb for indexing, no crawlDb found in path: {}",
+ crawlDb);
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to use crawlDb ({}) for indexing", crawlDb, e);
+ }
+ } else {
+ LOG.info("IndexerMapReduce: no crawldb provided for indexing");
+ }
+
for (final Path segment : segments) {
LOG.info("IndexerMapReduces: adding segment: {}", segment);
FileInputFormat.addInputPath(job, new Path(segment,
@@ -438,9 +470,8 @@
}
}
- FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
-
if (linkDb != null) {
+ LOG.info("IndexerMapReduce: linkdb: {}", linkDb);
Path currentLinkDb = new Path(linkDb, LinkDb.CURRENT_NAME);
try {
if (currentLinkDb.getFileSystem(conf).exists(currentLinkDb)) {
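For illustration only (not something this patch adds): with the changes above, IndexerMapReduce.initMRJob accepts a null crawlDb, so an indexing job can be wired up from segments alone. The wrapper class and segment path below are made up.

import java.util.Collection;
import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.nutch.indexer.IndexerMapReduce;
import org.apache.nutch.util.NutchConfiguration;

public class InitMRJobWithoutCrawlDb {        // hypothetical helper, for illustration only
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Job job = Job.getInstance(conf, "index-segments-only");
    Collection<Path> segments =
        Collections.singletonList(new Path("crawl/segments/20240101000000")); // made-up path

    // crawlDb == null, linkDb == null: only the segment data directories are added as
    // MapReduce input paths; initMRJob now logs "no crawldb provided for indexing"
    // instead of failing on the missing CrawlDb.
    IndexerMapReduce.initMRJob(null, null, segments, job, false);
  }
}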
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index 30c5504..e476adc 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -167,15 +167,59 @@
}
}
+ private static void usage() {
+ System.err.println(
+ "Usage: Indexer (<crawldb> | -nocrawldb) (<segment> ... | -dir <segments>) [general options]");
+ System.err.println("");
+ System.err.println("Index given segments using configured indexer plugins");
+ System.err.println("");
+ System.err.println(
+ "The CrawlDb is optional but it is required to send deletion requests for duplicates");
+ System.err.println(
+ "and to read the proper document score/boost/weight passed to the indexers.");
+ System.err.println("");
+ System.err.println("Required arguments:");
+ System.err.println("");
+ System.err.println("\t<crawldb>\tpath to CrawlDb, or");
+ System.err.println(
+ "\t-nocrawldb\tflag to indicate that no CrawlDb shall be used");
+ System.err.println("");
+ System.err.println("\t<segment> ...\tpath(s) to segment, or");
+ System.err.println("\t-dir <segments>\tpath to segments/ directory,");
+ System.err.println(
+ "\t \t(all subdirectories are read as segments)");
+ System.err.println("");
+ System.err.println("General options:");
+ System.err.println("\t");
+ System.err.println(
+ "\t-linkdb <linkdb>\tuse LinkDb to index anchor texts of incoming links");
+ System.err.println(
+ "\t-params k1=v1&k2=v2...\tparameters passed to indexer plugins");
+ System.err.println(
+ "\t \t(via property indexer.additional.params)");
+ System.err.println("");
+ System.err.println(
+ "\t-noCommit\tdo not call the commit method of indexer plugins");
+ System.err.println(
+ "\t-deleteGone\tsend deletion requests for 404s, redirects, duplicates");
+ System.err
+ .println("\t-filter \tskip documents with URL rejected by configured URL filters");
+ System.err.println("\t-normalize\tnormalize URLs before indexing");
+ System.err.println(
+ "\t-addBinaryContent\tindex raw/binary content in field `binaryContent`");
+ System.err.println("\t-base64 \tuse Base64 encoding for binary content");
+ System.err.println("");
+ }
+
public int run(String[] args) throws Exception {
- if (args.length < 2) {
- System.err
- //.println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize]");
- .println("Usage: Indexer <crawldb> [-linkdb <linkdb>] [-params k1=v1&k2=v2...] (<segment> ... | -dir <segments>) [-noCommit] [-deleteGone] [-filter] [-normalize] [-addBinaryContent] [-base64]");
+ if (args.length == 0) {
+ usage();
return -1;
}
- final Path crawlDb = new Path(args[0]);
+ Path crawlDb = null;
+ boolean noCrawlDb = false;
+
Path linkDb = null;
final List<Path> segments = new ArrayList<>();
@@ -188,10 +232,12 @@
boolean addBinaryContent = false;
boolean base64 = false;
- for (int i = 1; i < args.length; i++) {
+ for (int i = 0; i < args.length; i++) {
FileSystem fs = null;
Path dir = null;
- if (args[i].equals("-linkdb")) {
+ if (args[i].equals("-nocrawldb")) {
+ noCrawlDb = true;
+ } else if (args[i].equals("-linkdb")) {
linkDb = new Path(args[++i]);
} else if (args[i].equals("-dir")) {
dir = new Path(args[++i]);
@@ -218,7 +264,14 @@
base64 = true;
} else if (args[i].equals("-params")) {
params = args[++i];
+ } else if (crawlDb == null && !noCrawlDb) {
+ /*
+ * expect CrawlDb as first non-option argument unless -nocrawldb is
+ * given
+ */
+ crawlDb = new Path(args[i]);
} else {
+ // remaining arguments are segments
dir = new Path(args[i]);
fs = dir.getFileSystem(getConf());
if (SegmentChecker.isIndexable(dir,fs)) {
@@ -227,6 +280,12 @@
}
}
+ if (segments.size() == 0) {
+ usage();
+ System.err.println("No indexable segments passed as arguments. At least one segment is required!");
+ return -1;
+ }
+
try {
index(crawlDb, linkDb, segments, noCommit, deleteGone, params, filter, normalize, addBinaryContent, base64);
return 0;
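To complement the -nocrawldb sketch above, a hedged example of the classic invocation that still supplies a CrawlDb together with the optional LinkDb and plugin parameters described in usage(); every path and parameter value here is illustrative, not taken from the patch.

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.IndexingJob;
import org.apache.nutch.util.NutchConfiguration;

public class IndexWithCrawlDb {               // hypothetical wrapper, for illustration only
  public static void main(String[] args) throws Exception {
    String[] indexerArgs = {
        "crawl/crawldb",                      // first non-option argument is taken as the CrawlDb
        "-linkdb", "crawl/linkdb",            // index anchor texts of incoming links
        "-params", "someKey=someValue",       // made-up parameter, passed via indexer.additional.params
        "crawl/segments/20240101000000",      // at least one segment is still mandatory
        "-noCommit"                           // skip the commit step of the indexer plugins
    };
    System.exit(ToolRunner.run(NutchConfiguration.create(), new IndexingJob(), indexerArgs));
  }
}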