Merge pull request #525 from sebastian-nagel/NUTCH-1945
NUTCH-1945 Test for XLSX parser
diff --git a/build.xml b/build.xml
index 5eb157e..bc8d8fb 100644
--- a/build.xml
+++ b/build.xml
@@ -877,6 +877,11 @@
<fileset dir="runtime/local/plugins"/>
</copy>
+ <copy todir="${bin.dist.version.dir}/plugins">
+ <fileset dir="${plugins.dir}">
+ <include name="**/README.*" />
+ </fileset>
+ </copy>
</target>
<!-- ================================================================== -->
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbReader.java b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
index dfcc87a..603b2e3 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbReader.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbReader.java
@@ -979,7 +979,7 @@
if (args.length < 2) {
System.err.println(
- "Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
+ "Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url> | -listen <port>)");
System.err
.println("\t<crawldb>\tdirectory name where crawldb is located");
System.err
@@ -1002,6 +1002,10 @@
"\t\t[-sample <fraction>]\tOnly process a random sample with this ratio");
System.err
.println("\t-url <url>\tprint information on <url> to System.out");
+ System.err
+ .println("\t-listen <port> [-keepClientCnxOpen]\tlisten on <port> for URLs and");
+ System.err
+ .println("\t\t\tsend information about <url> back");
System.err.println(
"\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
System.err
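
Both CrawlDbReader and LinkDbReader (below) gain a -listen <port> mode, backed by AbstractChecker, that accepts URLs over a TCP connection and sends the matching record back. A minimal client sketch, assuming the checker reads one URL per line over the socket; the class name, host, and port here are illustrative, not part of the patch:

    import java.io.BufferedReader;
    import java.io.InputStreamReader;
    import java.io.PrintWriter;
    import java.net.Socket;

    // Hypothetical client for the -listen mode: send one URL per line and
    // print whatever the checker sends back. Without -keepClientCnxOpen the
    // server is expected to close the connection after a single request.
    public class CheckerClientSketch {
      public static void main(String[] args) throws Exception {
        try (Socket socket = new Socket("localhost", 8181);
            PrintWriter out = new PrintWriter(socket.getOutputStream(), true);
            BufferedReader in = new BufferedReader(
                new InputStreamReader(socket.getInputStream()))) {
          out.println("https://example.com/"); // URL to look up
          String line;
          while ((line = in.readLine()) != null) {
            System.out.println(line); // record sent back by the reader
          }
        }
      }
    }
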
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 5dcd2ea..04c2ae8 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -841,6 +841,14 @@
String.format(Locale.ROOT, "%6d", counter.getValue()),
counter.getName());
}
+ if (!getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
+ /*
+ * generated items are not marked in CrawlDb, and CrawlDb will not be
+ * accessed anymore: we can already release the lock
+ */
+ LockUtil.removeLockFile(getConf(), lock);
+ lock = null;
+ }
// read the subdirectories generated in the temp
// output and turn them into segments
@@ -858,15 +866,13 @@
}
} catch (Exception e) {
LOG.warn("Generator: exception while partitioning segments, exiting ...");
- LockUtil.removeLockFile(getConf(), lock);
- fs.delete(tempDir, true);
+ NutchJob.cleanupAfterFailure(tempDir, lock, fs);
return null;
}
if (generatedSegments.size() == 0) {
LOG.warn("Generator: 0 records selected for fetching, exiting ...");
- LockUtil.removeLockFile(getConf(), lock);
- fs.delete(tempDir, true);
+ NutchJob.cleanupAfterFailure(tempDir, lock, fs);
return null;
}
@@ -913,7 +919,9 @@
fs.delete(tempDir2, true);
}
- LockUtil.removeLockFile(getConf(), lock);
+ if (lock != null) {
+ LockUtil.removeLockFile(getConf(), lock);
+ }
fs.delete(tempDir, true);
long end = System.currentTimeMillis();
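
Taken together, the Generator hunks implement one lock lifecycle: release the CrawlDb lock early when generate.update.crawldb (the value behind GENERATE_UPDATE_CRAWLDB) is off, null the reference, and let every later path tolerate a null lock. A condensed sketch of that flow; the job and segment handling are elided and the wrapper method is illustrative:

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.util.LockUtil;
    import org.apache.nutch.util.NutchJob;

    // Condensed sketch of the Generator lock lifecycle after this patch.
    class LockLifecycleSketch {
      Path generate(Configuration conf, FileSystem fs, Path lock, Path tempDir)
          throws IOException {
        try {
          // ... run the selector job here ...
          if (!conf.getBoolean("generate.update.crawldb", false)) {
            // CrawlDb will not be accessed again: release the lock early
            LockUtil.removeLockFile(conf, lock);
            lock = null;
          }
          // ... partition the generated segments here ...
        } catch (Exception e) {
          // cleanupAfterFailure tolerates lock == null (see NutchJob below)
          NutchJob.cleanupAfterFailure(tempDir, lock, fs);
          return null;
        }
        if (lock != null) { // not released early above
          LockUtil.removeLockFile(conf, lock);
        }
        fs.delete(tempDir, true);
        return tempDir;
      }
    }
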
diff --git a/src/java/org/apache/nutch/crawl/LinkDbReader.java b/src/java/org/apache/nutch/crawl/LinkDbReader.java
index 5d422b4..4cacd81 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbReader.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbReader.java
@@ -225,13 +225,17 @@
public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err
- .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> [-regex <regex>]) | -url <url>");
+ .println("Usage: LinkDbReader <linkdb> (-dump <out_dir> [-regex <regex>] | -url <url> | -listen <port>)");
System.err
.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");
System.err
.println("\t\t-regex <regex>\trestrict to url's matching expression");
System.err
.println("\t-url <url>\tprint information about <url> to System.out");
+ System.err
+ .println("\t-listen <port> [-keepClientCnxOpen]\tlisten on <port> for URLs and");
+ System.err
+ .println("\t\t\tsend information about <url> back");
return -1;
}
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \tshow the entire plain-text content,\n" //"
+ " \tnot only the first 100 characters\n" //
+ " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
protected String forceAsContentType = null;
@@ -94,9 +95,11 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \talso show the plain-text extracted by parsers\n" //
+ " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
+ " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
// Print help when no args given
if (args.length < 1) {
System.err.println(usage);
@@ -109,6 +112,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-forceAs")) {
forceAsContentType = args[++i];
} else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/tools/FreeGenerator.java b/src/java/org/apache/nutch/tools/FreeGenerator.java
index 8c537d9..b51be74 100644
--- a/src/java/org/apache/nutch/tools/FreeGenerator.java
+++ b/src/java/org/apache/nutch/tools/FreeGenerator.java
@@ -146,27 +146,33 @@
@Override
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err
- .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
+ System.err.println(
+ "Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize] [-numFetchers <n>]");
System.err
.println("\tinputDir\tinput directory containing one or more input files.");
System.err
- .println("\t\tEach text file contains a list of URLs, one URL per line");
+ .println("\t \tEach text file contains a list of URLs, one URL per line");
System.err
.println("\tsegmentsDir\toutput directory, where new segment will be created");
- System.err.println("\t-filter\trun current URLFilters on input URLs");
+ System.err.println("\t-filter \trun current URLFilters on input URLs");
System.err
.println("\t-normalize\trun current URLNormalizers on input URLs");
+ System.err.println(
+ "\t-numFetchers <n>\tnumber of generated fetch lists, determines number of fetcher tasks");
return -1;
}
boolean filter = false;
boolean normalize = false;
+ int numFetchers = -1;
if (args.length > 2) {
for (int i = 2; i < args.length; i++) {
if (args[i].equals("-filter")) {
filter = true;
} else if (args[i].equals("-normalize")) {
normalize = true;
+ } else if ("-numFetchers".equals(args[i])) {
+ numFetchers = Integer.parseInt(args[i + 1]);
+ i++;
} else {
LOG.error("Unknown argument: " + args[i] + ", exiting ...");
return -1;
@@ -191,7 +197,17 @@
job.setPartitionerClass(URLPartitioner.class);
job.setReducerClass(FG.FGReducer.class);
String segName = Generator.generateSegmentName();
- job.setNumReduceTasks(Integer.parseInt(conf.get("mapreduce.job.maps")));
+ if (numFetchers == -1) {
+ /* for politeness create exactly one partition per fetch task */
+ numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
+ }
+ if ("local".equals(conf.get("mapreduce.framework.name"))
+ && numFetchers != 1) {
+ // override
+ LOG.info(
+ "FreeGenerator: running in local mode, generating exactly one partition.");
+ numFetchers = 1;
+ }
job.setOutputFormatClass(SequenceFileOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
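
FreeGenerator's new -numFetchers option controls how many fetch lists are generated, i.e. the number of reduce partitions; presumably the job applies it via setNumReduceTasks just after this hunk (that call falls outside the excerpt). A sketch of the resolution logic with the final call assumed:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;

    // Sketch: map the -numFetchers option to reduce tasks (one fetch list
    // per reduce partition). The setNumReduceTasks call is an assumption,
    // since the hunk above ends before the job is configured with it.
    class NumFetchersSketch {
      static void configure(Job job, Configuration conf, int numFetchers) {
        if (numFetchers == -1) { // not set on the command line
          // default: one partition per fetch task, for politeness
          numFetchers = Integer.parseInt(conf.get("mapreduce.job.maps"));
        }
        if ("local".equals(conf.get("mapreduce.framework.name"))
            && numFetchers != 1) {
          numFetchers = 1; // local mode generates exactly one partition
        }
        job.setNumReduceTasks(numFetchers); // assumed follow-up call
      }
    }
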
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* Scaffolding class for the various Checker implementations. Can process cmdline input, stdin and TCP connections.
*
@@ -188,10 +190,21 @@
}
}
- protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+ boolean checkRobotsTxt) throws Exception {
ProtocolFactory factory = new ProtocolFactory(getConf());
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
+ if (checkRobotsTxt) {
+ System.err.print("Checking robots.txt ...");
+ BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+ if (rules.isAllowed(url)) {
+ System.err.println(" (allowed)");
+ } else {
+ System.err.println("\nDenied by robots.txt: " + url);
+ return null;
+ }
+ }
return protocol.getProtocolOutput(turl, datum);
}
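
BaseRobotRules comes from crawler-commons; protocol.getRobotRules() fetches and parses the robots.txt under the hood. For reference, a standalone sketch of obtaining and querying rules directly (the parseContent signature shown matches crawler-commons releases contemporary with this patch; newer versions take a collection of agent names):

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    // Standalone sketch: parse a robots.txt body and query it, the same
    // check AbstractChecker performs on the rules returned by the protocol.
    public class RobotsCheckSketch {
      public static void main(String[] args) {
        byte[] content = "User-agent: *\nDisallow: /private/\n"
            .getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "https://example.com/robots.txt", content, "text/plain", "mybot");
        System.out.println(rules.isAllowed("https://example.com/"));          // true
        System.out.println(rules.isAllowed("https://example.com/private/x")); // false
      }
    }
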
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 991e506..13257d2 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -41,7 +41,15 @@
return Job.getInstance(conf);
}
- /*
+ /**
+ * Clean up the file system in case of a job failure.
+ */
+ public static void cleanupAfterFailure(Path tempDir, FileSystem fs)
+ throws IOException {
+ cleanupAfterFailure(tempDir, null, fs);
+ }
+
+ /**
* Clean up the file system in case of a job failure.
*/
public static void cleanupAfterFailure(Path tempDir, Path lock, FileSystem fs)
@@ -50,7 +58,9 @@
if (fs.exists(tempDir)) {
fs.delete(tempDir, true);
}
- LockUtil.removeLockFile(fs, lock);
+ if (lock != null) {
+ LockUtil.removeLockFile(fs, lock);
+ }
} catch (IOException e) {
LOG.error("NutchJob cleanup failed: {}", e.getMessage());
throw e;
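
With the null guard and the new overload, tools that hold no lock can call the two-argument form, and tools that released their lock early (as Generator now does) can pass null to the three-argument form. A usage sketch for a lock-free tool, assuming the usual driver structure:

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.nutch.util.NutchJob;

    // Sketch of a lock-free tool using the new two-argument overload.
    class CleanupUsageSketch {
      static void runTool(FileSystem fs, Path tempDir) throws Exception {
        try {
          // ... run the MapReduce job here ...
        } catch (Exception e) {
          NutchJob.cleanupAfterFailure(tempDir, fs); // no lock file to remove
          throw e;
        }
      }
    }
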