Merge pull request #521 from sebastian-nagel/NUTCH-2002-checkers-robotstxt
NUTCH-2002 parse and index checkers to check robots.txt
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \tshow the entire plain-text content,\n" //
+ " \tnot only the first 100 characters\n" //
+ " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
protected String forceAsContentType = null;
@@ -94,9 +95,11 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \talso show the plain-text extracted by parsers\n" //
+ " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
+ " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
// Print help when no args given
if (args.length < 1) {
System.err.println(usage);
@@ -109,6 +112,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-forceAs")) {
forceAsContentType = args[++i];
} else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* Scaffolding class for the various Checker implementations. Can process cmdline input, stdin and TCP connections.
*
@@ -188,10 +190,21 @@
}
}
- protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+ boolean checkRobotsTxt) throws Exception {
ProtocolFactory factory = new ProtocolFactory(getConf());
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
+ if (checkRobotsTxt) {
+ System.err.print("Checking robots.txt ...");
+ BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+ if (rules.isAllowed(url)) {
+ System.err.println(" (allowed)");
+ } else {
+ System.err.println("\nDenied by robots.txt: " + url);
+ return null;
+ }
+ }
return protocol.getProtocolOutput(turl, datum);
}