Merge pull request #521 from sebastian-nagel/NUTCH-2002-checkers-robotstxt
NUTCH-2002 parse and index checkers to check robots.txt
diff --git a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
index 4f849a0..84d9f6d 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
@@ -58,6 +58,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
protected boolean doIndex = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
@@ -82,6 +83,7 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \tshow the entire plain-text content,\n" //
+ " \tnot only the first 100 characters\n" //
+ " -doIndex \tpass document to configured index writers\n" //
@@ -103,6 +105,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-doIndex")) {
@@ -164,13 +168,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -182,10 +188,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/parse/ParserChecker.java b/src/java/org/apache/nutch/parse/ParserChecker.java
index 2a976ba..4dbfcfa 100644
--- a/src/java/org/apache/nutch/parse/ParserChecker.java
+++ b/src/java/org/apache/nutch/parse/ParserChecker.java
@@ -69,6 +69,7 @@
protected URLNormalizers normalizers = null;
protected boolean dumpText = false;
protected boolean followRedirects = false;
+ protected boolean checkRobotsTxt = false;
// used to simulate the metadata propagated from injection
protected HashMap<String, String> metadata = new HashMap<>();
protected String forceAsContentType = null;
@@ -94,9 +95,11 @@
+ " \t before other command-specific options)\n"
+ " -normalize \tnormalize URLs\n" //
+ " -followRedirects\tfollow redirects when fetching URL\n" //
+ + " -checkRobotsTxt\tfail if the robots.txt disallows fetching\n" //
+ " -dumpText \talso show the plain-text extracted by parsers\n" //
+ " -forceAs <mimeType>\tforce parsing as <mimeType>\n" //
+ " -md <key>=<value>\tmetadata added to CrawlDatum before parsing\n";
+
// Print help when no args given
if (args.length < 1) {
System.err.println(usage);
@@ -109,6 +112,8 @@
normalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
} else if (args[i].equals("-followRedirects")) {
followRedirects = true;
+ } else if (args[i].equals("-checkRobotsTxt")) {
+ checkRobotsTxt = true;
} else if (args[i].equals("-forceAs")) {
forceAsContentType = args[++i];
} else if (args[i].equals("-dumpText")) {
@@ -172,13 +177,15 @@
}
}
- ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
+ ProtocolOutput protocolOutput = getProtocolOutput(url, datum,
+ checkRobotsTxt);
Text turl = new Text(url);
-
+
// Following redirects and not reached maxRedirects?
int numRedirects = 0;
- while (!protocolOutput.getStatus().isSuccess() && followRedirects
- && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
+ while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
+ && followRedirects && protocolOutput.getStatus().isRedirect()
+ && maxRedirects >= numRedirects) {
String[] stuff = protocolOutput.getStatus().getArgs();
url = stuff[0];
LOG.info("Follow redirect to {}", url);
@@ -190,10 +197,15 @@
turl.set(url);
// try again
- protocolOutput = getProtocolOutput(url, datum);
+ protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
numRedirects++;
}
+ if (checkRobotsTxt && protocolOutput == null) {
+ System.err.println("Fetch disallowed by robots.txt");
+ return -1;
+ }
+
if (!protocolOutput.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: "
+ protocolOutput.getStatus());
diff --git a/src/java/org/apache/nutch/util/AbstractChecker.java b/src/java/org/apache/nutch/util/AbstractChecker.java
index b41bbc9..616e3dd 100644
--- a/src/java/org/apache/nutch/util/AbstractChecker.java
+++ b/src/java/org/apache/nutch/util/AbstractChecker.java
@@ -36,6 +36,8 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import crawlercommons.robots.BaseRobotRules;
+
/**
* Scaffolding class for the various Checker implementations. Can process cmdline input, stdin and TCP connections.
*
@@ -188,10 +190,21 @@
}
}
- protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
+ protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum,
+ boolean checkRobotsTxt) throws Exception {
ProtocolFactory factory = new ProtocolFactory(getConf());
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
+ if (checkRobotsTxt) {
+ System.err.print("Checking robots.txt ...");
+ BaseRobotRules rules = protocol.getRobotRules(turl, datum, null);
+ if (rules.isAllowed(url)) {
+ System.err.println(" (allowed)");
+ } else {
+ System.err.println("\nDenied by robots.txt: " + url);
+ return null;
+ }
+ }
return protocol.getProtocolOutput(turl, datum);
}