Merge pull request #537 from sebastian-nagel/NUTCH-2801-robots-checker
[NUTCH-2801] RobotRulesParser command-line checker to use http.robots.agents as fall-back
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 159f34f..2cb52a6 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -133,7 +133,7 @@
String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
if (confWhiteList == null) {
- LOG.info("robots.txt whitelist not configured.");
+ LOG.debug("robots.txt whitelist not configured.");
}
else {
for (int i = 0; i < confWhiteList.length; i++) {
@@ -262,14 +262,16 @@
"",
"<agent-names>\tcomma-separated list of agent names",
"\tused to select rules from the robots.txt file.",
- "\tIf no agent name is given the property http.agent.name is used.",
- "\tIf http.agent.name is empty, robots.txt is checked for rules",
- "\tassigned to the user agent `*' (meaning any other).",
+ "\tIf no agent name is given the properties http.agent.name",
+ "\tand http.robots.agents are used.",
+ "\tIf also http.agent.name and http.robots.agents are empty,",
+ "\trobots.txt is checked for rules assigned to the user",
+ "\tagent `*' (meaning any other).",
"",
"Important properties:",
" -D fetcher.store.robotstxt=true",
"\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
- " -D http.agent.name=...\tsame as argument <agent-names>",
+ " -D http.agent.name=...\t(primary) agent name",
" -D http.robots.agents=...\tadditional agent names",
" -D http.robot.rules.whitelist=..."};
for (String s : help) {
@@ -315,7 +317,8 @@
if (args.length > 2) {
// set agent name from command-line in configuration and update parser
String agents = args[2];
- conf.set("http.agent.name", agents);
+ conf.set("http.robots.agents", agents);
+ conf.set("http.agent.name", agents.split(",")[0]);
setConf(conf);
}
@@ -376,13 +379,24 @@
*/
private static class TestRobotRulesParser extends RobotRulesParser {
- public TestRobotRulesParser(Configuration conf) {
- // make sure that agent name is set so that setConf() does not complain,
- // the agent name is later overwritten by command-line argument
- if (conf.get("http.agent.name") == null) {
- conf.set("http.agent.name", "*");
+ @Override
+ public void setConf(Configuration conf) {
+ /*
+ * Make sure that the agent name is not empty so that
+ * RobotRulesParser.setConf() does not complain. Agent names passed on
+ * the command line are applied in RobotRulesParser.run(...), which also
+ * fills http.agent.name with the first of them.
+ */
+ if (conf.get("http.agent.name", "").isEmpty()) {
+ // Limit -1 keeps empty tokens: split() never returns an empty array,
+ // so a value such as "," yields "" instead of failing on index 0.
+ String firstRobotsAgent =
+ conf.get("http.robots.agents", "").split(",", -1)[0].trim();
+ // No usable fall-back name: use "*", the agent meaning any other.
+ conf.set("http.agent.name",
+ firstRobotsAgent.isEmpty() ? "*" : firstRobotsAgent);
+ }
- setConf(conf);
+ super.setConf(conf);
}
/**
@@ -407,7 +421,7 @@
openStream.read(robotsBytes);
openStream.close();
rules = robotParser.parseContent(url.toString(), robotsBytes,
- "text/plain", this.conf.get("http.agent.name"));
+ "text/plain", agentNames);
} catch (IOException e) {
LOG.error("Failed to open robots.txt file " + url
+ StringUtils.stringifyException(e));
@@ -421,7 +435,7 @@
public static void main(String[] args) throws Exception {
Configuration conf = NutchConfiguration.create();
- int res = ToolRunner.run(conf, new TestRobotRulesParser(conf), args);
+ int res = ToolRunner.run(conf, new TestRobotRulesParser(), args);
System.exit(res);
}