NUTCH-2803 Rename property http.robot.rules.whitelist
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6932eb5..f9737ac 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -118,7 +118,7 @@
</property>
<property>
- <name>http.robot.rules.whitelist</name>
+ <name>http.robot.rules.allowlist</name>
<value></value>
<description>Comma separated list of hostnames or IP addresses to ignore
robot rules parsing for. Use with care and only if you are explicitly
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 159f34f..a836cab 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -86,7 +86,7 @@
protected String agentNames;
/** set of host names or IPs to be explicitly excluded from robots.txt checking */
- protected Set<String> whiteList = new HashSet<>();
+ protected Set<String> allowList = new HashSet<>();
/* Matcher user for efficiently matching URLs against a set of suffixes. */
private SuffixStringMatcher matcher = null;
@@ -131,22 +131,22 @@
agentNames = sb.toString();
}
- String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
- if (confWhiteList == null) {
- LOG.info("robots.txt whitelist not configured.");
+ String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
+ if (confAllowList == null) {
+ LOG.info("robots.txt allowlist not configured.");
}
else {
- for (int i = 0; i < confWhiteList.length; i++) {
- if (confWhiteList[i].isEmpty()) {
- LOG.info("Empty whitelisted URL skipped!");
+ for (int i = 0; i < confAllowList.length; i++) {
+ if (confAllowList[i].isEmpty()) {
+ LOG.info("Empty allowlisted URL skipped!");
continue;
}
- whiteList.add(confWhiteList[i]);
+ allowList.add(confAllowList[i]);
}
- if (whiteList.size() > 0) {
- matcher = new SuffixStringMatcher(whiteList);
- LOG.info("Whitelisted hosts: " + whiteList);
+ if (allowList.size() > 0) {
+ matcher = new SuffixStringMatcher(allowList);
+ LOG.info("Allowlisted hosts: " + allowList);
}
}
}
@@ -159,9 +159,9 @@
}
/**
- * Check whether a URL belongs to a whitelisted host.
+ * Check whether a URL belongs to an allowlisted host.
*/
- public boolean isWhiteListed(URL url) {
+ public boolean isAllowListed(URL url) {
boolean match = false;
String urlString = url.getHost();
@@ -271,7 +271,7 @@
"\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
" -D http.agent.name=...\tsame as argument <agent-names>",
" -D http.robots.agents=...\tadditional agent names",
- " -D http.robot.rules.whitelist=..."};
+ " -D http.robot.rules.allowlist=..."};
for (String s : help) {
System.err.println(s);
}
@@ -347,8 +347,8 @@
// testPath can be just a path or a complete URL
URL url = new URL(testPath);
String status;
- if (isWhiteListed(url)) {
- status = "whitelisted";
+ if (isAllowListed(url)) {
+ status = "allowlisted";
} else if (rules.isAllowed(testPath)) {
status = "allowed";
} else {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index f761bd0..34277b0 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -96,8 +96,8 @@
public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
List<Content> robotsTxtContent) {
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ if (LOG.isTraceEnabled() && isAllowListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
}
String cacheKey = getCacheKey(url);
@@ -112,12 +112,12 @@
boolean cacheRule = true;
URL redir = null;
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
+ if (isAllowListed(url)) {
+ // check in advance whether a host is allowlisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+ LOG.info("Allowlisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}",
url.getHost());
} else {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
index b28d021..45a3da4 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -78,8 +78,8 @@
// case
String host = url.getHost().toLowerCase(); // normalize to lower case
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ if (LOG.isTraceEnabled() && isAllowListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
}
BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
@@ -92,12 +92,12 @@
boolean cacheRule = true;
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
+ if (isAllowListed(url)) {
+ // check in advance whether a host is allowlisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+ LOG.info("Allowlisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}", host);
} else {
try {