NUTCH-2803 Rename property http.robot.rules.whitelist
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6932eb5..f9737ac 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -118,7 +118,7 @@
</property>
<property>
- <name>http.robot.rules.whitelist</name>
+ <name>http.robot.rules.allowlist</name>
<value></value>
<description>Comma separated list of hostnames or IP addresses to ignore
robot rules parsing for. Use with care and only if you are explicitly
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 159f34f..a836cab 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -86,7 +86,7 @@
protected String agentNames;
/** set of host names or IPs to be explicitly excluded from robots.txt checking */
- protected Set<String> whiteList = new HashSet<>();
+ protected Set<String> allowList = new HashSet<>();
/* Matcher user for efficiently matching URLs against a set of suffixes. */
private SuffixStringMatcher matcher = null;
@@ -131,22 +131,22 @@
agentNames = sb.toString();
}
- String[] confWhiteList = conf.getStrings("http.robot.rules.whitelist");
- if (confWhiteList == null) {
- LOG.info("robots.txt whitelist not configured.");
+ String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
+ if (confAllowList == null) {
+ LOG.info("robots.txt allowlist not configured.");
}
else {
- for (int i = 0; i < confWhiteList.length; i++) {
- if (confWhiteList[i].isEmpty()) {
- LOG.info("Empty whitelisted URL skipped!");
+ for (int i = 0; i < confAllowList.length; i++) {
+ if (confAllowList[i].isEmpty()) {
+ LOG.info("Empty allowlisted URL skipped!");
continue;
}
- whiteList.add(confWhiteList[i]);
+ allowList.add(confAllowList[i]);
}
- if (whiteList.size() > 0) {
- matcher = new SuffixStringMatcher(whiteList);
- LOG.info("Whitelisted hosts: " + whiteList);
+ if (allowList.size() > 0) {
+ matcher = new SuffixStringMatcher(allowList);
+ LOG.info("Allowlisted hosts: " + allowList);
}
}
}
@@ -159,9 +159,9 @@
}
/**
- * Check whether a URL belongs to a whitelisted host.
+ * Check whether a URL belongs to an allowlisted host.
*/
- public boolean isWhiteListed(URL url) {
+ public boolean isAllowListed(URL url) {
boolean match = false;
String urlString = url.getHost();
@@ -271,7 +271,7 @@
"\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
" -D http.agent.name=...\tsame as argument <agent-names>",
" -D http.robots.agents=...\tadditional agent names",
- " -D http.robot.rules.whitelist=..."};
+ " -D http.robot.rules.allowlist=..."};
for (String s : help) {
System.err.println(s);
}
@@ -347,8 +347,8 @@
// testPath can be just a path or a complete URL
URL url = new URL(testPath);
String status;
- if (isWhiteListed(url)) {
- status = "whitelisted";
+ if (isAllowListed(url)) {
+ status = "allowlisted";
} else if (rules.isAllowed(testPath)) {
status = "allowed";
} else {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index f761bd0..34277b0 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -96,8 +96,8 @@
public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
List<Content> robotsTxtContent) {
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ if (LOG.isTraceEnabled() && isAllowListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
}
String cacheKey = getCacheKey(url);
@@ -112,12 +112,12 @@
boolean cacheRule = true;
URL redir = null;
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
+ if (isAllowListed(url)) {
+ // check in advance whether a host is allowlisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
+ LOG.info("Allowlisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}",
url.getHost());
} else {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
index b28d021..45a3da4 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -78,8 +78,8 @@
// case
String host = url.getHost().toLowerCase(); // normalize to lower case
- if (LOG.isTraceEnabled() && isWhiteListed(url)) {
- LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ if (LOG.isTraceEnabled() && isAllowListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
}
BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
@@ -92,12 +92,12 @@
boolean cacheRule = true;
- if (isWhiteListed(url)) {
- // check in advance whether a host is whitelisted
+ if (isAllowListed(url)) {
+ // check in advance whether a host is allowlisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
- LOG.info("Whitelisted host found for: {}", url);
- LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+ LOG.info("Allowlisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}", host);
} else {
try {