Merge pull request #576 from sebastian-nagel/NUTCH-2859-urlnormalizer-protocol-domain-rules NUTCH-2859: urlnormalizer-protocol: allow to normalize domains

commit: 6c02da053d8ce65e0283a144ab59586e563608b8 [log] [tgz]
author: Sebastian Nagel <snagel@apache.org> Tue Apr 06 16:52:03 2021 +0200
committer: GitHub <noreply@github.com> Tue Apr 06 16:52:03 2021 +0200
tree: 556708366c51fbc0da57dce02fe2b5b71fb84050
parent: 2837039b9c5b52a88c2029a5e29c81cecd8953f3 [diff]
parent: 081c826745356e041c6d4e9d3e3c96ae91eddd2b [diff]
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 5548a30..3e867e6 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml

@@ -1445,6 +1445,21 @@
   </description>
 </property>
 
+<property>
+  <name>urlnormalizer.protocols.file</name>
+  <value>protocols.txt</value>
+  <description>urlnormalizer-protocol configuration file</description>
+</property>
+
+<property>
+  <name>urlnormalizer.protocols.rules</name>
+  <value></value>
+  <description>urlnormalizer-protocol rule definitions: if not empty,
+  takes precedence over rules defined in the rule file (see
+  urlnormalizer.protocols.file)</description>
+</property>
+
+
 <!-- mime properties -->
 
 <!--

diff --git a/conf/protocols.txt.template b/conf/protocols.txt.template
index 14d48ff..140fb8e 100644
--- a/conf/protocols.txt.template
+++ b/conf/protocols.txt.template

@@ -4,4 +4,10 @@
 # protocol. Useful in cases where a host accepts both http and https, doubling
 # the site's size.
 #
+# Also all hosts of a domain can be addressed by adding a "host" pattern
+# starting with "*.". E.g., "*.wikipedia.org" will match all subdomains of
+# the domain "wikipedia.org"
+#
+# Note: if the URL includes a port number, the protocol is left unchanged.
+#
 # format: <host>\t<protocol>\n

diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index fd0aa18..8f3a1fd 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

@@ -123,6 +123,8 @@
 
     // check that port number is normalized
     normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
+    normalizeTest("https://foo.com:443/index.html",
+        "https://foo.com/index.html");
     normalizeTest("http://foo.com:81/", "http://foo.com:81/");
     // check that empty port is removed
     normalizeTest("http://example.com:/", "http://example.com/");

diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt
index 7091cd7..7e49703 100644
--- a/src/plugin/urlnormalizer-protocol/data/protocols.txt
+++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt

@@ -1,7 +1,21 @@
-# format: host\tprotocol\n
+# Example configuration file for urlnormalizer-protocol
+#
+# URLs of hosts listed in the configuration are normalized to the target
+# protocol. Useful in cases where a host accepts both http and https, doubling
+# the site's size.
+#
+# Also all hosts of a domain can be addressed by adding a "host" pattern
+# starting with "*.". E.g., "*.wikipedia.org" will match all subdomains of
+# the domain "wikipedia.org"
+#
+# Note: if the URL includes a port number, the protocol is left unchanged.
+#
+# format: <host>\t<protocol>\n
 
 example.org	http
 example.net	http
 
 example.io	https
 example.nl	https
+
+*.example.com	https

diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index f60c291..e1afde8 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java

@@ -18,26 +18,38 @@
 
 import java.lang.invoke.MethodHandles;
 import java.io.BufferedReader;
-import java.io.FileReader;
 import java.io.IOException;
+import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.TreeMap;
 
 import org.apache.commons.lang.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.net.URLNormalizer;
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.SuffixStringMatcher;
 
 /**
- * @author markus@openindex.io
+ * URL normalizer to normalize the protocol for all URLs of a given host or
+ * domain, e.g. normalize <code>http://nutch.apache.org/path/</code> to
+ * <code>https://nutch.apache.org/path/</code> if it's known that the host
+ * <code>nutch.apache.org</code> supports https and http-URLs either cause
+ * duplicate content or are redirected to https.
+ * 
+ * See {@link org.apache.nutch.net.urlnormalizer.protocol} for details and
+ * configuration.
  */
 public class ProtocolURLNormalizer implements URLNormalizer {
 
@@ -46,14 +58,20 @@
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
-  private static final char QUESTION_MARK = '?';
-  private static final String PROTOCOL_DELIMITER = "://";
-
-  private static String attributeFile = null;
+  private String attributeFile = null;
   
-  // We record a map of hosts and boolean, the boolean denotes whether the host should
-  // have slashes after URL paths. True means slash, false means remove the slash
-  private static final Map<String,String> protocolsMap = new HashMap<String,String>();
+  // We record a map of hosts and the protocol string to be used for this host
+  private final Map<String,String> protocolsMap = new HashMap<>();
+
+  // Unify protocol strings to reduce the memory footprint (usually there are only
+  // two values: http and https)
+  private final Map<String,String> protocols = new TreeMap<>();
+
+  // Map of domain suffixes and protocol to be used for all hosts below this domain
+  private final Map<String,String> domainProtocolsMap = new HashMap<>();
+  // Matcher for domain suffixes
+  private SuffixStringMatcher domainMatcher = null;
+
 
   private synchronized void readConfiguration(Reader configReader) throws IOException {
     if (protocolsMap.size() > 0) {
@@ -76,10 +94,32 @@
 
         host = line.substring(0, delimiterIndex);
         protocol = line.substring(delimiterIndex + 1).trim();
-        
-        protocolsMap.put(host, protocol);
+
+        /*
+         * dedup protocol values to reduce memory footprint of map: equal
+         * strings are represented by the same string object
+         */
+        protocols.putIfAbsent(protocol, protocol);
+        protocol = protocols.get(protocol);
+
+        if (host.startsWith("*.")) {
+          // domain pattern (eg. "*.example.com"):
+          // - use ".example.com" for suffix matching,
+          //   including the leading dot to avoid mismatches
+          //   ("www.myexample.com")
+          domainProtocolsMap.put(host.substring(1), protocol);
+          // but also match the bare domain name "example.com"
+          protocolsMap.put(host.substring(2), protocol);
+        } else {
+          protocolsMap.put(host, protocol);
+        }
       }
     }
+    if (domainProtocolsMap.size() > 0) {
+      domainMatcher = new SuffixStringMatcher(domainProtocolsMap.keySet());
+    }
+    LOG.info("Configuration file read: rules for {} hosts and {} domains",
+        protocolsMap.size(), domainProtocolsMap.size());
   }
 
   public Configuration getConf() {
@@ -102,7 +142,7 @@
     }
 
     // handle blank non empty input
-    if (attributeFile != null && attributeFile.trim().equals("")) {
+    if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
     }
 
@@ -127,7 +167,7 @@
     String file = conf.get("urlnormalizer.protocols.file", attributeFile);
     String stringRules = conf.get("urlnormalizer.protocols.rules");
     Reader reader = null;
-    if (stringRules != null) { // takes precedence over files
+    if (stringRules != null && !stringRules.isEmpty()) { // takes precedence over files
       reader = new StringReader(stringRules);
     } else {
       LOG.info("Reading {} rules file {}", pluginName, file);
@@ -135,11 +175,12 @@
     }
     try {
       if (reader == null) {
-        reader = new FileReader(file);
+        Path path = new Path(file);
+        FileSystem fs = path.getFileSystem(conf);
+        reader = new InputStreamReader(fs.open(path));
       }
       readConfiguration(reader);
-    }
-    catch (IOException e) {
+    } catch (IOException | IllegalArgumentException e) {
       LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }
@@ -155,27 +196,29 @@
     // Get the host
     String host = u.getHost();
 
+    // Is a port explicitly given in the URL (getPort() != -1)?
+    if (u.getPort() != -1) {
+      // do not change the protocol if the port is set
+      return url;
+    }
+
+    String requiredProtocol = null;
+
     // Do we have a rule for this host?
-    if (protocolsMap.containsKey(host)) {    
-      String protocol = u.getProtocol();
-      String requiredProtocol = protocolsMap.get(host);
-      
-      // Incorrect protocol?
-      if (!protocol.equals(requiredProtocol)) {
-        // Rebuild URL with new protocol
-        StringBuilder buffer = new StringBuilder(requiredProtocol);
-        buffer.append(PROTOCOL_DELIMITER);
-        buffer.append(host);
-        buffer.append(u.getPath());
-        
-        String queryString = u.getQuery();
-        if (queryString != null) {
-          buffer.append(QUESTION_MARK);
-          buffer.append(queryString);
-        }
-        
-        url = buffer.toString();
-      }
+    if (protocolsMap.containsKey(host)) {
+      requiredProtocol = protocolsMap.get(host);
+    } else if (domainMatcher != null) {
+      String domainMatch = domainMatcher.longestMatch(host);
+      if (domainMatch != null) {
+        requiredProtocol = domainProtocolsMap.get(domainMatch);
+     }
+    }
+
+    // Incorrect protocol?
+    if (requiredProtocol != null && !u.getProtocol().equals(requiredProtocol)) {
+      // Rebuild URL with new protocol
+      url = new URL(requiredProtocol, host, u.getPort(), u.getFile())
+          .toString();
     }
 
     return url;

diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java
new file mode 100644
index 0000000..04f1255
--- /dev/null
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java

@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer to normalize the protocol for all URLs of a given host or
+ * domain.
+ * 
+ * E.g., normalize <code>http://nutch.apache.org/path/</code> to
+ * <code>https://nutch.apache.org/path/</code> if it's known that the host
+ * <code>nutch.apache.org</code> supports https and http-URLs either cause
+ * duplicate content or are redirected to https.
+ *
+ * The configuration of rules follows the schema:
+ * 
+ * <pre>
+ * &lt;host&gt; \t &lt;protocol&gt;
+ * </pre>
+ * 
+ * for example
+ * 
+ * <pre>
+ * nutch.apache.org \t https
+ * *.example.com \t http
+ * </pre>
+ * 
+ * These rules will normalize all URLs of the host <code>nutch.apache.org</code>
+ * to use https while every URL from <code>example.com</code> and its subdomains
+ * is normalized to be based on http.
+ *
+ * A "host" pattern which starts with <code>*.</code> will match all hosts
+ * (subdomains) of the given domain, or more generally matches domain suffixes
+ * separated by a dot.
+ * 
+ * Rules are usually configured via the configuration file "protocols.txt". The
+ * filename is specified by the property
+ * <code>urlnormalizer.protocols.file</code>. Alternatively, if the property
+ * <code>urlnormalizer.protocols.rules</code> defines a non-empty string, these
+ * rules take precedence over those specified in the rule file.
+ */
+package org.apache.nutch.net.urlnormalizer.protocol;
+

diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index 1b9760b..de9f77a 100644
--- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java

@@ -36,19 +36,52 @@
     normalizer.setConf(conf);
 
     // NOTE(review): labelled "No change", but the inputs below use https and are rewritten to http
-    assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-    
+    assertEquals("http://example.org/", normalizer
+        .normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/", normalizer
+        .normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
     // https to http
-    assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-    
+    assertEquals("http://example.org/", normalizer
+        .normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://example.net/", normalizer
+        .normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
     // no change
-    assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.io/", normalizer
+        .normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.nl/", normalizer
+        .normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
     
     // http to https
-    assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
-    assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.io/", normalizer
+        .normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.nl/", normalizer
+        .normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+
+    // verify proper (de)serialization of URLs
+    assertEquals("https://example.io/path?q=uery", normalizer.normalize(
+        "http://example.io/path?q=uery", URLNormalizers.SCOPE_DEFAULT));
+
+    // verify that URLs including a port are left unchanged (port and protocol
+    // are kept)
+    assertEquals("http://example.io:8080/path?q=uery", normalizer.normalize(
+        "http://example.io:8080/path?q=uery", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://example.org:8443/path", normalizer.normalize(
+        "https://example.org:8443/path", URLNormalizers.SCOPE_DEFAULT));
+
+    // verify normalization of all subdomains (host pattern *.example.com)
+    assertEquals("https://example.com/", normalizer
+        .normalize("http://example.com/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://www.example.com/", normalizer
+        .normalize("http://www.example.com/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("https://www.subdomain.example.com/", normalizer.normalize(
+        "http://www.subdomain.example.com/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://myexample.com/", normalizer
+        .normalize("http://myexample.com/", URLNormalizers.SCOPE_DEFAULT));
+    assertEquals("http://www.subdomain.example.com:8080/path?q=uery",
+        normalizer.normalize(
+            "http://www.subdomain.example.com:8080/path?q=uery",
+            URLNormalizers.SCOPE_DEFAULT));
   }
 }
commit	6c02da053d8ce65e0283a144ab59586e563608b8	[log] [tgz]
author	Sebastian Nagel <snagel@apache.org>	Tue Apr 06 16:52:03 2021 +0200
committer	GitHub <noreply@github.com>	Tue Apr 06 16:52:03 2021 +0200
tree	556708366c51fbc0da57dce02fe2b5b71fb84050
parent	2837039b9c5b52a88c2029a5e29c81cecd8953f3 [diff]
parent	081c826745356e041c6d4e9d3e3c96ae91eddd2b [diff]