Merge pull request #576 from sebastian-nagel/NUTCH-2859-urlnormalizer-protocol-domain-rules
NUTCH-2859: urlnormalizer-protocol: allow to normalize domains
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 5548a30..3e867e6 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1445,6 +1445,21 @@
</description>
</property>
+<property>
+ <name>urlnormalizer.protocols.file</name>
+ <value>protocols.txt</value>
+ <description>urlnormalizer-protocol configuration file</description>
+</property>
+
+<property>
+ <name>urlnormalizer.protocols.rules</name>
+ <value></value>
+ <description>urlnormalizer-protocol rule definitions: if not empty,
+ takes precedence over rules defined in the rule file (see
+ urlnormalizer.protocols.file)</description>
+</property>
+
+
<!-- mime properties -->
<!--
diff --git a/conf/protocols.txt.template b/conf/protocols.txt.template
index 14d48ff..140fb8e 100644
--- a/conf/protocols.txt.template
+++ b/conf/protocols.txt.template
@@ -4,4 +4,10 @@
# protocol. Useful in cases where a host accepts both http and https, doubling
# the site's size.
#
+# All hosts of a domain can also be addressed by adding a "host" pattern
+# starting with "*.". E.g., "*.wikipedia.org" will match all subdomains of
+# the domain "wikipedia.org" as well as the bare domain itself.
+#
+# Note: if the URL includes a port number, the protocol is left unchanged.
+#
# format: <host>\t<protocol>\n
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index fd0aa18..8f3a1fd 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -123,6 +123,8 @@
// check that port number is normalized
normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
+ normalizeTest("https://foo.com:443/index.html",
+ "https://foo.com/index.html");
normalizeTest("http://foo.com:81/", "http://foo.com:81/");
// check that empty port is removed
normalizeTest("http://example.com:/", "http://example.com/");
diff --git a/src/plugin/urlnormalizer-protocol/data/protocols.txt b/src/plugin/urlnormalizer-protocol/data/protocols.txt
index 7091cd7..7e49703 100644
--- a/src/plugin/urlnormalizer-protocol/data/protocols.txt
+++ b/src/plugin/urlnormalizer-protocol/data/protocols.txt
@@ -1,7 +1,21 @@
-# format: host\tprotocol\n
+# Example configuration file for urlnormalizer-protocol
+#
+# URLs of hosts listed in the configuration are normalized to the target
+# protocol. Useful in cases where a host accepts both http and https, doubling
+# the site's size.
+#
+# All hosts of a domain can also be addressed by adding a "host" pattern
+# starting with "*.". E.g., "*.wikipedia.org" will match all subdomains of
+# the domain "wikipedia.org" as well as the bare domain itself.
+#
+# Note: if the URL includes a port number, the protocol is left unchanged.
+#
+# format: <host>\t<protocol>\n
example.org http
example.net http
example.io https
example.nl https
+
+*.example.com https
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index f60c291..e1afde8 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -18,26 +18,38 @@
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
-import java.io.FileReader;
import java.io.IOException;
+import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
+import java.util.TreeMap;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.SuffixStringMatcher;
/**
- * @author markus@openindex.io
+ * URL normalizer to normalize the protocol for all URLs of a given host or
+ * domain, e.g. normalize <code>http://nutch.apache.org/path/</code> to
+ * <code>https://nutch.apache.org/path/</code> if it's known that the host
+ * <code>nutch.apache.org</code> supports https and http-URLs either cause
+ * duplicate content or are redirected to https.
+ *
+ * See {@link org.apache.nutch.net.urlnormalizer.protocol} for details and
+ * configuration.
*/
public class ProtocolURLNormalizer implements URLNormalizer {
@@ -46,14 +58,20 @@
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
- private static final char QUESTION_MARK = '?';
- private static final String PROTOCOL_DELIMITER = "://";
-
- private static String attributeFile = null;
+ private String attributeFile = null;
- // We record a map of hosts and boolean, the boolean denotes whether the host should
- // have slashes after URL paths. True means slash, false means remove the slash
- private static final Map<String,String> protocolsMap = new HashMap<String,String>();
+ // We record a map of hosts and the protocol string to be used for this host
+ private final Map<String,String> protocolsMap = new HashMap<>();
+
+ // Unify protocol strings to reduce the memory footprint (usually there are
+ // only two values: http and https)
+ private final Map<String,String> protocols = new TreeMap<>();
+
+ // Map of domain suffixes and protocol to be used for all hosts below this domain
+ private final Map<String,String> domainProtocolsMap = new HashMap<>();
+ // Matcher for domain suffixes
+ private SuffixStringMatcher domainMatcher = null;
+
private synchronized void readConfiguration(Reader configReader) throws IOException {
if (protocolsMap.size() > 0) {
@@ -76,10 +94,32 @@
host = line.substring(0, delimiterIndex);
protocol = line.substring(delimiterIndex + 1).trim();
-
- protocolsMap.put(host, protocol);
+
+ /*
+ * dedup protocol values to reduce memory footprint of map: equal
+ * strings are represented by the same string object
+ */
+ protocols.putIfAbsent(protocol, protocol);
+ protocol = protocols.get(protocol);
+
+ if (host.startsWith("*.")) {
+ // domain pattern (eg. "*.example.com"):
+ // - use ".example.com" for suffix matching,
+ // including the leading dot to avoid mismatches
+ // ("www.myexample.com")
+ domainProtocolsMap.put(host.substring(1), protocol);
+ // but also match the bare domain name "example.com"
+ protocolsMap.put(host.substring(2), protocol);
+ } else {
+ protocolsMap.put(host, protocol);
+ }
}
}
+ if (domainProtocolsMap.size() > 0) {
+ domainMatcher = new SuffixStringMatcher(domainProtocolsMap.keySet());
+ }
+ LOG.info("Configuration file read: rules for {} hosts and {} domains",
+ protocolsMap.size(), domainProtocolsMap.size());
}
public Configuration getConf() {
@@ -102,7 +142,7 @@
}
// handle blank non empty input
- if (attributeFile != null && attributeFile.trim().equals("")) {
+ if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}
@@ -127,7 +167,7 @@
String file = conf.get("urlnormalizer.protocols.file", attributeFile);
String stringRules = conf.get("urlnormalizer.protocols.rules");
Reader reader = null;
- if (stringRules != null) { // takes precedence over files
+ if (stringRules != null && !stringRules.isEmpty()) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
LOG.info("Reading {} rules file {}", pluginName, file);
@@ -135,11 +175,12 @@
}
try {
if (reader == null) {
- reader = new FileReader(file);
+ Path path = new Path(file);
+ FileSystem fs = path.getFileSystem(conf);
+ reader = new InputStreamReader(fs.open(path));
}
readConfiguration(reader);
- }
- catch (IOException e) {
+ } catch (IOException | IllegalArgumentException e) {
LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
@@ -155,27 +196,29 @@
// Get the host
String host = u.getHost();
+ // Is there a (non-default) port set?
+ if (u.getPort() != -1) {
+ // do not change the protocol if the port is set
+ return url;
+ }
+
+ String requiredProtocol = null;
+
// Do we have a rule for this host?
- if (protocolsMap.containsKey(host)) {
- String protocol = u.getProtocol();
- String requiredProtocol = protocolsMap.get(host);
-
- // Incorrect protocol?
- if (!protocol.equals(requiredProtocol)) {
- // Rebuild URL with new protocol
- StringBuilder buffer = new StringBuilder(requiredProtocol);
- buffer.append(PROTOCOL_DELIMITER);
- buffer.append(host);
- buffer.append(u.getPath());
-
- String queryString = u.getQuery();
- if (queryString != null) {
- buffer.append(QUESTION_MARK);
- buffer.append(queryString);
- }
-
- url = buffer.toString();
- }
+ if (protocolsMap.containsKey(host)) {
+ requiredProtocol = protocolsMap.get(host);
+ } else if (domainMatcher != null) {
+ String domainMatch = domainMatcher.longestMatch(host);
+ if (domainMatch != null) {
+ requiredProtocol = domainProtocolsMap.get(domainMatch);
+ }
+ }
+
+ // Incorrect protocol?
+ if (requiredProtocol != null && !u.getProtocol().equals(requiredProtocol)) {
+ // Rebuild URL with new protocol
+ url = new URL(requiredProtocol, host, u.getPort(), u.getFile())
+ .toString();
}
return url;
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java
new file mode 100644
index 0000000..04f1255
--- /dev/null
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/package-info.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL normalizer to normalize the protocol for all URLs of a given host or
+ * domain.
+ *
+ * E.g., normalize <code>http://nutch.apache.org/path/</code> to
+ * <code>https://nutch.apache.org/path/</code> if it's known that the host
+ * <code>nutch.apache.org</code> supports https and http-URLs either cause
+ * duplicate content or are redirected to https.
+ *
+ * The configuration of rules follows the schema:
+ *
+ * <pre>
+ * &lt;host&gt; \t &lt;protocol&gt;
+ * </pre>
+ *
+ * for example
+ *
+ * <pre>
+ * nutch.apache.org \t https
+ * *.example.com \t http
+ * </pre>
+ *
+ * These rules will normalize all URLs of the host <code>nutch.apache.org</code>
+ * to use https while every URL from <code>example.com</code> and its subdomains
+ * is normalized to be based on http.
+ *
+ * A "host" pattern which starts with <code>*.</code> will match all hosts
+ * (subdomains) of the given domain, or more generally matches domain suffixes
+ * separated by a dot.
+ *
+ * Rules are usually configured via the configuration file "protocols.txt". The
+ * filename is specified by the property
+ * <code>urlnormalizer.protocols.file</code>. Alternatively, if the property
+ * <code>urlnormalizer.protocols.rules</code> defines a non-empty string, these
+ * rules take precedence over those specified in the rule file.
+ */
+package org.apache.nutch.net.urlnormalizer.protocol;
+
diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index 1b9760b..de9f77a 100644
--- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -36,19 +36,52 @@
normalizer.setConf(conf);
// No change
- assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-
+ assertEquals("http://example.org/", normalizer
+ .normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/", normalizer
+ .normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
// https to http
- assertEquals("http://example.org/", normalizer.normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("http://example.net/", normalizer.normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
-
+ assertEquals("http://example.org/", normalizer
+ .normalize("https://example.org/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://example.net/", normalizer
+ .normalize("https://example.net/", URLNormalizers.SCOPE_DEFAULT));
+
// no change
- assertEquals("https://example.io/", normalizer.normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("https://example.nl/", normalizer.normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.io/", normalizer
+ .normalize("https://example.io/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.nl/", normalizer
+ .normalize("https://example.nl/", URLNormalizers.SCOPE_DEFAULT));
// http to https
- assertEquals("https://example.io/", normalizer.normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
- assertEquals("https://example.nl/", normalizer.normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.io/", normalizer
+ .normalize("http://example.io/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.nl/", normalizer
+ .normalize("http://example.nl/", URLNormalizers.SCOPE_DEFAULT));
+
+ // verify proper (de)serialization of URLs
+ assertEquals("https://example.io/path?q=uery", normalizer.normalize(
+ "http://example.io/path?q=uery", URLNormalizers.SCOPE_DEFAULT));
+
+ // verify that URLs including a port are left unchanged (port and protocol
+ // are kept)
+ assertEquals("http://example.io:8080/path?q=uery", normalizer.normalize(
+ "http://example.io:8080/path?q=uery", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://example.org:8443/path", normalizer.normalize(
+ "https://example.org:8443/path", URLNormalizers.SCOPE_DEFAULT));
+
+ // verify normalization of all subdomains (host pattern *.example.com)
+ assertEquals("https://example.com/", normalizer
+ .normalize("http://example.com/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://www.example.com/", normalizer
+ .normalize("http://www.example.com/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("https://www.subdomain.example.com/", normalizer.normalize(
+ "http://www.subdomain.example.com/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://myexample.com/", normalizer
+ .normalize("http://myexample.com/", URLNormalizers.SCOPE_DEFAULT));
+ assertEquals("http://www.subdomain.example.com:8080/path?q=uery",
+ normalizer.normalize(
+ "http://www.subdomain.example.com:8080/path?q=uery",
+ URLNormalizers.SCOPE_DEFAULT));
}
}