Merge pull request #480 from sebastian-nagel/NUTCH-2746-url-normalizer-basic-idn
NUTCH-2746 Basic URL normalizer to normalize Unicode domain names
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 97fcbe4..01f4578 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1364,6 +1364,26 @@
</description>
</property>
+<property>
+ <name>urlnormalizer.basic.host.idn</name>
+ <value></value>
+ <description>Let urlnormalizer-basic
+ (org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer)
+ normalize Internationalized Domain Names (IDNs). Possible values
+ are: `toAscii` - convert the Unicode form to the ASCII (Punycode)
+ representation, `toUnicode` - convert ASCII (Punycode) to Unicode,
+ or if left empty no normalization of IDNs is performed.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.basic.host.trim-trailing-dot</name>
+ <value>false</value>
+ <description>urlnormalizer-basic: Trim a trailing dot in host names:
+ `https://example.org./` is normalized to `https://example.org/`.
+ </description>
+</property>
+
<!-- mime properties -->
<!--
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index d0e8f5f..93bd336 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -20,6 +20,7 @@
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
@@ -29,7 +30,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
@@ -46,10 +47,13 @@
* percent-encoding</a> in URL paths</li>
* </ul>
*/
-public class BasicURLNormalizer extends Configured implements URLNormalizer {
+public class BasicURLNormalizer implements URLNormalizer {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
+ public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";
+ public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";
+
/**
* Pattern to detect whether a URL path could be normalized. Contains one of
* /. or ./ /.. or ../ //
@@ -128,6 +132,43 @@
|| (0x30 <= c && c <= 0x39);
}
+ private static boolean isAscii(String str) {
+   // ASCII-only means: no char with a code above 127
+   for (int pos = 0; pos < str.length(); pos++) {
+     if (str.charAt(pos) > 127) {
+       return false;
+     }
+   }
+   return true;
+ }
+
+ private Configuration conf;
+
+ private boolean hostIDNtoASCII;
+ private boolean hostASCIItoIDN;
+ private boolean hostTrimTrailingDot;
+
+ public BasicURLNormalizer() { // no-arg constructor; fields are set in setConf()
+ }
+
+ @Override
+ public Configuration getConf() {
+ return conf; // the Configuration most recently passed to setConf()
+ }
+
+ @Override
+ public void setConf(Configuration conf) {
+   // Store the configuration and derive the host normalization flags from it.
+   // All flags are assigned on every call: if setConf() is called again with
+   // another IDN mode, the flag of the previous mode must not stay switched on.
+   this.conf = conf;
+   String normIdn = conf.get(NORM_HOST_IDN, "");
+   hostIDNtoASCII = normIdn.equalsIgnoreCase("toAscii");
+   hostASCIItoIDN = normIdn.equalsIgnoreCase("toUnicode");
+   hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
+ }
+
+ @Override
public String normalize(String urlString, String scope)
throws MalformedURLException {
@@ -153,7 +194,7 @@
|| "ftp".equals(protocol)) {
if (host != null && url.getAuthority() != null) {
- String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
+ String newHost = normalizeHostName(host);
if (!host.equals(newHost)) {
host = newHost;
changed = true;
@@ -353,6 +394,40 @@
return sb.toString();
}
+ /**
+  * Normalize a host name: lowercase it, if configured convert between the
+  * Unicode and ASCII (Punycode) form of an IDN and trim a trailing dot.
+  *
+  * @throws MalformedURLException if the host cannot be converted to ASCII
+  */
+ private String normalizeHostName(String host) throws MalformedURLException {
+   String name = host.toLowerCase(Locale.ROOT);
+
+   if (hostIDNtoASCII && !isAscii(name)) {
+     // Unicode => ASCII (Punycode), only tried if there are non-ASCII chars
+     try {
+       name = IDN.toASCII(name);
+     } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+       // IllegalArgumentException: non-convertible Unicode code points,
+       // IndexOutOfBoundsException (undocumented): a label longer than 256
+       // characters, cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+       LOG.debug("Failed to convert IDN host {}: ", name, e);
+       MalformedURLException invalidIdn = new MalformedURLException(
+           "Invalid IDN " + name + ": " + e.getMessage());
+       invalidIdn.initCause(e);
+       throw invalidIdn;
+     }
+   } else if (hostASCIItoIDN && name.contains("xn--")) {
+     // ASCII (Punycode) => Unicode, only tried if there is a Punycode prefix
+     name = IDN.toUnicode(name);
+   }
+
+   if (hostTrimTrailingDot && name.endsWith(".")) {
+     name = name.substring(0, name.length() - 1);
+   }
+   return name;
+ }
+
public static void main(String args[]) throws IOException {
BasicURLNormalizer normalizer = new BasicURLNormalizer();
normalizer.setConf(NutchConfiguration.create());
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
index ae59a84..7d765f4 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
@@ -16,8 +16,26 @@
*/
/**
- * URL normalizer performing basic normalizations: remove default ports
- * and dot segments in path.
+ * URL normalizer performing basic normalizations:
+ * <ul>
+ * <li>remove default ports, e.g., port 80 for <code>http://</code> URLs</li>
+ * <li>remove needless slashes and dot segments in the path component</li>
+ * <li>remove anchors</li>
+ * <li>use percent-encoding (only) where needed</li>
+ * </ul>
+ *
+ * E.g.,
+ * <code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor</code>
+ * is normalized to <code>https://www.example.org/b/select-lang.php?lang=espa%C3%B1ol</code>
+ *
+ * Optional and configurable normalizations are:
+ * <ul>
+ * <li>convert Internationalized Domain Names (IDNs) uniquely either to the
+ * ASCII (Punycode) or Unicode representation, see property
+ * <code>urlnormalizer.basic.host.idn</code></li>
+ * <li>remove a trailing dot from host names, see property
+ * <code>urlnormalizer.basic.host.trim-trailing-dot</code></li>
+ * </ul>
*/
package org.apache.nutch.net.urlnormalizer.basic;
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 102b10c..5c6f710 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -77,7 +77,7 @@
// check that control chars are always encoded into 2 digits
normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
- // check encoding of spanish chars
+ // check encoding of Spanish chars
normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
}
@@ -212,11 +212,36 @@
@Test
public void testCurlyBraces() throws Exception {
- // check that leading and trailing spaces are removed
+ // check whether curly braces are properly escaped
normalizeTest("http://foo.com/{{stuff}} ", "http://foo.com/%7B%7Bstuff%7D%7D");
}
+ @Test
+ public void testHostName() throws Exception {
+ // a plain ASCII host name must pass unchanged (checked with the shared normalizer)
+ normalizeTest("https://www.example.org/", "https://www.example.org/");
+ // test Internationalized Domain Names with a separately configured normalizer
+ BasicURLNormalizer norm = new BasicURLNormalizer();
+ conf = NutchConfiguration.create();
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii"); // Unicode => ASCII (Punycode)
+ norm.setConf(conf);
+ normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode"); // ASCII (Punycode) => Unicode
+ norm.setConf(conf);
+ normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
+ // test removal of trailing dot (the IDN mode is still "toUnicode" here)
+ conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true);
+ norm.setConf(conf);
+ normalizeTest(norm, "https://www.example.org./",
+ "https://www.example.org/");
+ }
+
private void normalizeTest(String weird, String normal) throws Exception {
+ normalizeTest(this.normalizer, weird, normal); // run the check with the shared normalizer
+ }
+
+ private void normalizeTest(BasicURLNormalizer normalizer, String weird,
+ String normal) throws Exception {
Assert.assertEquals("normalizing: " + weird, normal,
normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
try {