Merge pull request #480 from sebastian-nagel/NUTCH-2746-url-normalizer-basic-idn NUTCH-2746 Basic URL normalizer to normalize Unicode domain names

commit: ec45fe5cd1b7b7d7da8d1a04701cfd140a323178 [log] [tgz]
author: Sebastian Nagel <snagel@apache.org> Fri Nov 22 18:52:18 2019 +0100
committer: GitHub <noreply@github.com> Fri Nov 22 18:52:18 2019 +0100
tree: c48b3fff490512fb00bf527001d52cac4a23afd0
parent: c23afa8261cf3d72bd8382a9ec53647bf8447d7c [diff]
parent: c43b486ec29ab0a8960c1b51a71470f584749a78 [diff]
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 97fcbe4..01f4578 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml

@@ -1364,6 +1364,26 @@
   </description>
 </property>
 
+<property>
+  <name>urlnormalizer.basic.host.idn</name>
+  <value></value>
+  <description>Let urlnormalizer-basic
+  (org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer)
+  normalize Internationalized Domain Names (IDNs). Possible values
+  are: `toAscii` - convert the Unicode form to the ASCII (Punycode)
+  representation, `toUnicode` - convert ASCII (Punycode) to Unicode,
+  or if left empty no normalization of IDNs is performed.
+  </description>
+</property>
+
+<property>
+  <name>urlnormalizer.basic.host.trim-trailing-dot</name>
+  <value>false</value>
+  <description>urlnormalizer-basic: Trim a trailing dot in host names:
+  `https://example.org./` is normalized to `https://example.org/`.
+  </description>
+</property>
+
 <!-- mime properties -->
 
 <!--

diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index d0e8f5f..93bd336 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java

@@ -20,6 +20,7 @@
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.net.IDN;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
@@ -29,7 +30,7 @@
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.net.URLNormalizer;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NutchConfiguration;
@@ -46,10 +47,13 @@
  * percent-encoding</a> in URL paths</li>
  * </ul>
  */
-public class BasicURLNormalizer extends Configured implements URLNormalizer {
+public class BasicURLNormalizer implements URLNormalizer {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
 
+  public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";
+  public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";
+
   /**
    * Pattern to detect whether a URL path could be normalized. Contains one of
    * /. or ./ /.. or ../ //
@@ -128,6 +132,43 @@
         || (0x30 <= c && c <= 0x39);
   }
 
+  /**
+   * Checks whether a string consists of ASCII characters only.
+   *
+   * @param str
+   *          string to check
+   * @return true if no char of <code>str</code> is greater than 127 (this
+   *         includes the empty string), false otherwise
+   */
+  private static boolean isAscii(String str) {
+    char[] chars = str.toCharArray();
+    for (char c : chars) {
+      if (c > 127) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  private Configuration conf;
+
+  private boolean hostIDNtoASCII;
+  private boolean hostASCIItoIDN;
+  private boolean hostTrimTrailingDot;
+
+  /**
+   * Creates a normalizer. IDN conversion and trimming of a trailing dot in
+   * host names stay disabled until {@link #setConf(Configuration)} is called.
+   */
+  public BasicURLNormalizer() {
+  }
+
+  /**
+   * @return the configuration stored in this normalizer
+   */
+  @Override
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and reads the host name normalization settings:
+   * {@link #NORM_HOST_IDN} (<code>toAscii</code> or <code>toUnicode</code>,
+   * matched case-insensitively; any other value disables the conversion of
+   * IDNs) and {@link #NORM_HOST_TRIM_TRAILING_DOT}.
+   *
+   * @param conf
+   *          configuration to read the settings from
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String normIdn = conf.get(NORM_HOST_IDN, "");
+    // Assign both flags unconditionally: a repeated call with a different
+    // setting must not leave the flag of the previous setting enabled.
+    hostIDNtoASCII = normIdn.equalsIgnoreCase("toAscii");
+    hostASCIItoIDN = normIdn.equalsIgnoreCase("toUnicode");
+    hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
+  }
+
+  @Override
   public String normalize(String urlString, String scope)
       throws MalformedURLException {
     
@@ -153,7 +194,7 @@
         || "ftp".equals(protocol)) {
 
       if (host != null && url.getAuthority() != null) {
-        String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
+        String newHost = normalizeHostName(host);
         if (!host.equals(newHost)) {
           host = newHost;
           changed = true;
@@ -353,6 +394,40 @@
     return sb.toString();
   }
 
+  /**
+   * Normalizes a host name: lowercases it, optionally converts an
+   * Internationalized Domain Name between its Unicode and ASCII (Punycode)
+   * forms, and optionally removes a single trailing dot. The optional steps
+   * are controlled by the flags set in <code>setConf</code>.
+   *
+   * @param host
+   *          host name to normalize
+   * @return the normalized host name
+   * @throws MalformedURLException
+   *           if the conversion to the ASCII form fails
+   */
+  private String normalizeHostName(String host) throws MalformedURLException {
+
+    // 1. lowercase host name
+    host = host.toLowerCase(Locale.ROOT);
+
+    // 2. if configured: convert between Unicode and ASCII forms
+    //    for Internationalized Domain Names (IDNs)
+    if (hostIDNtoASCII && !isAscii(host)) {
+      try {
+        host = IDN.toASCII(host);
+      } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
+        // IllegalArgumentException: thrown if the input string contains
+        // non-convertible Unicode codepoints
+        // IndexOutOfBoundsException: thrown (undocumented) if one "label"
+        // (non-ASCII dot-separated segment) is longer than 256 characters,
+        // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
+        LOG.debug("Failed to convert IDN host {}: ", host, e);
+        throw (MalformedURLException) new MalformedURLException(
+            "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
+      }
+    } else if (hostASCIItoIDN && host.contains("xn--")) {
+      host = IDN.toUnicode(host);
+    }
+
+    // 3. optionally trim a trailing dot
+    if (hostTrimTrailingDot) {
+      if (host.endsWith(".")) {
+        host = host.substring(0, host.length()-1);
+      }
+    }
+
+    return host;
+  }
+
   public static void main(String args[]) throws IOException {
     BasicURLNormalizer normalizer = new BasicURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());

diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
index ae59a84..7d765f4 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java

@@ -16,8 +16,26 @@
  */
 
 /**
- * URL normalizer performing basic normalizations: remove default ports
- * and dot segments in path.
+ * URL normalizer performing basic normalizations:
+ * <ul>
+ * <li>remove default ports, e.g., port 80 for <code>http://</code> URLs</li>
+ * <li>remove needless slashes and dot segments in the path component</li>
+ * <li>remove anchors</li>
+ * <li>use percent-encoding (only) where needed</li>
+ * </ul>
+ * 
+ * E.g.,
+ * <code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor</code>
+ * is normalized to <code>https://www.example.org/b/select-lang.php?lang=espa%C3%B1ol</code>
+ * 
+ * Optional and configurable normalizations are:
+ * <ul>
+ * <li>convert Internationalized Domain Names (IDNs) uniquely either to the
+ * ASCII (Punycode) or Unicode representation, see property
+ * <code>urlnormalizer.basic.host.idn</code></li>
+ * <li>remove a trailing dot from host names, see property
+ * <code>urlnormalizer.basic.host.trim-trailing-dot</code></li>
+ * </ul>
  */
 package org.apache.nutch.net.urlnormalizer.basic;
 

diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 102b10c..5c6f710 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

@@ -77,7 +77,7 @@
     // check that control chars are always encoded into 2 digits
     normalizeTest("http://foo.com/\u0001!", "http://foo.com/%01!");
 
-    // check encoding of spanish chars
+    // check encoding of Spanish chars
     normalizeTest("http://mydomain.com/en Espa\u00F1ol.aspx", "http://mydomain.com/en%20Espa%C3%B1ol.aspx");
   }
   
@@ -212,11 +212,36 @@
   
   @Test
   public void testCurlyBraces() throws Exception {
-    // check that leading and trailing spaces are removed
+    // check whether curly braces are properly escaped
     normalizeTest("http://foo.com/{{stuff}} ", "http://foo.com/%7B%7Bstuff%7D%7D");
   }
 
+  /**
+   * Tests the host name normalizations: a host name which needs no
+   * normalization, conversion of an IDN to its ASCII and to its Unicode form,
+   * and removal of a trailing dot. The optional normalizations are tested
+   * with a separate normalizer instance which is reconfigured for each step
+   * using the same configuration object.
+   */
+  @Test
+  public void testHostName() throws Exception {
+    // (nothing to normalize in host name)
+    normalizeTest("https://www.example.org/", "https://www.example.org/");
+    // test Internationalized Domain Names
+    BasicURLNormalizer norm = new BasicURLNormalizer();
+    conf = NutchConfiguration.create();
+    conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+    norm.setConf(conf);
+    normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
+    conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
+    norm.setConf(conf);
+    normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
+    // test removal of trailing dot
+    conf.setBoolean(BasicURLNormalizer.NORM_HOST_TRIM_TRAILING_DOT, true);
+    norm.setConf(conf);
+    normalizeTest(norm, "https://www.example.org./",
+        "https://www.example.org/");
+  }
+
+  /**
+   * Runs the normalization check using the normalizer held in the field
+   * <code>normalizer</code> of this test.
+   */
   private void normalizeTest(String weird, String normal) throws Exception {
+    normalizeTest(this.normalizer, weird, normal);
+  }
+
+  private void normalizeTest(BasicURLNormalizer normalizer, String weird,
+      String normal) throws Exception {
     Assert.assertEquals("normalizing: " + weird, normal,
         normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));
     try {
commit	ec45fe5cd1b7b7d7da8d1a04701cfd140a323178	[log] [tgz]
author	Sebastian Nagel <snagel@apache.org>	Fri Nov 22 18:52:18 2019 +0100
committer	GitHub <noreply@github.com>	Fri Nov 22 18:52:18 2019 +0100
tree	c48b3fff490512fb00bf527001d52cac4a23afd0
parent	c23afa8261cf3d72bd8382a9ec53647bf8447d7c [diff]
parent	c43b486ec29ab0a8960c1b51a71470f584749a78 [diff]