Merge pull request #552 from sebastian-nagel/NUTCH-2824

NUTCH-2824 urlnormalizer-basic to unescape percent-encoded host names
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 8794130..5479882 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -20,10 +20,12 @@
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
 import java.net.IDN;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.net.URLDecoder;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
@@ -396,10 +398,22 @@
 
   private String normalizeHostName(String host) throws MalformedURLException {
 
-    // 1. lowercase host name
+    // 1. unescape percent-encoded characters in host name
+    if (host.indexOf('%') != -1) {
+      try {
+        host = URLDecoder.decode(host, StandardCharsets.UTF_8.toString());
+      } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+        LOG.debug("Failed to convert percent-encoded host name {}: ", host, e);
+        throw (MalformedURLException) new MalformedURLException(
+            "Invalid percent-encoded host name " + host + ": " + e.getMessage())
+                .initCause(e);
+      }
+    }
+
+    // 2. lowercase host name
     host = host.toLowerCase(Locale.ROOT);
 
-    // 2. if configured: convert between Unicode and ASCII forms
+    // 3. if configured: convert between Unicode and ASCII forms
     //    for Internationalized Domain Names (IDNs)
     if (hostIDNtoASCII && !isAscii(host)) {
       try {
@@ -418,7 +432,7 @@
       host = IDN.toUnicode(host);
     }
 
-    // 3. optionally trim a trailing dot
+    // 4. optionally trim a trailing dot
     if (hostTrimTrailingDot) {
       if (host.endsWith(".")) {
         host = host.substring(0, host.length()-1);
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 5c6f710..fd0aa18 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -116,6 +116,11 @@
     normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
     normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
 
+    // NUTCH-2824 unescape percent characters in host names
+    normalizeTest("https://example%2Ecom/", "https://example.com/");
+    normalizeTest("https://www.0251-sachverst%c3%a4ndiger.de/",
+        "https://www.0251-sachverst\u00e4ndiger.de/");
+
     // check that port number is normalized
     normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
     normalizeTest("http://foo.com:81/", "http://foo.com:81/");
@@ -130,8 +135,8 @@
     // check that references are removed
     normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
 
-    // // check that encoding is normalized
-    // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+    // check that encoding is normalized
+    normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
 
     // check that unnecessary "../" are removed
     normalizeTest("http://foo.com/..", "http://foo.com/");
@@ -226,6 +231,9 @@
     conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
     norm.setConf(conf);
     normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
+    // verify unescaping of percent-encoded characters in IDNs (NUTCH-2824)
+    normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/",
+        "https://www.xn--0251-sachverstndiger-ozb.de/");
     conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
     norm.setConf(conf);
     normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
@@ -236,6 +244,23 @@
         "https://www.example.org/");
   }
 
+  /**
+   * Test that the normalizer throws MalformedURLException for URLs with invalid percent-encoded host names (NUTCH-2824)
+   */
+  @Test
+  public void testInvalidURLs() throws Exception {
+    // "%2X" is not a valid percent-encoded sequence ('X' is not a hex digit)
+    normalizeTestAssertThrowsMalformedURLException("https://example%2Xcom/");
+    // "%FE" is well-formed percent-encoding but not a valid UTF-8 sequence
+    // (only validated if parsed as Internationalized Domain Name, hence "toAscii" below)
+    BasicURLNormalizer norm = new BasicURLNormalizer();
+    conf = NutchConfiguration.create();
+    conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+    norm.setConf(conf);
+    normalizeTestAssertThrowsMalformedURLException(norm,
+        "https://abc%FEdef.org/");
+  }
+
   private void normalizeTest(String weird, String normal) throws Exception {
     normalizeTest(this.normalizer, weird, normal);
   }
@@ -252,6 +277,23 @@
     }
   }
 
+  private void normalizeTestAssertThrowsMalformedURLException(String weird) throws Exception {
+    normalizeTestAssertThrowsMalformedURLException(this.normalizer, weird); // delegate, passing this test's normalizer field
+  }
+
+  private void normalizeTestAssertThrowsMalformedURLException(
+      BasicURLNormalizer normalizer, String weird) throws Exception {
+    String normalized = null; // only used in the failure message below
+    try {
+      normalized = normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT);
+    } catch (MalformedURLException e) {
+      // expected: the normalizer rejected the URL, nothing more to check
+      return;
+    }
+    Assert.fail("Expected MalformedURLException was not thrown on " + weird
+        + " (normalized: " + normalized + ")");
+  }
+
   public static void main(String[] args) throws Exception {
     new TestBasicURLNormalizer().testNormalizer();
   }