Merge pull request #552 from sebastian-nagel/NUTCH-2824
NUTCH-2824 urlnormalizer-basic to unescape percent-encoded host names
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 8794130..5479882 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -20,10 +20,12 @@
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
+import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
@@ -396,10 +398,22 @@
private String normalizeHostName(String host) throws MalformedURLException {
- // 1. lowercase host name
+ // 1. unescape percent-encoded characters in host name
+ if (host.indexOf('%') != -1) {
+ try {
+ // URLDecoder does form decoding ('+' becomes ' '): protect literal '+' as %2B first
+ host = URLDecoder.decode(host.replace("+", "%2B"), StandardCharsets.UTF_8.toString());
+ } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+ LOG.debug("Failed to convert percent-encoded host name {}: ", host, e);
+ throw (MalformedURLException) new MalformedURLException(
+ "Invalid percent-encoded host name " + host + ": " + e.getMessage()).initCause(e);
+ }
+ }
+
+ // 2. lowercase host name
host = host.toLowerCase(Locale.ROOT);
- // 2. if configured: convert between Unicode and ASCII forms
+ // 3. if configured: convert between Unicode and ASCII forms
// for Internationalized Domain Names (IDNs)
if (hostIDNtoASCII && !isAscii(host)) {
try {
@@ -418,7 +432,7 @@
host = IDN.toUnicode(host);
}
- // 3. optionally trim a trailing dot
+ // 4. optionally trim a trailing dot
if (hostTrimTrailingDot) {
if (host.endsWith(".")) {
host = host.substring(0, host.length()-1);
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 5c6f710..fd0aa18 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -116,6 +116,11 @@
normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
normalizeTest("http://Foo.Com/index.html", "http://foo.com/index.html");
+ // NUTCH-2824 unescape percent-encoded characters in host names
+ normalizeTest("https://example%2Ecom/", "https://example.com/");
+ normalizeTest("https://www.0251-sachverst%c3%a4ndiger.de/",
+ "https://www.0251-sachverst\u00e4ndiger.de/");
+
// check that port number is normalized
normalizeTest("http://foo.com:80/index.html", "http://foo.com/index.html");
normalizeTest("http://foo.com:81/", "http://foo.com:81/");
@@ -130,8 +135,8 @@
// check that references are removed
normalizeTest("http://foo.com/foo.html#ref", "http://foo.com/foo.html");
- // // check that encoding is normalized
- // normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
+ // check that encoding is normalized
+ normalizeTest("http://foo.com/%66oo.html", "http://foo.com/foo.html");
// check that unnecessary "../" are removed
normalizeTest("http://foo.com/..", "http://foo.com/");
@@ -226,6 +231,9 @@
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
norm.setConf(conf);
normalizeTest(norm, "https://нэб.рф/", "https://xn--90ax2c.xn--p1ai/");
+ // verify unescaping of percent-encoded characters in IDNs (NUTCH-2824)
+ normalizeTest(norm, "https://www.0251-sachverst%c3%a4ndiger.de/",
+ "https://www.xn--0251-sachverstndiger-ozb.de/");
conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toUnicode");
norm.setConf(conf);
normalizeTest(norm, "https://xn--90ax2c.xn--p1ai/", "https://нэб.рф/");
@@ -236,6 +244,23 @@
"https://www.example.org/");
}
+ /**
+ * Verifies that URLs with an invalid host name are rejected with a
+ * MalformedURLException instead of being normalized.
+ */
+ @Test
+ public void testInvalidURLs() throws Exception {
+ // "%2X" is not a valid percent-encoded sequence
+ normalizeTestAssertThrowsMalformedURLException("https://example%2Xcom/");
+ // "%FE" is not a valid UTF-8 sequence; this is only validated when the host
+ // is parsed as an Internationalized Domain Name, so IDN conversion is enabled
+ BasicURLNormalizer idnNormalizer = new BasicURLNormalizer();
+ conf = NutchConfiguration.create();
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+ idnNormalizer.setConf(conf);
+ normalizeTestAssertThrowsMalformedURLException(idnNormalizer, "https://abc%FEdef.org/");
+ }
+
private void normalizeTest(String weird, String normal) throws Exception {
normalizeTest(this.normalizer, weird, normal);
}
@@ -252,6 +277,23 @@
}
}
+ private void normalizeTestAssertThrowsMalformedURLException(String weird) throws Exception {
+ normalizeTestAssertThrowsMalformedURLException(normalizer, weird); // uses the default normalizer
+ }
+
+ /** Fails the test unless normalizing {@code weird} throws MalformedURLException. */
+ private void normalizeTestAssertThrowsMalformedURLException(
+ BasicURLNormalizer normalizer, String weird) throws Exception {
+ String result = null;
+ try {
+ result = normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT);
+ } catch (MalformedURLException expected) {
+ return; // the URL was rejected, which is the expected outcome
+ }
+ Assert.fail("Expected MalformedURLException was not thrown on " + weird
+ + " (normalized: " + result + ")");
+ }
+
public static void main(String[] args) throws Exception {
new TestBasicURLNormalizer().testNormalizer();
}