NUTCH-2824 urlnormalizer-basic to unescape percent-encoded host names
- add unit tests to verify that a declared MalformedURLException is thrown
for host names containing illegal percent-encoded sequences and
that any (undeclared) runtime exceptions are caught and rethrown
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index 05feb6d..fd0aa18 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -244,6 +244,23 @@
"https://www.example.org/");
}
+ /**
+ * Test that the normalizer throws a MalformedURLException for URLs with invalid host names
+ */
+ @Test
+ public void testInvalidURLs() throws Exception {
+ // "%2X" in the host name is not a valid percent-encoded sequence ('X' is not a hex digit)
+ normalizeTestAssertThrowsMalformedURLException("https://example%2Xcom/");
+ // "%FE" decodes to the byte 0xFE, which is not a valid UTF-8 sequence in a host name
+ // (only validated if parsed as Internationalized Domain Name, see the IDN option set below)
+ BasicURLNormalizer norm = new BasicURLNormalizer();
+ conf = NutchConfiguration.create(); // NOTE(review): not declared here; presumably a field of the test class
+ conf.set(BasicURLNormalizer.NORM_HOST_IDN, "toAscii");
+ norm.setConf(conf);
+ normalizeTestAssertThrowsMalformedURLException(norm,
+ "https://abc%FEdef.org/");
+ }
+
private void normalizeTest(String weird, String normal) throws Exception {
normalizeTest(this.normalizer, weird, normal);
}
@@ -260,6 +277,23 @@
}
}
+ private void normalizeTestAssertThrowsMalformedURLException(String weird) throws Exception {
+ normalizeTestAssertThrowsMalformedURLException(this.normalizer, weird); // delegate using the normalizer held in this.normalizer
+ }
+
+ private void normalizeTestAssertThrowsMalformedURLException(
+ BasicURLNormalizer normalizer, String weird) throws Exception {
+ String normalized = null; // only used in the failure message below
+ try {
+ normalized = normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT);
+ } catch (MalformedURLException e) {
+ // ok, expected: returning here skips the Assert.fail() below
+ return;
+ }
+ Assert.fail("Expected MalformedURLException was not thrown on " + weird
+ + " (normalized: " + normalized + ")");
+ }
+
public static void main(String[] args) throws Exception {
new TestBasicURLNormalizer().testNormalizer();
}