blob: eaaf7d0c776241dac3bb12bd153e908b54558b42 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.net.URL;
import org.junit.Assert;
import org.junit.Test;
/** Test class for URLUtil */
public class TestURLUtil {
@Test
public void testGetDomainName() throws Exception {
URL url = null;
url = new URL("http://lucene.apache.org/nutch");
Assert.assertEquals("apache.org", URLUtil.getDomainName(url));
url = new URL("http://en.wikipedia.org/wiki/Java_coffee");
Assert.assertEquals("wikipedia.org", URLUtil.getDomainName(url));
url = new URL("http://140.211.11.130/foundation/contributing.html");
Assert.assertEquals("140.211.11.130", URLUtil.getDomainName(url));
url = new URL("http://www.example.co.uk:8080/index.html");
Assert.assertEquals("example.co.uk", URLUtil.getDomainName(url));
url = new URL("http://com");
Assert.assertEquals("com", URLUtil.getDomainName(url));
url = new URL("http://www.example.co.uk.com");
Assert.assertEquals("uk.com", URLUtil.getDomainName(url));
// "nn" is not a tld
url = new URL("http://example.com.nn");
Assert.assertEquals("nn", URLUtil.getDomainName(url));
url = new URL("http://");
Assert.assertEquals("", URLUtil.getDomainName(url));
url = new URL("http://www.edu.tr.xyz");
Assert.assertEquals("xyz", URLUtil.getDomainName(url));
url = new URL("http://www.example.c.se");
Assert.assertEquals("example.c.se", URLUtil.getDomainName(url));
// plc.co.im is listed as a domain suffix
url = new URL("http://www.example.plc.co.im");
Assert.assertEquals("example.plc.co.im", URLUtil.getDomainName(url));
// 2000.hu is listed as a domain suffix
url = new URL("http://www.example.2000.hu");
Assert.assertEquals("example.2000.hu", URLUtil.getDomainName(url));
// test non-ascii
url = new URL("http://www.example.商業.tw");
Assert.assertEquals("example.商業.tw", URLUtil.getDomainName(url));
}
@Test
public void testGetDomainSuffix() throws Exception {
URL url = null;
url = new URL("http://lucene.apache.org/nutch");
Assert.assertEquals("org", URLUtil.getDomainSuffix(url).getDomain());
url = new URL("http://140.211.11.130/foundation/contributing.html");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://www.example.co.uk:8080/index.html");
Assert.assertEquals("co.uk", URLUtil.getDomainSuffix(url).getDomain());
url = new URL("http://com");
Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
url = new URL("http://www.example.co.uk.com");
Assert.assertEquals("com", URLUtil.getDomainSuffix(url).getDomain());
// "nn" is not a tld
url = new URL("http://example.com.nn");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://www.edu.tr.xyz");
Assert.assertNull(URLUtil.getDomainSuffix(url));
url = new URL("http://subdomain.example.edu.tr");
Assert.assertEquals("edu.tr", URLUtil.getDomainSuffix(url).getDomain());
url = new URL("http://subdomain.example.presse.fr");
Assert.assertEquals("presse.fr", URLUtil.getDomainSuffix(url).getDomain());
url = new URL("http://subdomain.example.presse.tr");
Assert.assertEquals("tr", URLUtil.getDomainSuffix(url).getDomain());
// plc.co.im is listed as a domain suffix
url = new URL("http://www.example.plc.co.im");
Assert.assertEquals("plc.co.im", URLUtil.getDomainSuffix(url).getDomain());
// 2000.hu is listed as a domain suffix
url = new URL("http://www.example.2000.hu");
Assert.assertEquals("2000.hu", URLUtil.getDomainSuffix(url).getDomain());
// test non-ascii
url = new URL("http://www.example.商業.tw");
Assert.assertEquals("商業.tw", URLUtil.getDomainSuffix(url).getDomain());
}
@Test
public void testGetHostSegments() throws Exception {
URL url;
String[] segments;
url = new URL("http://subdomain.example.edu.tr");
segments = URLUtil.getHostSegments(url);
Assert.assertEquals("subdomain", segments[0]);
Assert.assertEquals("example", segments[1]);
Assert.assertEquals("edu", segments[2]);
Assert.assertEquals("tr", segments[3]);
url = new URL("http://");
segments = URLUtil.getHostSegments(url);
Assert.assertEquals(1, segments.length);
Assert.assertEquals("", segments[0]);
url = new URL("http://140.211.11.130/foundation/contributing.html");
segments = URLUtil.getHostSegments(url);
Assert.assertEquals(1, segments.length);
Assert.assertEquals("140.211.11.130", segments[0]);
// test non-ascii
url = new URL("http://www.example.商業.tw");
segments = URLUtil.getHostSegments(url);
Assert.assertEquals("www", segments[0]);
Assert.assertEquals("example", segments[1]);
Assert.assertEquals("商業", segments[2]);
Assert.assertEquals("tw", segments[3]);
}
@Test
public void testChooseRepr() throws Exception {
String aDotCom = "http://www.a.com";
String bDotCom = "http://www.b.com";
String aSubDotCom = "http://www.news.a.com";
String aQStr = "http://www.a.com?y=1";
String aPath = "http://www.a.com/xyz/index.html";
String aPath2 = "http://www.a.com/abc/page.html";
String aPath3 = "http://www.news.a.com/abc/page.html";
// 1) different domain them keep dest, temp or perm
// a.com -> b.com*
Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, true));
Assert.assertEquals(bDotCom, URLUtil.chooseRepr(aDotCom, bDotCom, false));
// 2) permanent and root, keep src
// *a.com -> a.com?y=1 || *a.com -> a.com/xyz/index.html
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aQStr, false));
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, false));
// 3) permanent and not root and dest root, keep dest
// a.com/xyz/index.html -> a.com*
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, false));
// 4) permanent and neither root keep dest
// a.com/xyz/index.html -> a.com/abc/page.html*
Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, false));
// 5) temp and root and dest not root keep src
// *a.com -> a.com/xyz/index.html
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aPath, true));
// 6) temp and not root and dest root keep dest
// a.com/xyz/index.html -> a.com*
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aPath, aDotCom, true));
// 7) temp and neither root, keep shortest, if hosts equal by path else by
// hosts
// a.com/xyz/index.html -> a.com/abc/page.html*
// *www.a.com/xyz/index.html -> www.news.a.com/xyz/index.html
Assert.assertEquals(aPath2, URLUtil.chooseRepr(aPath, aPath2, true));
Assert.assertEquals(aPath, URLUtil.chooseRepr(aPath, aPath3, true));
// 8) temp and both root keep shortest sub domain
// *www.a.com -> www.news.a.com
Assert.assertEquals(aDotCom, URLUtil.chooseRepr(aDotCom, aSubDotCom, true));
}
// from RFC3986 section 5.4.1
private static String baseString = "http://a/b/c/d;p?q";
private static String[][] targets = new String[][] {
// unknown protocol {"g:h" , "g:h"},
{ "g", "http://a/b/c/g" }, { "./g", "http://a/b/c/g" },
{ "g/", "http://a/b/c/g/" }, { "/g", "http://a/g" },
{ "//g", "http://g" }, { "?y", "http://a/b/c/d;p?y" },
{ "g?y", "http://a/b/c/g?y" }, { "#s", "http://a/b/c/d;p?q#s" },
{ "g#s", "http://a/b/c/g#s" }, { "g?y#s", "http://a/b/c/g?y#s" },
{ ";x", "http://a/b/c/;x" }, { "g;x", "http://a/b/c/g;x" },
{ "g;x?y#s", "http://a/b/c/g;x?y#s" }, { "", "http://a/b/c/d;p?q" },
{ ".", "http://a/b/c/" }, { "./", "http://a/b/c/" },
{ "..", "http://a/b/" }, { "../", "http://a/b/" },
{ "../g", "http://a/b/g" }, { "../..", "http://a/" },
{ "../../", "http://a/" }, { "../../g", "http://a/g" } };
@Test
public void testResolveURL() throws Exception {
// test NUTCH-436
URL u436 = new URL("http://a/b/c/d;p?q#f");
Assert.assertEquals("http://a/b/c/d;p?q#f", u436.toString());
URL abs = URLUtil.resolveURL(u436, "?y");
Assert.assertEquals("http://a/b/c/d;p?y", abs.toString());
// test NUTCH-566
URL u566 = new URL("http://www.fleurie.org/entreprise.asp");
abs = URLUtil.resolveURL(u566, "?id_entrep=111");
Assert.assertEquals("http://www.fleurie.org/entreprise.asp?id_entrep=111",
abs.toString());
URL base = new URL(baseString);
Assert.assertEquals("base url parsing", baseString, base.toString());
for (int i = 0; i < targets.length; i++) {
URL u = URLUtil.resolveURL(base, targets[i][0]);
Assert.assertEquals(targets[i][1], targets[i][1], u.toString());
}
}
@Test
public void testToUNICODE() throws Exception {
Assert.assertEquals("http://www.çevir.com",
URLUtil.toUNICODE("http://www.xn--evir-zoa.com"));
Assert.assertEquals("http://uni-tübingen.de/",
URLUtil.toUNICODE("http://xn--uni-tbingen-xhb.de/"));
Assert
.assertEquals(
"http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1",
URLUtil
.toUNICODE("http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1"));
}
@Test
public void testToASCII() throws Exception {
Assert.assertEquals("http://www.xn--evir-zoa.com",
URLUtil.toASCII("http://www.çevir.com"));
Assert.assertEquals("http://xn--uni-tbingen-xhb.de/",
URLUtil.toASCII("http://uni-tübingen.de/"));
Assert
.assertEquals(
"http://www.medizin.xn--uni-tbingen-xhb.de:8080/search.php?q=abc#p1",
URLUtil
.toASCII("http://www.medizin.uni-tübingen.de:8080/search.php?q=abc#p1"));
}
@Test
public void testFileProtocol() throws Exception {
// keep one single slash NUTCH-XXX
Assert.assertEquals("file:/path/file.html",
URLUtil.toASCII("file:/path/file.html"));
Assert.assertEquals("file:/path/file.html",
URLUtil.toUNICODE("file:/path/file.html"));
}
}