blob: e500f5aa1f0321576f2cb4fcb8fd3bd57df57003 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.util;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.nutch.util.domain.DomainSuffix;
import org.apache.nutch.util.domain.DomainSuffixes;
/**
 * Utility class for URL analysis: resolving relative links, extracting the
 * domain name / domain suffix / host segments of a URL, choosing the
 * representative URL of a redirect, and converting host names between their
 * Unicode and ASCII (punycode) forms.
 */
public class URLUtil {

  /**
   * Matches a dotted-quad host such as <code>192.168.0.1</code>. Only the
   * shape is checked (four groups of one to three digits); the numeric range
   * of each group is not validated.
   */
  private static final Pattern IP_PATTERN = Pattern
      .compile("(\\d{1,3}\\.){3}(\\d{1,3})");

  /**
   * Resolve relative URL-s and fix a java.net.URL error in handling of URLs
   * with pure query targets.
   *
   * @param base
   *          base url
   * @param target
   *          target url (may be relative); leading and trailing whitespace is
   *          trimmed before resolution
   * @return resolved absolute url.
   * @throws MalformedURLException
   *           if the combination of base and target cannot be parsed
   */
  public static URL resolveURL(URL base, String target)
      throws MalformedURLException {
    target = target.trim();

    // Handle the case that the target is a pure query. For example, the page
    // http://careers3.accenture.com/Careers/ASPX/Search.aspx?co=0&sk=0
    // contains links of the form href="?co=0&sk=0&pg=1". By default URL
    // constructs the base+target combo as
    // http://careers3.accenture.com/Careers/ASPX/?co=0&sk=0&pg=1, incorrectly
    // dropping the Search.aspx part. Browsers handle these just fine, so we
    // special-case them here.
    if (target.startsWith("?")) {
      return fixPureQueryTargets(base, target);
    }

    return new URL(base, target);
  }

  /**
   * Handle the case in RFC3986 section 5.4.1 example 7, and similar: a target
   * consisting only of a query (<code>?...</code>) is prefixed with the last
   * path segment of the base url before being resolved, so that the segment
   * is not dropped. Targets that do not start with <code>?</code> are resolved
   * unchanged.
   *
   * @param base
   *          base url
   * @param target
   *          target url, already trimmed
   * @return resolved absolute url
   * @throws MalformedURLException
   *           if the combination of base and target cannot be parsed
   */
  static URL fixPureQueryTargets(URL base, String target)
      throws MalformedURLException {
    if (!target.startsWith("?")) {
      return new URL(base, target);
    }

    // everything after the last '/' of the base path, or "" if there is none
    String basePath = base.getPath();
    int lastSlash = basePath.lastIndexOf('/');
    String baseRightMost = (lastSlash != -1) ? basePath.substring(lastSlash + 1)
        : "";

    return new URL(base, baseRightMost + target);
  }

  /**
   * Removes a single trailing dot (the DNS root label) from a host name, e.g.
   * <code>apache.org.</code> becomes <code>apache.org</code>.
   */
  private static String stripTrailingDot(String host) {
    return host.endsWith(".") ? host.substring(0, host.length() - 1) : host;
  }

  /**
   * Returns the domain name of the url. The domain name of a url is the
   * substring of the url's hostname, w/o subdomain names. As an example <br>
   * <code>
   *  getDomainName(new URL("http://lucene.apache.org/"))
   *  </code><br>
   * will return <br>
   * <code> apache.org</code>
   * <p>
   * A trailing dot of the host name is ignored. A host that looks like an
   * IPv4 address is returned unchanged. If no part of the host is a known
   * domain suffix, the last label of the host is returned.
   *
   * @param url
   *          the url to analyse
   * @return the domain name of the url's host
   */
  public static String getDomainName(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    // it seems that java returns hostnames ending with .
    String host = stripTrailingDot(url.getHost());
    if (IP_PATTERN.matcher(host).matches()) {
      return host;
    }

    // Drop leading labels one by one until what follows the first dot is a
    // known domain suffix. When no dot is left (index == -1) the remainder is
    // tested as a whole and the loop ends.
    String candidate = host;
    int index = 0;
    while (index >= 0) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index + 1);
      if (tlds.isDomainSuffix(subCandidate)) {
        return candidate;
      }
      candidate = subCandidate;
    }
    return candidate;
  }

  /**
   * Returns the domain name of the url. The domain name of a url is the
   * substring of the url's hostname, w/o subdomain names. As an example <br>
   * <code>
   *  getDomainName("http://lucene.apache.org/")
   *  </code><br>
   * will return <br>
   * <code> apache.org</code>
   *
   * @param url
   *          the url to analyse
   * @return the domain name of the url's host
   * @throws MalformedURLException
   *           if the url cannot be parsed
   */
  public static String getDomainName(String url) throws MalformedURLException {
    return getDomainName(new URL(url));
  }

  /**
   * Returns the top level domain name of the url. The top level domain name of
   * a url is the last dot-separated part of the domain suffix of the url's
   * hostname. As an example <br>
   * <code>
   *  getTopLevelDomainName(new URL("http://lucene.apache.org/"))
   *  </code><br>
   * will return <br>
   * <code> org</code>
   *
   * @param url
   *          the url to analyse
   * @return the top level domain name, or <code>null</code> if the host has no
   *         known domain suffix (for example an IPv4 address)
   * @throws MalformedURLException
   *           declared for compatibility with existing callers
   */
  public static String getTopLevelDomainName(URL url)
      throws MalformedURLException {
    DomainSuffix domainSuffix = getDomainSuffix(url);
    if (domainSuffix == null) {
      // IP address or unknown suffix: there is no top level domain to report
      return null;
    }
    String suffix = domainSuffix.toString();
    int idx = suffix.lastIndexOf('.');
    return (idx != -1) ? suffix.substring(idx + 1) : suffix;
  }

  /**
   * Returns the top level domain name of the url. The top level domain name of
   * a url is the last dot-separated part of the domain suffix of the url's
   * hostname. As an example <br>
   * <code>
   *  getTopLevelDomainName("http://lucene.apache.org/")
   *  </code><br>
   * will return <br>
   * <code> org</code>
   *
   * @param url
   *          the url to analyse
   * @return the top level domain name, or <code>null</code> if the host has no
   *         known domain suffix (for example an IPv4 address)
   * @throws MalformedURLException
   *           if the url cannot be parsed
   */
  public static String getTopLevelDomainName(String url)
      throws MalformedURLException {
    return getTopLevelDomainName(new URL(url));
  }

  /**
   * Returns whether the given urls have the same domain name. As an example, <br>
   * <code> isSameDomainName(new URL("http://lucene.apache.org")
   * , new URL("http://people.apache.org/"))
   * <br> will return true. </code>
   * <p>
   * The comparison ignores case.
   *
   * @param url1
   *          first url
   * @param url2
   *          second url
   * @return true if the domain names are equal
   */
  public static boolean isSameDomainName(URL url1, URL url2) {
    return getDomainName(url1).equalsIgnoreCase(getDomainName(url2));
  }

  /**
   * Returns whether the given urls have the same domain name. As an example, <br>
   * <code> isSameDomainName("http://lucene.apache.org"
   * ,"http://people.apache.org/")
   * <br> will return true. </code>
   * <p>
   * The comparison ignores case.
   *
   * @param url1
   *          first url
   * @param url2
   *          second url
   * @return true if the domain names are equal
   * @throws MalformedURLException
   *           if either url cannot be parsed
   */
  public static boolean isSameDomainName(String url1, String url2)
      throws MalformedURLException {
    return isSameDomainName(new URL(url1), new URL(url2));
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the last public part of
   * the hostname.
   * <p>
   * A trailing dot of the host name is ignored, as in
   * {@link #getDomainName(URL)}.
   *
   * @param url
   *          the url to analyse
   * @return the matching domain suffix, or <code>null</code> if the host looks
   *         like an IPv4 address or no part of it is a known suffix
   */
  public static DomainSuffix getDomainSuffix(URL url) {
    DomainSuffixes tlds = DomainSuffixes.getInstance();
    String host = stripTrailingDot(url.getHost());
    if (IP_PATTERN.matcher(host).matches()) {
      return null;
    }

    // Same walk as getDomainName: drop leading labels until the remainder is
    // a registered suffix; index == -1 marks the final iteration.
    String candidate = host;
    int index = 0;
    while (index >= 0) {
      index = candidate.indexOf('.');
      String subCandidate = candidate.substring(index + 1);
      DomainSuffix d = tlds.get(subCandidate);
      if (d != null) {
        return d;
      }
      candidate = subCandidate;
    }
    return null;
  }

  /**
   * Returns the {@link DomainSuffix} corresponding to the last public part of
   * the hostname.
   *
   * @param url
   *          the url to analyse
   * @return the matching domain suffix, or <code>null</code> if the host looks
   *         like an IPv4 address or no part of it is a known suffix
   * @throws MalformedURLException
   *           if the url cannot be parsed
   */
  public static DomainSuffix getDomainSuffix(String url)
      throws MalformedURLException {
    return getDomainSuffix(new URL(url));
  }

  /**
   * Partitions of the hostname of the url by ".". A host that looks like an
   * IPv4 address is returned whole, as a single-element array.
   *
   * @param url
   *          the url to analyse
   * @return the dot-separated segments of the host name
   */
  public static String[] getHostSegments(URL url) {
    String host = url.getHost();
    // return whole hostname, if it is an ipv4
    // TODO : handle ipv6
    if (IP_PATTERN.matcher(host).matches()) {
      return new String[] { host };
    }
    return host.split("\\.");
  }

  /**
   * Partitions of the hostname of the url by ".". A host that looks like an
   * IPv4 address is returned whole, as a single-element array.
   *
   * @param url
   *          the url to analyse
   * @return the dot-separated segments of the host name
   * @throws MalformedURLException
   *           if the url cannot be parsed
   */
  public static String[] getHostSegments(String url)
      throws MalformedURLException {
    return getHostSegments(new URL(url));
  }

  /**
   * Given two urls, a src and a destination of a redirect, it returns the
   * representative url.
   * <p>
   * This method implements an extended version of the algorithm used by the
   * Yahoo! Slurp crawler described here:<br>
   * <a href="http://help.yahoo.com/l/nz/yahooxtra/search/webcrawler/slurp-11.html"> How
   * does the Yahoo! webcrawler handle redirects?</a> <br>
   * <br>
   * In the examples below the url that is kept is marked with a *.
   * <ul>
   * <li>Choose target url if either url is malformed.</li>
   * <li>If different domains then keep the destination whether or not the
   * redirect is temp or perm</li>
   * <li>a.com -&gt; b.com*</li>
   * <li>If the redirect is permanent and the source is root, keep the source.</li>
   * <li>*a.com -&gt; a.com?y=1 || *a.com -&gt; a.com/xyz/index.html</li>
   * <li>If the redirect is permanent and the source is not root and the
   * destination is root, keep the destination</li>
   * <li>a.com/xyz/index.html -&gt; a.com*</li>
   * <li>If the redirect is permanent and neither the source nor the destination
   * is root, then keep the destination</li>
   * <li>a.com/xyz/index.html -&gt; a.com/abc/page.html*</li>
   * <li>If the redirect is temporary and source is root and destination is not
   * root, then keep the source</li>
   * <li>*a.com -&gt; a.com/xyz/index.html</li>
   * <li>If the redirect is temporary and source is not root and destination is
   * root, then keep the destination</li>
   * <li>a.com/xyz/index.html -&gt; a.com*</li>
   * <li>If the redirect is temporary and neither the source or the destination
   * is root, then keep the shortest url. If the hosts are equal the urls are
   * compared by path: first by the number of / separated path segments, and
   * if those are equal by the length of the path. If the hosts differ, the
   * host with fewer sub-domains wins. On a tie the source is kept.</li>
   * <li>*a.com/xyz/index.html -&gt; a.com/abc/page.html</li>
   * <li>*www.a.com/xyz/index.html -&gt; www.news.a.com/xyz/index.html</li>
   * <li>If the redirect is temporary and both the source and the destination
   * are root, then keep the shortest sub-domain</li>
   * <li>*www.a.com -&gt; www.news.a.com</li>
   * </ul>
   * <br>
   * Domain and host names are compared ignoring case. <br>
   * <br>
   * While not in this logic there is a further piece of representative url
   * logic that occurs during indexing and after scoring. During creation of the
   * basic fields before indexing, if a url has a representative url stored we
   * check both the url and its representative url (which should never be the
   * same) against their linkrank scores and the highest scoring one is kept as
   * the url and the lower scoring one is held as the orig url inside of the
   * index.
   *
   * @param src
   *          The source url.
   * @param dst
   *          The destination url.
   * @param temp
   *          Is the redirect a temporary redirect.
   *
   * @return String The representative url.
   */
  public static String chooseRepr(String src, String dst, boolean temp) {

    // validate both are well formed urls
    URL srcUrl;
    URL dstUrl;
    try {
      srcUrl = new URL(src);
      dstUrl = new URL(dst);
    } catch (MalformedURLException e) {
      return dst;
    }

    // get the source and destination domain, host, and page
    String srcDomain = URLUtil.getDomainName(srcUrl);
    String dstDomain = URLUtil.getDomainName(dstUrl);
    String srcHost = srcUrl.getHost();
    String dstHost = dstUrl.getHost();
    String srcFile = srcUrl.getFile();
    String dstFile = dstUrl.getFile();

    // are the source and destination the root path url.com/ or url.com
    boolean srcRoot = (srcFile.equals("/") || srcFile.length() == 0);
    boolean destRoot = (dstFile.equals("/") || dstFile.length() == 0);

    // if we are dealing with a redirect from one domain to another keep the
    // destination; host names are case-insensitive, so compare accordingly
    if (!srcDomain.equalsIgnoreCase(dstDomain)) {
      return dst;
    }

    // permanent redirect: if source is root return source, otherwise
    // destination
    if (!temp) {
      return srcRoot ? src : dst;
    }

    // temporary redirect from here on
    if (srcRoot && !destRoot) {
      // source root and destination not root
      return src;
    }
    if (!srcRoot && destRoot) {
      // destination root and source not
      return dst;
    }
    if (!srcRoot && !destRoot && srcHost.equalsIgnoreCase(dstHost)) {
      // same host and neither is root: fewer path segments wins, then the
      // shorter path; the destination is only chosen if strictly shorter
      int numSrcPaths = srcFile.split("/").length;
      int numDstPaths = dstFile.split("/").length;
      if (numSrcPaths != numDstPaths) {
        return (numDstPaths < numSrcPaths ? dst : src);
      }
      return (dstFile.length() < srcFile.length() ? dst : src);
    }

    // both root, or neither root with different hosts: the host with fewer
    // dot-separated parts wins, the source on a tie
    int numSrcSubs = srcHost.split("\\.").length;
    int numDstSubs = dstHost.split("\\.").length;
    return (numDstSubs < numSrcSubs ? dst : src);
  }

  /**
   * Returns the lowercased hostname for the url or null if the url is not well
   * formed. Lowercasing uses {@link Locale#ROOT} so the result does not depend
   * on the default locale of the JVM.
   *
   * @param url
   *          The url to check.
   * @return String The hostname for the url.
   */
  public static String getHost(String url) {
    try {
      return new URL(url).getHost().toLowerCase(Locale.ROOT);
    } catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the page for the url, i.e. the url without its query string, or
   * null if the url is not well formed.
   * <p>
   * Note that the complete url string is lowercased (using
   * {@link Locale#ROOT}) before the query string is removed, so the path of
   * the returned page is lowercased as well as the protocol and host.
   *
   * @param url
   *          The url to check.
   * @return String The page for the url.
   */
  public static String getPage(String url) {
    try {
      // get the full url, and replace the query string with an empty string
      url = url.toLowerCase(Locale.ROOT);
      String queryStr = new URL(url).getQuery();
      return (queryStr != null) ? url.replace("?" + queryStr, "") : url;
    } catch (MalformedURLException e) {
      return null;
    }
  }

  /**
   * Returns the protocol (scheme) of the url, or null if the url cannot be
   * parsed.
   *
   * @param url
   *          The url to check.
   * @return String The protocol of the url.
   */
  public static String getProtocol(String url) {
    try {
      return getProtocol(new URL(url));
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * Returns the protocol (scheme) of the url.
   *
   * @param url
   *          The url to check.
   * @return String The protocol of the url.
   */
  public static String getProtocol(URL url) {
    return url.getProtocol();
  }

  /**
   * Returns the url with its host name converted to ASCII (punycode) using
   * {@link IDN#toASCII(String)}. Urls without a host name are returned
   * unchanged.
   * <p>
   * NOTE(review): the url is re-assembled with the multi-argument
   * {@link URI} constructor, which quotes illegal characters and, according
   * to the java.net.URI documentation, always quotes the percent character.
   * A path or query that is already percent-encoded therefore comes back
   * with its escapes encoded a second time (<code>%20</code> becomes
   * <code>%2520</code>). Behaviour is left as is because callers may rely on
   * the quoting of illegal characters.
   *
   * @param url
   *          The url to convert.
   * @return String The converted url, or null if the url cannot be parsed or
   *         converted.
   */
  public static String toASCII(String url) {
    try {
      URL u = new URL(url);
      String host = u.getHost();
      if (host == null || host.isEmpty()) {
        // no host name => no punycoded domain name
        // also do not add additional slashes for file: URLs (NUTCH-1880)
        return url;
      }
      URI p = new URI(u.getProtocol(), u.getUserInfo(), IDN.toASCII(host),
          u.getPort(), u.getPath(), u.getQuery(), u.getRef());

      return p.toString();
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * Returns the url with its host name converted from ASCII (punycode) to
   * Unicode using {@link IDN#toUnicode(String)}. All other parts of the url
   * are copied as they are. Urls without a host name are returned unchanged.
   *
   * @param url
   *          The url to convert.
   * @return String The converted url, or null if the url cannot be parsed or
   *         converted.
   */
  public static String toUNICODE(String url) {
    try {
      URL u = new URL(url);
      String host = u.getHost();
      if (host == null || host.isEmpty()) {
        // no host name => no punycoded domain name
        // also do not add additional slashes for file: URLs (NUTCH-1880)
        return url;
      }
      StringBuilder sb = new StringBuilder();
      sb.append(u.getProtocol());
      sb.append("://");
      if (u.getUserInfo() != null) {
        sb.append(u.getUserInfo());
        sb.append('@');
      }
      sb.append(IDN.toUnicode(host));
      if (u.getPort() != -1) {
        sb.append(':');
        sb.append(u.getPort());
      }
      sb.append(u.getFile()); // includes query
      if (u.getRef() != null) {
        sb.append('#');
        sb.append(u.getRef());
      }

      return sb.toString();
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * For testing: prints the domain name of the url given as the only
   * argument.
   */
  public static void main(String[] args) {

    if (args.length != 1) {
      System.err.println("Usage : URLUtil <url>");
      return;
    }

    String url = args[0];
    try {
      System.out.println(URLUtil.getDomainName(new URL(url)));
    } catch (MalformedURLException ex) {
      ex.printStackTrace();
    }
  }
}