// blob: 93bd3362159d41c8a8e24aafe2d23adaccac64f1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.net.urlnormalizer.basic;
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Converts URLs to a normal form:
 * <ul>
 * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
 * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
 * <li>normalize <a href=
 * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
 * percent-encoding</a> in URL paths</li>
 * <li>lowercase the host name and, if configured, convert
 * Internationalized Domain Names between their Unicode and ASCII forms
 * (property {@value #NORM_HOST_IDN}) and trim a trailing dot (property
 * {@value #NORM_HOST_TRIM_TRAILING_DOT})</li>
 * </ul>
 * Host, port and dot-segment normalization is applied to <code>http</code>,
 * <code>https</code> and <code>ftp</code> URLs (dot-segment normalization also
 * to <code>file</code> URLs); percent-encoding normalization is applied to
 * every URL accepted by {@link URL#URL(String)}.
 */
public class BasicURLNormalizer implements URLNormalizer {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Configuration property selecting the IDN conversion of host names:
   * <code>toAscii</code> or <code>toUnicode</code> (case-insensitive), any
   * other value disables the conversion.
   */
  public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";

  /**
   * Boolean configuration property: if true a trailing dot is removed from
   * the host name.
   */
  public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";

  /**
   * Pattern to detect whether a URL path could be normalized. Contains one of
   * /. or ./ /.. or ../ //
   */
  private final static Pattern hasNormalizablePathPattern = Pattern
      .compile("/[./]|[.]/");

  /**
   * Nutch 1098 - finds URL encoded parts of the URL
   */
  private final static Pattern unescapeRulePattern = Pattern
      .compile("%([0-9A-Fa-f]{2})");

  // charset used for encoding URLs before escaping
  private final static Charset utf8 = StandardCharsets.UTF_8;

  /** look-up table for characters which should not be escaped in URL paths */
  private final static boolean[] unescapedCharacters = new boolean[128];

  static {
    for (int c = 0; c < 128; c++) {
      /* https://tools.ietf.org/html/rfc3986#section-2.2
       * For consistency, percent-encoded octets in the ranges of ALPHA
       * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
       * underscore (%5F), or tilde (%7E) should not be created by URI
       * producers and, when found in a URI, should be decoded to their
       * corresponding unreserved characters by URI normalizers.
       */
      unescapedCharacters[c] = isAlphaNumeric(c)
          || c == 0x2D || c == 0x2E
          || c == 0x5F || c == 0x7E;
    }
  }

  /**
   * look-up table for characters which must always be percent-encoded in URL
   * paths. Characters flagged in neither this table nor
   * {@link #unescapedCharacters} are left untouched.
   */
  private final static boolean[] escapedCharacters = new boolean[128];

  static {
    for (int c = 0; c < 128; c++) {
      if (unescapedCharacters[c]) {
        escapedCharacters[c] = false;
      } else if (c < 0x21 // control character or space
          || c == 0x22 // "
          || c == 0x3C // <
          || c == 0x3E // >
          || c == 0x5B // [
          || c == 0x5D // ]
          || c == 0x5E // ^
          || c == 0x60 // `
          || c == 0x7B // {
          || c == 0x7C // |
          || c == 0x7D // }
          || c == 0x7F // DEL
      ) {
        escapedCharacters[c] = true;
      } else {
        // neither escaped nor unescaped: table entry keeps its default (false)
        if (LOG.isDebugEnabled()) {
          LOG.debug("Character {} ({}) not handled as escaped or unescaped", c,
              (char) c);
        }
      }
    }
  }

  /** @return true if <code>c</code> is an ASCII letter or digit */
  private static boolean isAlphaNumeric(int c) {
    return (0x41 <= c && c <= 0x5A)
        || (0x61 <= c && c <= 0x7A)
        || (0x30 <= c && c <= 0x39);
  }

  /** @return true if <code>c</code> is one of 0-9, A-F or a-f */
  private static boolean isHexCharacter(int c) {
    return (0x41 <= c && c <= 0x46)
        || (0x61 <= c && c <= 0x66)
        || (0x30 <= c && c <= 0x39);
  }

  /** @return true if <code>str</code> contains only characters below 128 */
  private static boolean isAscii(String str) {
    for (int i = 0; i < str.length(); i++) {
      if (str.charAt(i) > 127) {
        return false;
      }
    }
    return true;
  }

  private Configuration conf;
  private boolean hostIDNtoASCII;
  private boolean hostASCIItoIDN;
  private boolean hostTrimTrailingDot;

  /**
   * No-op method. Despite its name this is <b>not</b> a constructor: it
   * declares a return type and its name differs in case from the class name.
   * The class is instantiated via the implicit default constructor. Kept only
   * so that existing callers continue to compile.
   *
   * @deprecated does nothing, do not call
   */
  @Deprecated
  public void BasicUrlNormalizer() {
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and reads the host normalization settings
   * ({@link #NORM_HOST_IDN}, {@link #NORM_HOST_TRIM_TRAILING_DOT}) from it.
   * All settings are re-evaluated on every call, so the normalizer can be
   * reconfigured without carrying over flags from a previous configuration.
   *
   * @param conf
   *          configuration to read the properties from
   */
  @Override
  public void setConf(Configuration conf) {
    this.conf = conf;
    String normIdn = conf.get(NORM_HOST_IDN, "");
    // assign both flags unconditionally to reset values of an earlier call
    hostIDNtoASCII = normIdn.equalsIgnoreCase("toAscii");
    hostASCIItoIDN = normIdn.equalsIgnoreCase("toUnicode");
    hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
  }

  /**
   * Normalizes a URL. The input string is returned as is (apart from trimmed
   * whitespace) if no normalization rule applies, otherwise the URL is
   * recomposed from protocol, host, port and file (path and query). For
   * <code>http</code>, <code>https</code> and <code>ftp</code> URLs this drops
   * the fragment and any other part of the authority besides host and port.
   *
   * @param urlString
   *          URL to normalize, the empty string is passed through
   * @param scope
   *          normalization scope, not used by this normalizer
   * @return the normalized URL
   * @throws MalformedURLException
   *           if the URL cannot be parsed or the host name cannot be
   *           converted to its ASCII form
   */
  @Override
  public String normalize(String urlString, String scope)
      throws MalformedURLException {

    if ("".equals(urlString)) { // permit empty
      return urlString;
    }

    urlString = urlString.trim(); // remove extra spaces

    URL url = new URL(urlString);

    String protocol = url.getProtocol();
    String host = url.getHost();
    int port = url.getPort();
    String file = url.getFile();

    boolean changed = false;
    boolean normalizePath = false;

    if (!urlString.startsWith(protocol)) { // protocol was lowercased
      changed = true;
    }

    if ("http".equals(protocol) || "https".equals(protocol)
        || "ftp".equals(protocol)) {

      if (host != null && url.getAuthority() != null) {
        String newHost = normalizeHostName(host);
        if (!host.equals(newHost)) {
          host = newHost;
          changed = true;
        } else if (!url.getAuthority().equals(newHost)) {
          // authority (http://<...>/) contains other elements (port, user,
          // etc.) which will likely cause a change if left away
          changed = true;
        }
      } else {
        // no host or authority: recompose the URL from components
        changed = true;
      }

      if (port == url.getDefaultPort()) { // uses default port
        port = -1; // so don't specify it
        changed = true;
      }

      normalizePath = true;
      if (file == null || "".equals(file)) {
        file = "/";
        changed = true;
        normalizePath = false; // no further path normalization required
      } else if (!file.startsWith("/")) {
        file = "/" + file;
        changed = true;
        normalizePath = false; // no further path normalization required
      }

      if (url.getRef() != null) { // remove the ref
        changed = true;
      }

    } else if (protocol.equals("file")) {
      normalizePath = true;
    }

    // properly encode characters in path/file using percent-encoding
    String file2 = unescapePath(file);
    file2 = escapePath(file2);
    if (!file.equals(file2)) {
      changed = true;
      file = file2;
    }

    if (normalizePath) {
      // check for unnecessary use of "/../", "/./", and "//"
      if (changed) {
        // path normalization must see the components modified so far
        url = new URL(protocol, host, port, file);
      }
      file2 = getFileWithNormalizedPath(url);
      if (!file.equals(file2)) {
        changed = true;
        file = file2;
      }
    }

    if (changed) {
      url = new URL(protocol, host, port, file);
      urlString = url.toString();
    }

    return urlString;
  }

  /**
   * Returns the file part (path and query) of the URL with dot segments
   * (<code>/./</code>, <code>/../</code>) resolved. The result is never empty
   * and always starts with a slash.
   *
   * @param url
   *          URL to take the file part from
   * @return file part with normalized path
   * @throws MalformedURLException
   *           if the normalized URI cannot be converted back into a URL
   */
  private String getFileWithNormalizedPath(URL url)
      throws MalformedURLException {
    String file;

    if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
      // only normalize the path if there is something to normalize
      // to avoid needless work
      try {
        file = url.toURI().normalize().toURL().getFile();
        // URI.normalize() does not normalize leading dot segments,
        // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
        int start = 0;
        // strip every leading "/.." which is a complete segment, i.e. is
        // followed by a slash or the end of the string. The look-ahead must
        // be relative to the current offset, otherwise a path such as
        // "/../..foo" would lose its (regular) segment "..foo".
        while (file.startsWith("/..", start)
            && ((start + 3) == file.length()
                || file.charAt(start + 3) == '/')) {
          start += 3;
        }
        if (start > 0) {
          file = file.substring(start);
        }
      } catch (URISyntaxException e) {
        // not a valid URI: keep the file part unchanged
        file = url.getFile();
      }
    } else {
      file = url.getFile();
    }

    // if path is empty return a single slash
    if (file.isEmpty()) {
      file = "/";
    } else if (!file.startsWith("/")) {
      file = "/" + file;
    }

    return file;
  }

  /**
   * Remove % encoding from path segment in URL for characters which should be
   * unescaped according to <a
   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
   * Percent-encoded sequences which are kept are converted to upper case.
   *
   * @param path
   *          path (and query) of a URL
   * @return path with unreserved characters decoded
   */
  private String unescapePath(String path) {
    StringBuilder sb = new StringBuilder(path.length());

    Matcher matcher = unescapeRulePattern.matcher(path);

    // offset of the first character not yet copied to the output
    int copiedUpTo = 0;

    // Traverse over all encoded groups
    while (matcher.find()) {
      // Append everything up to this group
      sb.append(path, copiedUpTo, matcher.start());

      // Get the integer representation of this hexadecimal encoded character
      int letter = Integer.parseInt(matcher.group(1), 16);
      if (letter < 128 && unescapedCharacters[letter]) {
        // character should be unescaped in URLs
        sb.append((char) letter);
      } else {
        // Append the encoded character as uppercase
        sb.append(matcher.group().toUpperCase(Locale.ROOT));
      }

      copiedUpTo = matcher.end();
    }

    // Append the rest if there's anything
    sb.append(path, copiedUpTo, path.length());

    return sb.toString();
  }

  /**
   * Convert path segment of URL from Unicode to UTF-8 and escape all
   * characters which should be escaped according to <a
   * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
   * Existing valid percent-encoded sequences are kept, a percent sign not
   * followed by two hexadecimal digits is encoded as <code>%25</code>.
   *
   * @param path
   *          path (and query) of a URL
   * @return path with percent-encoding applied
   */
  private String escapePath(String path) {
    StringBuilder sb = new StringBuilder(path.length());

    // Traverse over all bytes in this URL
    byte[] bytes = path.getBytes(utf8);
    for (int i = 0; i < bytes.length; i++) {
      byte b = bytes[i];
      // negative bytes are part of a multi-byte UTF-8 sequence (non-ASCII)
      if (b < 0 || escapedCharacters[b]) {
        // Start escape sequence
        sb.append('%');

        // Get this byte's hexadecimal representation
        String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);

        // Do we need to prepend a zero?
        if (hex.length() < 2) {
          sb.append('0');
        }
        sb.append(hex);
      } else if (b == 0x25) {
        // percent sign (%): read-ahead to check whether a valid escape
        // sequence follows (needs two more characters, both hexadecimal)
        boolean validEscape = (i + 2) < bytes.length
            && isHexCharacter(bytes[i + 1]) && isHexCharacter(bytes[i + 2]);
        if (validEscape) {
          // valid percent encoding, output and fast-forward
          sb.append((char) b);
          sb.append((char) bytes[i + 1]);
          sb.append((char) bytes[i + 2]);
          i += 2;
        } else {
          sb.append("%25");
        }
      } else {
        // No, just append this character as-is
        sb.append((char) b);
      }
    }

    return sb.toString();
  }

  /**
   * Normalizes a host name: lowercases it, converts between the Unicode and
   * ASCII form of Internationalized Domain Names and trims a trailing dot,
   * the latter two only if configured (see {@link #setConf(Configuration)}).
   *
   * @param host
   *          host name to normalize
   * @return normalized host name
   * @throws MalformedURLException
   *           if the host name cannot be converted to its ASCII form
   */
  private String normalizeHostName(String host) throws MalformedURLException {

    // 1. lowercase host name
    host = host.toLowerCase(Locale.ROOT);

    // 2. if configured: convert between Unicode and ASCII forms
    // for Internationalized Domain Names (IDNs)
    if (hostIDNtoASCII && !isAscii(host)) {
      try {
        host = IDN.toASCII(host);
      } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
        // IllegalArgumentException: thrown if the input string contains
        // non-convertible Unicode codepoints
        // IndexOutOfBoundsException: thrown (undocumented) if one "label"
        // (non-ASCII dot-separated segment) is longer than 256 characters,
        // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
        LOG.debug("Failed to convert IDN host {}: ", host, e);
        throw (MalformedURLException) new MalformedURLException(
            "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
      }
    } else if (hostASCIItoIDN && host.contains("xn--")) {
      host = IDN.toUnicode(host);
    }

    // 3. optionally trim a trailing dot
    if (hostTrimTrailingDot && host.endsWith(".")) {
      host = host.substring(0, host.length() - 1);
    }

    return host;
  }

  /**
   * Reads URLs line by line from standard input (UTF-8) and prints the
   * normalized URL, or <code>failed: &lt;input&gt;</code> if the URL is
   * malformed, to standard output.
   *
   * @param args
   *          optional first argument: the normalization scope
   * @throws IOException
   *           if reading from standard input fails
   */
  public static void main(String args[]) throws IOException {
    BasicURLNormalizer normalizer = new BasicURLNormalizer();
    normalizer.setConf(NutchConfiguration.create());
    String scope = URLNormalizers.SCOPE_DEFAULT;
    if (args.length >= 1) {
      scope = args[0];
      System.out.println("Scope: " + scope);
    }
    try (BufferedReader in = new BufferedReader(
        new InputStreamReader(System.in, utf8))) {
      String line;
      while ((line = in.readLine()) != null) {
        try {
          System.out.println(normalizer.normalize(line, scope));
        } catch (MalformedURLException e) {
          System.out.println("failed: " + line);
        }
      }
    }
    System.exit(0);
  }
}