| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.net.urlnormalizer.basic; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.InputStreamReader; |
| import java.net.IDN; |
| import java.net.MalformedURLException; |
| import java.net.URISyntaxException; |
| import java.net.URL; |
| import java.nio.charset.Charset; |
| import java.nio.charset.StandardCharsets; |
| import java.util.Locale; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.net.URLNormalizer; |
| import org.apache.nutch.net.URLNormalizers; |
| import org.apache.nutch.util.NutchConfiguration; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Converts URLs to a normal form: |
| * <ul> |
| * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li> |
| * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li> |
| * <li>normalize <a href= |
| * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI"> |
| * percent-encoding</a> in URL paths</li> |
| * </ul> |
| */ |
| public class BasicURLNormalizer implements URLNormalizer { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn"; |
| public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot"; |
| |
| /** |
| * Pattern to detect whether a URL path could be normalized. Contains one of |
| * /. or ./ /.. or ../ // |
| */ |
| private final static Pattern hasNormalizablePathPattern = Pattern |
| .compile("/[./]|[.]/"); |
| |
| /** |
| * Nutch 1098 - finds URL encoded parts of the URL |
| */ |
| private final static Pattern unescapeRulePattern = Pattern |
| .compile("%([0-9A-Fa-f]{2})"); |
| |
| // charset used for encoding URLs before escaping |
| private final static Charset utf8 = StandardCharsets.UTF_8; |
| |
| /** look-up table for characters which should not be escaped in URL paths */ |
| private final static boolean[] unescapedCharacters = new boolean[128]; |
| static { |
| for (int c = 0; c < 128; c++) { |
| /* https://tools.ietf.org/html/rfc3986#section-2.2 |
| * For consistency, percent-encoded octets in the ranges of ALPHA |
| * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), |
| * underscore (%5F), or tilde (%7E) should not be created by URI |
| * producers and, when found in a URI, should be decoded to their |
| * corresponding unreserved characters by URI normalizers. |
| */ |
| if (isAlphaNumeric(c) |
| || c == 0x2D || c == 0x2E |
| || c == 0x5F || c == 0x7E) { |
| unescapedCharacters[c] = true; |
| } else { |
| unescapedCharacters[c] = false; |
| } |
| } |
| } |
| |
| /** look-up table for characters which should not be escaped in URL paths */ |
| private final static boolean[] escapedCharacters = new boolean[128]; |
| static { |
| for (int c = 0; c < 128; c++) { |
| if (unescapedCharacters[c]) { |
| escapedCharacters[c] = false; |
| } else if (c < 0x21 // control character or space |
| || c == 0x22 // " |
| || c == 0x3C // < |
| || c == 0x3E // > |
| || c == 0x5B // [ |
| || c == 0x5D // ] |
| || c == 0x5E // ^ |
| || c == 0x60 // ` |
| || c == 0x7B // { |
| || c == 0x7C // | |
| || c == 0x7D // } |
| || c == 0x7F // DEL |
| ) { |
| escapedCharacters[c] = true; |
| } else { |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Character {} ({}) not handled as escaped or unescaped", c, |
| (char) c); |
| } |
| } |
| } |
| } |
| |
| private static boolean isAlphaNumeric(int c) { |
| return (0x41 <= c && c <= 0x5A) |
| || (0x61 <= c && c <= 0x7A) |
| || (0x30 <= c && c <= 0x39); |
| } |
| |
| private static boolean isHexCharacter(int c) { |
| return (0x41 <= c && c <= 0x46) |
| || (0x61 <= c && c <= 0x66) |
| || (0x30 <= c && c <= 0x39); |
| } |
| |
| private static boolean isAscii(String str) { |
| char[] chars = str.toCharArray(); |
| for (char c : chars) { |
| if (c > 127) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private Configuration conf; |
| |
| private boolean hostIDNtoASCII; |
| private boolean hostASCIItoIDN; |
| private boolean hostTrimTrailingDot; |
| |
| public void BasicUrlNormalizer() { |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return conf; |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| String normIdn = conf.get(NORM_HOST_IDN, ""); |
| if (normIdn.equalsIgnoreCase("toAscii")) { |
| hostIDNtoASCII = true; |
| } else if (normIdn.equalsIgnoreCase("toUnicode")) { |
| hostASCIItoIDN = true; |
| } |
| hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false); |
| } |
| |
| @Override |
| public String normalize(String urlString, String scope) |
| throws MalformedURLException { |
| |
| if ("".equals(urlString)) // permit empty |
| return urlString; |
| |
| urlString = urlString.trim(); // remove extra spaces |
| |
| URL url = new URL(urlString); |
| |
| String protocol = url.getProtocol(); |
| String host = url.getHost(); |
| int port = url.getPort(); |
| String file = url.getFile(); |
| |
| boolean changed = false; |
| boolean normalizePath = false; |
| |
| if (!urlString.startsWith(protocol)) // protocol was lowercased |
| changed = true; |
| |
| if ("http".equals(protocol) || "https".equals(protocol) |
| || "ftp".equals(protocol)) { |
| |
| if (host != null && url.getAuthority() != null) { |
| String newHost = normalizeHostName(host); |
| if (!host.equals(newHost)) { |
| host = newHost; |
| changed = true; |
| } else if (!url.getAuthority().equals(newHost)) { |
| // authority (http://<...>/) contains other elements (port, user, |
| // etc.) which will likely cause a change if left away |
| changed = true; |
| } |
| } else { |
| // no host or authority: recompose the URL from components |
| changed = true; |
| } |
| |
| if (port == url.getDefaultPort()) { // uses default port |
| port = -1; // so don't specify it |
| changed = true; |
| } |
| |
| normalizePath = true; |
| if (file == null || "".equals(file)) { |
| file = "/"; |
| changed = true; |
| normalizePath = false; // no further path normalization required |
| } else if (!file.startsWith("/")) { |
| file = "/" + file; |
| changed = true; |
| normalizePath = false; // no further path normalization required |
| } |
| |
| if (url.getRef() != null) { // remove the ref |
| changed = true; |
| } |
| |
| } else if (protocol.equals("file")) { |
| normalizePath = true; |
| } |
| |
| // properly encode characters in path/file using percent-encoding |
| String file2 = unescapePath(file); |
| file2 = escapePath(file2); |
| if (!file.equals(file2)) { |
| changed = true; |
| file = file2; |
| } |
| |
| if (normalizePath) { |
| // check for unnecessary use of "/../", "/./", and "//" |
| if (changed) { |
| url = new URL(protocol, host, port, file); |
| } |
| file2 = getFileWithNormalizedPath(url); |
| if (!file.equals(file2)) { |
| changed = true; |
| file = file2; |
| } |
| } |
| |
| if (changed) { |
| url = new URL(protocol, host, port, file); |
| urlString = url.toString(); |
| } |
| |
| return urlString; |
| } |
| |
| private String getFileWithNormalizedPath(URL url) |
| throws MalformedURLException { |
| String file; |
| |
| if (hasNormalizablePathPattern.matcher(url.getPath()).find()) { |
| // only normalize the path if there is something to normalize |
| // to avoid needless work |
| try { |
| file = url.toURI().normalize().toURL().getFile(); |
| // URI.normalize() does not normalize leading dot segments, |
| // see also http://tools.ietf.org/html/rfc3986#section-5.2.4 |
| int start = 0; |
| while (file.startsWith("/..", start) |
| && ((start + 3) == file.length() || file.charAt(3) == '/')) { |
| start += 3; |
| } |
| if (start > 0) { |
| file = file.substring(start); |
| } |
| } catch (URISyntaxException e) { |
| file = url.getFile(); |
| } |
| } else { |
| file = url.getFile(); |
| } |
| |
| // if path is empty return a single slash |
| if (file.isEmpty()) { |
| file = "/"; |
| } else if (!file.startsWith("/")) { |
| file = "/" + file; |
| } |
| |
| return file; |
| } |
| |
| /** |
| * Remove % encoding from path segment in URL for characters which should be |
| * unescaped according to <a |
| * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>. |
| */ |
| private String unescapePath(String path) { |
| StringBuilder sb = new StringBuilder(); |
| |
| Matcher matcher = unescapeRulePattern.matcher(path); |
| |
| int end = -1; |
| int letter; |
| |
| // Traverse over all encoded groups |
| while (matcher.find()) { |
| // Append everything up to this group |
| sb.append(path.substring(end + 1, matcher.start())); |
| |
| // Get the integer representation of this hexadecimal encoded character |
| letter = Integer.valueOf(matcher.group().substring(1), 16); |
| |
| if (letter < 128 && unescapedCharacters[letter]) { |
| // character should be unescaped in URLs |
| sb.append(Character.valueOf((char)letter)); |
| } else { |
| // Append the encoded character as uppercase |
| sb.append(matcher.group().toUpperCase(Locale.ROOT)); |
| } |
| |
| end = matcher.start() + 2; |
| } |
| |
| letter = path.length(); |
| |
| // Append the rest if there's anything |
| if (end <= letter - 1) { |
| sb.append(path.substring(end + 1, letter)); |
| } |
| |
| // Ok! |
| return sb.toString(); |
| } |
| |
| /** |
| * Convert path segment of URL from Unicode to UTF-8 and escape all |
| * characters which should be escaped according to <a |
| * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.. |
| */ |
| private String escapePath(String path) { |
| StringBuilder sb = new StringBuilder(path.length()); |
| |
| // Traverse over all bytes in this URL |
| byte[] bytes = path.getBytes(utf8); |
| for (int i = 0; i < bytes.length; i++) { |
| byte b = bytes[i]; |
| // Is this a control character? |
| if (b < 0 || escapedCharacters[b]) { |
| // Start escape sequence |
| sb.append('%'); |
| |
| // Get this byte's hexadecimal representation |
| String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT); |
| |
| // Do we need to prepend a zero? |
| if (hex.length() % 2 != 0 ) { |
| sb.append('0'); |
| sb.append(hex); |
| } else { |
| // No, append this hexadecimal representation |
| sb.append(hex); |
| } |
| } else if (b == 0x25) { |
| // percent sign (%): read-ahead to check whether a valid escape sequence |
| if ((i+2) >= bytes.length) { |
| // need at least two more characters |
| sb.append("%25"); |
| } else { |
| byte e1 = bytes[i+1]; |
| byte e2 = bytes[i+2]; |
| if (isHexCharacter(e1) && isHexCharacter(e2)) { |
| // valid percent encoding, output and fast-forward |
| i += 2; |
| sb.append((char) b); |
| sb.append((char) e1); |
| sb.append((char) e2); |
| } else { |
| sb.append("%25"); |
| } |
| } |
| } else { |
| // No, just append this character as-is |
| sb.append((char) b); |
| } |
| } |
| |
| return sb.toString(); |
| } |
| |
| private String normalizeHostName(String host) throws MalformedURLException { |
| |
| // 1. lowercase host name |
| host = host.toLowerCase(Locale.ROOT); |
| |
| // 2. if configured: convert between Unicode and ASCII forms |
| // for Internationalized Domain Names (IDNs) |
| if (hostIDNtoASCII && !isAscii(host)) { |
| try { |
| host = IDN.toASCII(host); |
| } catch (IllegalArgumentException | IndexOutOfBoundsException e) { |
| // IllegalArgumentException: thrown if the input string contains |
| // non-convertible Unicode codepoints |
| // IndexOutOfBoundsException: thrown (undocumented) if one "label" |
| // (non-ASCII dot-separated segment) is longer than 256 characters, |
| // cf. https://bugs.openjdk.java.net/browse/JDK-6806873 |
| LOG.debug("Failed to convert IDN host {}: ", host, e); |
| throw (MalformedURLException) new MalformedURLException( |
| "Invalid IDN " + host + ": " + e.getMessage()).initCause(e); |
| } |
| } else if (hostASCIItoIDN && host.contains("xn--")) { |
| host = IDN.toUnicode(host); |
| } |
| |
| // 3. optionally trim a trailing dot |
| if (hostTrimTrailingDot) { |
| if (host.endsWith(".")) { |
| host = host.substring(0, host.length()-1); |
| } |
| } |
| |
| return host; |
| } |
| |
| public static void main(String args[]) throws IOException { |
| BasicURLNormalizer normalizer = new BasicURLNormalizer(); |
| normalizer.setConf(NutchConfiguration.create()); |
| String scope = URLNormalizers.SCOPE_DEFAULT; |
| if (args.length >= 1) { |
| scope = args[0]; |
| System.out.println("Scope: " + scope); |
| } |
| String line, normUrl; |
| BufferedReader in = new BufferedReader( |
| new InputStreamReader(System.in, utf8)); |
| while ((line = in.readLine()) != null) { |
| try { |
| normUrl = normalizer.normalize(line, scope); |
| System.out.println(normUrl); |
| } catch (MalformedURLException e) { |
| System.out.println("failed: " + line); |
| } |
| } |
| System.exit(0); |
| } |
| |
| } |