// src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.net.urlnormalizer.basic;

 import java.lang.invoke.MethodHandles;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.IDN;
 import java.net.MalformedURLException;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.net.URLNormalizer;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.util.NutchConfiguration;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Converts URLs to a normal form:
  * <ul>
  * <li>lowercase the host name</li>
  * <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
  * <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
  * <li>normalize <a href=
  * "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
  * percent-encoding</a> in URL paths</li>
  * <li>optionally (see {@link #NORM_HOST_IDN} and
  * {@link #NORM_HOST_TRIM_TRAILING_DOT}) convert Internationalized Domain
  * Names between their Unicode and ASCII forms and trim a trailing dot from
  * the host name</li>
  * </ul>
  */
 public class BasicURLNormalizer implements URLNormalizer {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /**
    * Configuration property selecting the IDN conversion applied to host
    * names: <code>toAscii</code> or <code>toUnicode</code> (compared
    * case-insensitively); any other value disables the conversion.
    */
   public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";

   /**
    * Boolean configuration property: if true, a single trailing dot is removed
    * from the host name. Defaults to false.
    */
   public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";

   /**
    * Pattern to detect whether a URL path could be normalized. Contains one of
    * /. or ./ /.. or ../ //
    */
   private final static Pattern hasNormalizablePathPattern = Pattern
       .compile("/[./]|[.]/");

   /**
    * Nutch 1098 - finds URL encoded parts of the URL
    */
   private final static Pattern unescapeRulePattern = Pattern
       .compile("%([0-9A-Fa-f]{2})");

   // charset used for encoding URLs before escaping
   private final static Charset utf8 = StandardCharsets.UTF_8;

   /** look-up table for characters which should not be escaped in URL paths */
   private final static boolean[] unescapedCharacters = new boolean[128];
   static {
     for (int c = 0; c < 128; c++) {
       /* https://tools.ietf.org/html/rfc3986#section-2.2
        * For consistency, percent-encoded octets in the ranges of ALPHA
        * (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
        * underscore (%5F), or tilde (%7E) should not be created by URI
        * producers and, when found in a URI, should be decoded to their
        * corresponding unreserved characters by URI normalizers.
        */
       unescapedCharacters[c] = isAlphaNumeric(c)
           || c == 0x2D || c == 0x2E
           || c == 0x5F || c == 0x7E;
     }
   }

   /** look-up table for characters which should always be escaped in URL paths */
   private final static boolean[] escapedCharacters = new boolean[128];
   static {
     for (int c = 0; c < 128; c++) {
       if (unescapedCharacters[c]) {
         escapedCharacters[c] = false;
       } else if (c < 0x21 // control character or space
           || c == 0x22 // "
           || c == 0x3C // <
           || c == 0x3E // >
           || c == 0x5B // [
           || c == 0x5D // ]
           || c == 0x5E // ^
           || c == 0x60 // `
           || c == 0x7B // {
           || c == 0x7C // |
           || c == 0x7D // }
           || c == 0x7F // DEL
           ) {
         escapedCharacters[c] = true;
       } else {
         // neither table claims this character: it is passed through as-is,
         // whether it appears literally or percent-encoded
         if (LOG.isDebugEnabled()) {
           LOG.debug("Character {} ({}) not handled as escaped or unescaped", c,
               (char) c);
         }
       }
     }
   }

   /** Tests whether the code point is an ASCII letter or an ASCII digit. */
   private static boolean isAlphaNumeric(int c) {
     return (0x41 <= c && c <= 0x5A)
         || (0x61 <= c && c <= 0x7A)
         || (0x30 <= c && c <= 0x39);
   }

   /** Tests whether the code point is a hexadecimal digit (0-9, A-F, a-f). */
   private static boolean isHexCharacter(int c) {
     return (0x41 <= c && c <= 0x46)
         || (0x61 <= c && c <= 0x66)
         || (0x30 <= c && c <= 0x39);
   }

   /** Tests whether the string contains only characters below 128. */
   private static boolean isAscii(String str) {
     for (int i = 0; i < str.length(); i++) {
       if (str.charAt(i) > 127) {
         return false;
       }
     }
     return true;
   }

   private Configuration conf;

   /** convert Unicode host names to their ASCII (Punycode) form */
   private boolean hostIDNtoASCII;
   /** convert ASCII (Punycode) host names to their Unicode form */
   private boolean hostASCIItoIDN;
   /** remove a trailing dot from host names */
   private boolean hostTrimTrailingDot;

   /**
    * Does nothing. Note that this is an ordinary method, not a constructor (it
    * declares a return type and its name differs from the class name in case).
    * It is kept only so that the public interface of the class is unchanged.
    */
   public void BasicUrlNormalizer() {
   }

   @Override
   public Configuration getConf() {
     return conf;
   }

   /**
    * Stores the configuration and (re-)reads the host normalization options
    * {@link #NORM_HOST_IDN} and {@link #NORM_HOST_TRIM_TRAILING_DOT}.
    *
    * @param conf
    *          configuration to read the options from
    */
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
     String normIdn = conf.get(NORM_HOST_IDN, "");
     // assign both flags unconditionally so that a flag enabled by a previous
     // call does not stay enabled when the instance is re-configured
     hostIDNtoASCII = normIdn.equalsIgnoreCase("toAscii");
     hostASCIItoIDN = normIdn.equalsIgnoreCase("toUnicode");
     hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
   }

   /**
    * Normalizes a URL. The input is returned unmodified (apart from trimming
    * surrounding whitespace) if no normalization rule applies.
    *
    * @param urlString
    *          URL to normalize; the empty string is permitted and returned
    *          as-is
    * @param scope
    *          normalization scope, not used by this normalizer
    * @return the normalized URL
    * @throws MalformedURLException
    *           if the URL cannot be parsed or, with IDN-to-ASCII conversion
    *           enabled, the host name cannot be converted
    */
   @Override
   public String normalize(String urlString, String scope)
       throws MalformedURLException {

     if ("".equals(urlString)) { // permit empty
       return urlString;
     }

     urlString = urlString.trim(); // remove extra spaces

     URL url = new URL(urlString);

     String protocol = url.getProtocol();
     String host = url.getHost();
     int port = url.getPort();
     String file = url.getFile();

     // a mismatch means the protocol was lowercased while parsing
     boolean changed = !urlString.startsWith(protocol);
     boolean normalizePath = false;

     if ("http".equals(protocol) || "https".equals(protocol)
         || "ftp".equals(protocol)) {

       if (host != null && url.getAuthority() != null) {
         String newHost = normalizeHostName(host);
         if (!host.equals(newHost)) {
           host = newHost;
           changed = true;
         } else if (!url.getAuthority().equals(newHost)) {
           // authority (http://<...>/) contains other elements (port, user,
           // etc.) which will likely cause a change if left away
           changed = true;
         }
       } else {
         // no host or authority: recompose the URL from components
         changed = true;
       }

       if (port == url.getDefaultPort()) { // uses default port
         port = -1; // so don't specify it
         changed = true;
       }

       normalizePath = true;
       if (file == null || file.isEmpty()) {
         file = "/";
         changed = true;
         normalizePath = false; // no further path normalization required
       } else if (!file.startsWith("/")) {
         file = "/" + file;
         changed = true;
         normalizePath = false; // no further path normalization required
       }

       if (url.getRef() != null) { // remove the ref
         // the URL is recomposed below without the ref
         changed = true;
       }

     } else if (protocol.equals("file")) {
       normalizePath = true;
     }

     // properly encode characters in path/file using percent-encoding
     String file2 = escapePath(unescapePath(file));
     if (!file.equals(file2)) {
       changed = true;
       file = file2;
     }

     if (normalizePath) {
       // check for unnecessary use of "/../", "/./", and "//"
       if (changed) {
         url = new URL(protocol, host, port, file);
       }
       file2 = getFileWithNormalizedPath(url);
       if (!file.equals(file2)) {
         changed = true;
         file = file2;
       }
     }

     if (changed) {
       url = new URL(protocol, host, port, file);
       urlString = url.toString();
     }

     return urlString;
   }

   /**
    * Returns the file part (path and query) of the URL with dot segments
    * removed from the path. The result always starts with a slash.
    *
    * @param url
    *          URL to take the file part from
    * @return file part with normalized path
    * @throws MalformedURLException
    *           if the normalized URI cannot be converted back into a URL
    */
   private String getFileWithNormalizedPath(URL url)
       throws MalformedURLException {
     String file;

     if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
       // only normalize the path if there is something to normalize
       // to avoid needless work
       try {
         file = url.toURI().normalize().toURL().getFile();
         // URI.normalize() does not normalize leading dot segments,
         // see also http://tools.ietf.org/html/rfc3986#section-5.2.4
         int start = 0;
         // strip every leading "/.." which is a complete segment, i.e. is
         // followed by '/' or the end of the string. The boundary must be
         // tested relative to the current offset, otherwise a segment which
         // merely starts with ".." (e.g. "/..foo") would be stripped after
         // the first iteration.
         while (file.startsWith("/..", start)
             && ((start + 3) == file.length()
                 || file.charAt(start + 3) == '/')) {
           start += 3;
         }
         if (start > 0) {
           file = file.substring(start);
         }
       } catch (URISyntaxException e) {
         // not a valid URI: keep the file part as it is
         file = url.getFile();
       }
     } else {
       file = url.getFile();
     }

     // if path is empty return a single slash
     if (file.isEmpty()) {
       file = "/";
     } else if (!file.startsWith("/")) {
       file = "/" + file;
     }

     return file;
   }

   /**
    * Remove % encoding from path segment in URL for characters which should be
    * unescaped according to <a
    * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
    * All remaining percent-encoded sequences are kept but their hexadecimal
    * digits are converted to upper case.
    */
   private String unescapePath(String path) {
     StringBuilder sb = new StringBuilder(path.length());

     Matcher matcher = unescapeRulePattern.matcher(path);

     // index of the first character not yet copied to the output
     int copied = 0;

     // Traverse over all encoded groups
     while (matcher.find()) {
       // Append everything up to this group
       sb.append(path, copied, matcher.start());

       // Get the integer representation of this hexadecimal encoded character
       int letter = Integer.parseInt(matcher.group(1), 16);

       if (letter < 128 && unescapedCharacters[letter]) {
         // character should be unescaped in URLs
         sb.append((char) letter);
       } else {
         // Append the encoded character as uppercase
         sb.append(matcher.group().toUpperCase(Locale.ROOT));
       }

       copied = matcher.end();
     }

     // Append the rest if there's anything
     sb.append(path, copied, path.length());

     return sb.toString();
   }

   /**
    * Convert path segment of URL from Unicode to UTF-8 and escape all
    * characters which should be escaped according to <a
    * href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
    * A percent sign which does not start a valid percent-encoded sequence is
    * escaped as <code>%25</code>.
    */
   private String escapePath(String path) {
     StringBuilder sb = new StringBuilder(path.length());

     // Traverse over all bytes in this URL
     byte[] bytes = path.getBytes(utf8);
     for (int i = 0; i < bytes.length; i++) {
       byte b = bytes[i];
       // negative bytes are part of a UTF-8 multi-byte sequence (non-ASCII),
       // the others are looked up in the table of characters to escape
       if (b < 0 || escapedCharacters[b]) {
         // Start escape sequence
         sb.append('%');

         // Get this byte's hexadecimal representation
         String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);

         // Prepend a zero to a single-digit value
         if (hex.length() < 2) {
           sb.append('0');
         }
         sb.append(hex);
       } else if (b == 0x25) {
         // percent sign (%): read-ahead to check whether a valid escape sequence
         if ((i + 2) >= bytes.length) {
           // need at least two more characters
           sb.append("%25");
         } else {
           byte e1 = bytes[i + 1];
           byte e2 = bytes[i + 2];
           if (isHexCharacter(e1) && isHexCharacter(e2)) {
             // valid percent encoding, output and fast-forward
             i += 2;
             sb.append((char) b);
             sb.append((char) e1);
             sb.append((char) e2);
           } else {
             sb.append("%25");
           }
         }
       } else {
         // No, just append this character as-is
         sb.append((char) b);
       }
     }

     return sb.toString();
   }

   /**
    * Normalizes a host name: lowercases it and applies the configured IDN
    * conversion and trailing-dot trimming.
    *
    * @param host
    *          host name to normalize
    * @return normalized host name
    * @throws MalformedURLException
    *           if the conversion of a Unicode host name to ASCII fails
    */
   private String normalizeHostName(String host) throws MalformedURLException {

     // 1. lowercase host name
     host = host.toLowerCase(Locale.ROOT);

     // 2. if configured: convert between Unicode and ASCII forms
     //    for Internationalized Domain Names (IDNs)
     if (hostIDNtoASCII && !isAscii(host)) {
       try {
         host = IDN.toASCII(host);
       } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
         // IllegalArgumentException: thrown if the input string contains
         // non-convertible Unicode codepoints
         // IndexOutOfBoundsException: thrown (undocumented) if one "label"
         // (non-ASCII dot-separated segment) is longer than 256 characters,
         // cf. https://bugs.openjdk.java.net/browse/JDK-6806873
         LOG.debug("Failed to convert IDN host {}: ", host, e);
         throw (MalformedURLException) new MalformedURLException(
             "Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
       }
     } else if (hostASCIItoIDN && host.contains("xn--")) {
       host = IDN.toUnicode(host);
     }

     // 3. optionally trim a trailing dot
     if (hostTrimTrailingDot && host.endsWith(".")) {
       host = host.substring(0, host.length() - 1);
     }

     return host;
   }

   /**
    * Reads URLs from standard input (one per line, UTF-8) and prints the
    * normalized URL, or <code>failed: </code> followed by the input line if
    * the URL is malformed.
    *
    * @param args
    *          optional first argument: the normalization scope
    * @throws IOException
    *           if reading from standard input fails
    */
   public static void main(String args[]) throws IOException {
     BasicURLNormalizer normalizer = new BasicURLNormalizer();
     normalizer.setConf(NutchConfiguration.create());
     String scope = URLNormalizers.SCOPE_DEFAULT;
     if (args.length >= 1) {
       scope = args[0];
       System.out.println("Scope: " + scope);
     }
     try (BufferedReader in = new BufferedReader(
         new InputStreamReader(System.in, utf8))) {
       String line;
       while ((line = in.readLine()) != null) {
         try {
           System.out.println(normalizer.normalize(line, scope));
         } catch (MalformedURLException e) {
           System.out.println("failed: " + line);
         }
       }
     }
     System.exit(0);
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.net.urlnormalizer.basic;

	import java.lang.invoke.MethodHandles;
	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.net.IDN;
	import java.net.MalformedURLException;
	import java.net.URISyntaxException;
	import java.net.URL;
	import java.nio.charset.Charset;
	import java.nio.charset.StandardCharsets;
	import java.util.Locale;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.nutch.net.URLNormalizer;
	import org.apache.nutch.net.URLNormalizers;
	import org.apache.nutch.util.NutchConfiguration;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* Converts URLs to a normal form:
	* <ul>
	* <li>remove dot segments in path: <code>/./</code> or <code>/../</code></li>
	* <li>remove default ports, e.g. 80 for protocol <code>http://</code></li>
	* <li>normalize <a href=
	* "https://en.wikipedia.org/wiki/Percent-encoding#Percent-encoding_in_a_URI">
	* percent-encoding</a> in URL paths</li>
	* </ul>
	*/
	public class BasicURLNormalizer implements URLNormalizer {
	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	public final static String NORM_HOST_IDN = "urlnormalizer.basic.host.idn";
	public final static String NORM_HOST_TRIM_TRAILING_DOT = "urlnormalizer.basic.host.trim-trailing-dot";

	/**
	* Pattern to detect whether a URL path could be normalized. Contains one of
	* /. or ./ /.. or ../ //
	*/
	private final static Pattern hasNormalizablePathPattern = Pattern
	.compile("/[./]\|[.]/");

	/**
	* Nutch 1098 - finds URL encoded parts of the URL
	*/
	private final static Pattern unescapeRulePattern = Pattern
	.compile("%([0-9A-Fa-f]{2})");

	// charset used for encoding URLs before escaping
	private final static Charset utf8 = StandardCharsets.UTF_8;

	/** look-up table for characters which should not be escaped in URL paths */
	private final static boolean[] unescapedCharacters = new boolean[128];
	static {
	for (int c = 0; c < 128; c++) {
	/* https://tools.ietf.org/html/rfc3986#section-2.2
	* For consistency, percent-encoded octets in the ranges of ALPHA
	* (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
	* underscore (%5F), or tilde (%7E) should not be created by URI
	* producers and, when found in a URI, should be decoded to their
	* corresponding unreserved characters by URI normalizers.
	*/
	if (isAlphaNumeric(c)
	\|\| c == 0x2D \|\| c == 0x2E
	\|\| c == 0x5F \|\| c == 0x7E) {
	unescapedCharacters[c] = true;
	} else {
	unescapedCharacters[c] = false;
	}
	}
	}

	/** look-up table for characters which should not be escaped in URL paths */
	private final static boolean[] escapedCharacters = new boolean[128];
	static {
	for (int c = 0; c < 128; c++) {
	if (unescapedCharacters[c]) {
	escapedCharacters[c] = false;
	} else if (c < 0x21 // control character or space
	\|\| c == 0x22 // "
	\|\| c == 0x3C // <
	\|\| c == 0x3E // >
	\|\| c == 0x5B // [
	\|\| c == 0x5D // ]
	\|\| c == 0x5E // ^
	\|\| c == 0x60 // `
	\|\| c == 0x7B // {
	\|\| c == 0x7C // \|
	\|\| c == 0x7D // }
	\|\| c == 0x7F // DEL
	) {
	escapedCharacters[c] = true;
	} else {
	if (LOG.isDebugEnabled()) {
	LOG.debug("Character {} ({}) not handled as escaped or unescaped", c,
	(char) c);
	}
	}
	}
	}

	private static boolean isAlphaNumeric(int c) {
	return (0x41 <= c && c <= 0x5A)
	\|\| (0x61 <= c && c <= 0x7A)
	\|\| (0x30 <= c && c <= 0x39);
	}

	private static boolean isHexCharacter(int c) {
	return (0x41 <= c && c <= 0x46)
	\|\| (0x61 <= c && c <= 0x66)
	\|\| (0x30 <= c && c <= 0x39);
	}

	private static boolean isAscii(String str) {
	char[] chars = str.toCharArray();
	for (char c : chars) {
	if (c > 127) {
	return false;
	}
	}
	return true;
	}

	private Configuration conf;

	private boolean hostIDNtoASCII;
	private boolean hostASCIItoIDN;
	private boolean hostTrimTrailingDot;

	public void BasicUrlNormalizer() {
	}

	@Override
	public Configuration getConf() {
	return conf;
	}

	@Override
	public void setConf(Configuration conf) {
	this.conf = conf;
	String normIdn = conf.get(NORM_HOST_IDN, "");
	if (normIdn.equalsIgnoreCase("toAscii")) {
	hostIDNtoASCII = true;
	} else if (normIdn.equalsIgnoreCase("toUnicode")) {
	hostASCIItoIDN = true;
	}
	hostTrimTrailingDot = conf.getBoolean(NORM_HOST_TRIM_TRAILING_DOT, false);
	}

	@Override
	public String normalize(String urlString, String scope)
	throws MalformedURLException {

	if ("".equals(urlString)) // permit empty
	return urlString;

	urlString = urlString.trim(); // remove extra spaces

	URL url = new URL(urlString);

	String protocol = url.getProtocol();
	String host = url.getHost();
	int port = url.getPort();
	String file = url.getFile();

	boolean changed = false;
	boolean normalizePath = false;

	if (!urlString.startsWith(protocol)) // protocol was lowercased
	changed = true;

	if ("http".equals(protocol) \|\| "https".equals(protocol)
	\|\| "ftp".equals(protocol)) {

	if (host != null && url.getAuthority() != null) {
	String newHost = normalizeHostName(host);
	if (!host.equals(newHost)) {
	host = newHost;
	changed = true;
	} else if (!url.getAuthority().equals(newHost)) {
	// authority (http://<...>/) contains other elements (port, user,
	// etc.) which will likely cause a change if left away
	changed = true;
	}
	} else {
	// no host or authority: recompose the URL from components
	changed = true;
	}

	if (port == url.getDefaultPort()) { // uses default port
	port = -1; // so don't specify it
	changed = true;
	}

	normalizePath = true;
	if (file == null \|\| "".equals(file)) {
	file = "/";
	changed = true;
	normalizePath = false; // no further path normalization required
	} else if (!file.startsWith("/")) {
	file = "/" + file;
	changed = true;
	normalizePath = false; // no further path normalization required
	}

	if (url.getRef() != null) { // remove the ref
	changed = true;
	}

	} else if (protocol.equals("file")) {
	normalizePath = true;
	}

	// properly encode characters in path/file using percent-encoding
	String file2 = unescapePath(file);
	file2 = escapePath(file2);
	if (!file.equals(file2)) {
	changed = true;
	file = file2;
	}

	if (normalizePath) {
	// check for unnecessary use of "/../", "/./", and "//"
	if (changed) {
	url = new URL(protocol, host, port, file);
	}
	file2 = getFileWithNormalizedPath(url);
	if (!file.equals(file2)) {
	changed = true;
	file = file2;
	}
	}

	if (changed) {
	url = new URL(protocol, host, port, file);
	urlString = url.toString();
	}

	return urlString;
	}

	private String getFileWithNormalizedPath(URL url)
	throws MalformedURLException {
	String file;

	if (hasNormalizablePathPattern.matcher(url.getPath()).find()) {
	// only normalize the path if there is something to normalize
	// to avoid needless work
	try {
	file = url.toURI().normalize().toURL().getFile();
	// URI.normalize() does not normalize leading dot segments,
	// see also http://tools.ietf.org/html/rfc3986#section-5.2.4
	int start = 0;
	while (file.startsWith("/..", start)
	&& ((start + 3) == file.length() \|\| file.charAt(3) == '/')) {
	start += 3;
	}
	if (start > 0) {
	file = file.substring(start);
	}
	} catch (URISyntaxException e) {
	file = url.getFile();
	}
	} else {
	file = url.getFile();
	}

	// if path is empty return a single slash
	if (file.isEmpty()) {
	file = "/";
	} else if (!file.startsWith("/")) {
	file = "/" + file;
	}

	return file;
	}

	/**
	* Remove % encoding from path segment in URL for characters which should be
	* unescaped according to <a
	* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>.
	*/
	private String unescapePath(String path) {
	StringBuilder sb = new StringBuilder();

	Matcher matcher = unescapeRulePattern.matcher(path);

	int end = -1;
	int letter;

	// Traverse over all encoded groups
	while (matcher.find()) {
	// Append everything up to this group
	sb.append(path.substring(end + 1, matcher.start()));

	// Get the integer representation of this hexadecimal encoded character
	letter = Integer.valueOf(matcher.group().substring(1), 16);

	if (letter < 128 && unescapedCharacters[letter]) {
	// character should be unescaped in URLs
	sb.append(Character.valueOf((char)letter));
	} else {
	// Append the encoded character as uppercase
	sb.append(matcher.group().toUpperCase(Locale.ROOT));
	}

	end = matcher.start() + 2;
	}

	letter = path.length();

	// Append the rest if there's anything
	if (end <= letter - 1) {
	sb.append(path.substring(end + 1, letter));
	}

	// Ok!
	return sb.toString();
	}

	/**
	* Convert path segment of URL from Unicode to UTF-8 and escape all
	* characters which should be escaped according to <a
	* href="https://tools.ietf.org/html/rfc3986#section-2.2">RFC3986</a>..
	*/
	private String escapePath(String path) {
	StringBuilder sb = new StringBuilder(path.length());

	// Traverse over all bytes in this URL
	byte[] bytes = path.getBytes(utf8);
	for (int i = 0; i < bytes.length; i++) {
	byte b = bytes[i];
	// Is this a control character?
	if (b < 0 \|\| escapedCharacters[b]) {
	// Start escape sequence
	sb.append('%');

	// Get this byte's hexadecimal representation
	String hex = Integer.toHexString(b & 0xFF).toUpperCase(Locale.ROOT);

	// Do we need to prepend a zero?
	if (hex.length() % 2 != 0 ) {
	sb.append('0');
	sb.append(hex);
	} else {
	// No, append this hexadecimal representation
	sb.append(hex);
	}
	} else if (b == 0x25) {
	// percent sign (%): read-ahead to check whether a valid escape sequence
	if ((i+2) >= bytes.length) {
	// need at least two more characters
	sb.append("%25");
	} else {
	byte e1 = bytes[i+1];
	byte e2 = bytes[i+2];
	if (isHexCharacter(e1) && isHexCharacter(e2)) {
	// valid percent encoding, output and fast-forward
	i += 2;
	sb.append((char) b);
	sb.append((char) e1);
	sb.append((char) e2);
	} else {
	sb.append("%25");
	}
	}
	} else {
	// No, just append this character as-is
	sb.append((char) b);
	}
	}

	return sb.toString();
	}

	private String normalizeHostName(String host) throws MalformedURLException {

	// 1. lowercase host name
	host = host.toLowerCase(Locale.ROOT);

	// 2. if configured: convert between Unicode and ASCII forms
	// for Internationalized Domain Names (IDNs)
	if (hostIDNtoASCII && !isAscii(host)) {
	try {
	host = IDN.toASCII(host);
	} catch (IllegalArgumentException \| IndexOutOfBoundsException e) {
	// IllegalArgumentException: thrown if the input string contains
	// non-convertible Unicode codepoints
	// IndexOutOfBoundsException: thrown (undocumented) if one "label"
	// (non-ASCII dot-separated segment) is longer than 256 characters,
	// cf. https://bugs.openjdk.java.net/browse/JDK-6806873
	LOG.debug("Failed to convert IDN host {}: ", host, e);
	throw (MalformedURLException) new MalformedURLException(
	"Invalid IDN " + host + ": " + e.getMessage()).initCause(e);
	}
	} else if (hostASCIItoIDN && host.contains("xn--")) {
	host = IDN.toUnicode(host);
	}

	// 3. optionally trim a trailing dot
	if (hostTrimTrailingDot) {
	if (host.endsWith(".")) {
	host = host.substring(0, host.length()-1);
	}
	}

	return host;
	}

	public static void main(String args[]) throws IOException {
	BasicURLNormalizer normalizer = new BasicURLNormalizer();
	normalizer.setConf(NutchConfiguration.create());
	String scope = URLNormalizers.SCOPE_DEFAULT;
	if (args.length >= 1) {
	scope = args[0];
	System.out.println("Scope: " + scope);
	}
	String line, normUrl;
	BufferedReader in = new BufferedReader(
	new InputStreamReader(System.in, utf8));
	while ((line = in.readLine()) != null) {
	try {
	normUrl = normalizer.normalize(line, scope);
	System.out.println(normUrl);
	} catch (MalformedURLException e) {
	System.out.println("failed: " + line);
	}
	}
	System.exit(0);
	}

	}