| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.net.urlnormalizer.ajax; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.URL; |
| import java.net.URLDecoder; |
| import java.net.MalformedURLException; |
| import java.nio.charset.Charset; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.nutch.net.URLNormalizer; |
| import org.apache.nutch.net.URLNormalizers; |
| import org.apache.hadoop.conf.Configuration; |
| |
| /** |
| * URLNormalizer capable of dealing with AJAX URL's. |
| * |
| * Use the following regex filter to prevent escaped fragments from being fetched. |
| * ^(.*)\?.*_escaped_fragment_ |
| */ |
| public class AjaxURLNormalizer implements URLNormalizer { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| public static String AJAX_URL_PART = "#!"; |
| public static String ESCAPED_URL_PART = "_escaped_fragment_="; |
| |
| private Configuration conf; |
| private Charset utf8; |
| |
| /** |
| * Default constructor. |
| */ |
| public AjaxURLNormalizer() { |
| utf8 = Charset.forName("UTF-8"); |
| } |
| |
| /** |
| * Attempts to normalize the input URL string |
| * |
| * @param urlString a String to process |
| * @param scope used when indexing URLs |
| * @return String |
| * @throws MalformedURLException if the urlString is malformed |
| */ |
| @Override |
| public String normalize(String urlString, String scope) throws MalformedURLException { |
| LOG.info(scope + " // " + urlString); |
| |
| // When indexing, transform _escaped_fragment_ URL's to their #! counterpart |
| if (scope.equals(URLNormalizers.SCOPE_INDEXER) && urlString.contains(ESCAPED_URL_PART)) { |
| return normalizeEscapedFragment(urlString); |
| } |
| |
| // Otherwise transform #! URL's to their _escaped_fragment_ counterpart |
| if (urlString.contains(AJAX_URL_PART)) { |
| LOG.info(scope + " // " + normalizeHashedFragment(urlString)); |
| return normalizeHashedFragment(urlString); |
| } |
| |
| // Nothing to normalize here, return verbatim |
| return urlString; |
| } |
| |
| /** |
| * Returns a normalized input URL. #! querystrings are transformed |
| * to a _escaped_fragment_ form. |
| * |
| * @param urlString a String to process |
| * @return String |
| * @throws MalformedURLException if the urlString is malformed |
| */ |
| protected String normalizeHashedFragment(String urlString) throws MalformedURLException { |
| URL u = new URL(urlString); |
| int pos = urlString.indexOf(AJAX_URL_PART); |
| StringBuilder sb = new StringBuilder(urlString.substring(0, pos)); |
| |
| // Get the escaped fragment |
| String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length())); |
| |
| // Check if we already have a query in the URL |
| if (u.getQuery() == null) { |
| sb.append("?"); |
| } else { |
| sb.append("&"); |
| } |
| |
| // Append the escaped fragment key and the value |
| sb.append(ESCAPED_URL_PART); |
| sb.append(escapedFragment); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Returns a normalized input URL. _escaped_fragment_ querystrings are |
| * transformed to a #! form. |
| * |
| * @param urlString a String to process |
| * @return String |
| * @throws MalformedURLException if the urlString is malformed |
| */ |
| protected String normalizeEscapedFragment(String urlString) throws MalformedURLException { |
| URL u = new URL(urlString); |
| StringBuilder sb = new StringBuilder(); |
| |
| // Write the URL without query string, we'll handle that later |
| sb.append(u.getProtocol()); |
| sb.append("://"); |
| sb.append(u.getHost()); |
| if (u.getPort() != -1) { |
| sb.append(":"); |
| sb.append(u.getPort()); |
| } |
| sb.append(u.getPath()); |
| |
| // Get the query string |
| String queryString = u.getQuery(); |
| |
| // Check if there's an & in the query string |
| int ampPos = queryString.indexOf("&"); |
| String keyValuePair = null; |
| |
| // If there's none, then the escaped fragment is the only k/v pair |
| if (ampPos == -1) { |
| keyValuePair = queryString; |
| queryString = ""; |
| } else { |
| // Obtain the escaped k/v pair |
| keyValuePair = queryString.substring(ampPos + 1); |
| |
| // Remove the escaped fragment key/value pair from the query string |
| queryString = queryString.replaceFirst("&" + keyValuePair, ""); |
| } |
| |
| // Remove escapedUrlPart from the keyValuePair |
| keyValuePair = keyValuePair.replaceFirst(ESCAPED_URL_PART, ""); |
| |
| // Get the fragment escaped |
| String unescapedFragment = unescape(keyValuePair); |
| |
| // Append a possible query string, without original escaped fragment |
| if (queryString.length() > 0) { |
| sb.append("?"); |
| sb.append(queryString); |
| } |
| |
| // Append the fragment delimiter and the unescaped fragment |
| sb.append("#!"); |
| sb.append(unescapedFragment); |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * Unescape some exotic characters in the fragment part |
| * |
| * @param fragmentPart a String to process |
| * @return String |
| */ |
| protected String unescape(String fragmentPart) { |
| try { |
| fragmentPart = URLDecoder.decode(fragmentPart, "UTF-8"); |
| } catch (Exception e) { |
| /// bluh |
| } |
| |
| return fragmentPart; |
| } |
| |
| /** |
| * Escape some exotic characters in the fragment part |
| * |
| * @param fragmentPart a String to process |
| * @return String |
| */ |
| protected String escape(String fragmentPart) { |
| String hex = null; |
| StringBuilder sb = new StringBuilder(fragmentPart.length()); |
| |
| for (byte b : fragmentPart.getBytes(utf8)) { |
| if (b < 33) { |
| sb.append('%'); |
| |
| hex = Integer.toHexString(b & 0xFF).toUpperCase(); |
| |
| // Prevent odd # chars |
| if (hex.length() % 2 != 0) { |
| sb.append('0'); |
| } |
| sb.append(hex); |
| } else if (b == 35) { |
| sb.append("%23"); |
| } else if (b == 37) { |
| sb.append("%25"); |
| } else if (b == 38) { |
| sb.append("%26"); |
| } else if (b == 43) { |
| sb.append("%2B"); |
| } else { |
| sb.append((char)b); |
| } |
| } |
| |
| return sb.toString(); |
| } |
| |
| /** |
| * @param conf a populated {@link Configuration} |
| */ |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| } |
| |
| /** |
| * @return Configuration |
| */ |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| } |