// blob: 7c55cd723582a2e27e38696cc0bce76f5c82300e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.net.urlnormalizer.ajax;
import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* URLNormalizer capable of dealing with AJAX URL's.
*
* Use the following regex filter to prevent escaped fragments from being fetched.
* ^(.*)\?.*_escaped_fragment_
*/
public class AjaxURLNormalizer implements URLNormalizer {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
public static String AJAX_URL_PART = "#!";
public static String ESCAPED_URL_PART = "_escaped_fragment_=";
private Configuration conf;
private Charset utf8;
/**
* Default constructor.
*/
public AjaxURLNormalizer() {
utf8 = Charset.forName("UTF-8");
}
/**
* Attempts to normalize the input URL string
*
* @param urlString a String to process
* @param scope used when indexing URLs
* @return String
* @throws MalformedURLException if the urlString is malformed
*/
@Override
public String normalize(String urlString, String scope) throws MalformedURLException {
LOG.info(scope + " // " + urlString);
// When indexing, transform _escaped_fragment_ URL's to their #! counterpart
if (scope.equals(URLNormalizers.SCOPE_INDEXER) && urlString.contains(ESCAPED_URL_PART)) {
return normalizeEscapedFragment(urlString);
}
// Otherwise transform #! URL's to their _escaped_fragment_ counterpart
if (urlString.contains(AJAX_URL_PART)) {
LOG.info(scope + " // " + normalizeHashedFragment(urlString));
return normalizeHashedFragment(urlString);
}
// Nothing to normalize here, return verbatim
return urlString;
}
/**
* Returns a normalized input URL. #! querystrings are transformed
* to a _escaped_fragment_ form.
*
* @param urlString a String to process
* @return String
* @throws MalformedURLException if the urlString is malformed
*/
protected String normalizeHashedFragment(String urlString) throws MalformedURLException {
URL u = new URL(urlString);
int pos = urlString.indexOf(AJAX_URL_PART);
StringBuilder sb = new StringBuilder(urlString.substring(0, pos));
// Get the escaped fragment
String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length()));
// Check if we already have a query in the URL
if (u.getQuery() == null) {
sb.append("?");
} else {
sb.append("&");
}
// Append the escaped fragment key and the value
sb.append(ESCAPED_URL_PART);
sb.append(escapedFragment);
return sb.toString();
}
/**
* Returns a normalized input URL. _escaped_fragment_ querystrings are
* transformed to a #! form.
*
* @param urlString a String to process
* @return String
* @throws MalformedURLException if the urlString is malformed
*/
protected String normalizeEscapedFragment(String urlString) throws MalformedURLException {
URL u = new URL(urlString);
StringBuilder sb = new StringBuilder();
// Write the URL without query string, we'll handle that later
sb.append(u.getProtocol());
sb.append("://");
sb.append(u.getHost());
if (u.getPort() != -1) {
sb.append(":");
sb.append(u.getPort());
}
sb.append(u.getPath());
// Get the query string
String queryString = u.getQuery();
// Check if there's an & in the query string
int ampPos = queryString.indexOf("&");
String keyValuePair = null;
// If there's none, then the escaped fragment is the only k/v pair
if (ampPos == -1) {
keyValuePair = queryString;
queryString = "";
} else {
// Obtain the escaped k/v pair
keyValuePair = queryString.substring(ampPos + 1);
// Remove the escaped fragment key/value pair from the query string
queryString = queryString.replaceFirst("&" + keyValuePair, "");
}
// Remove escapedUrlPart from the keyValuePair
keyValuePair = keyValuePair.replaceFirst(ESCAPED_URL_PART, "");
// Get the fragment escaped
String unescapedFragment = unescape(keyValuePair);
// Append a possible query string, without original escaped fragment
if (queryString.length() > 0) {
sb.append("?");
sb.append(queryString);
}
// Append the fragment delimiter and the unescaped fragment
sb.append("#!");
sb.append(unescapedFragment);
return sb.toString();
}
/**
* Unescape some exotic characters in the fragment part
*
* @param fragmentPart a String to process
* @return String
*/
protected String unescape(String fragmentPart) {
try {
fragmentPart = URLDecoder.decode(fragmentPart, "UTF-8");
} catch (Exception e) {
/// bluh
}
return fragmentPart;
}
/**
* Escape some exotic characters in the fragment part
*
* @param fragmentPart a String to process
* @return String
*/
protected String escape(String fragmentPart) {
String hex = null;
StringBuilder sb = new StringBuilder(fragmentPart.length());
for (byte b : fragmentPart.getBytes(utf8)) {
if (b < 33) {
sb.append('%');
hex = Integer.toHexString(b & 0xFF).toUpperCase();
// Prevent odd # chars
if (hex.length() % 2 != 0) {
sb.append('0');
}
sb.append(hex);
} else if (b == 35) {
sb.append("%23");
} else if (b == 37) {
sb.append("%25");
} else if (b == 38) {
sb.append("%26");
} else if (b == 43) {
sb.append("%2B");
} else {
sb.append((char)b);
}
}
return sb.toString();
}
/**
* @param conf a populated {@link Configuration}
*/
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
/**
* @return Configuration
*/
@Override
public Configuration getConf() {
return this.conf;
}
}