blob: c86d55ac2e95fc222cd45672911d6f18dcaf1a18 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.net.urlnormalizer.regex;
import java.lang.invoke.MethodHandles;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import javax.xml.parsers.DocumentBuilderFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.net.URLNormalizer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
/**
* Allows users to do regex substitutions on all/any URLs that are encountered,
* which is useful for stripping session IDs from URLs.
*
* <p>
* This class uses the <code>urlnormalizer.regex.file</code> property. It should be
* set to the file name of an xml file which should contain the patterns and
* substitutions to be done on encountered URLs.
* </p>
* <p>
* This class also supports different rules depending on the scope. Please see
* the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details.
* </p>
*
* @author Luke Baker
* @author Andrzej Bialecki
*/
public class RegexURLNormalizer extends Configured implements URLNormalizer {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Immutable pair of a compiled pattern and its corresponding substitution
   * string.
   */
  private static class Rule {
    final Pattern pattern;
    final String substitution;

    Rule(Pattern pattern, String substitution) {
      this.pattern = pattern;
      this.substitution = substitution;
    }
  }

  /**
   * Shared "no rules" sentinel. It is compared by identity throughout this
   * class: a scope mapped to this list falls back to the default rules.
   */
  private static final List<Rule> EMPTY_RULES = Collections.emptyList();

  /** Per-thread cache of the rules already resolved for each scope. */
  private final ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() {
    @Override
    protected HashMap<String, List<Rule>> initialValue() {
      return new HashMap<>();
    }
  };

  /*
   * Rules applied when a scope has no rules of its own. Deliberately declared
   * without an initializer: setConf() assigns this field and may be invoked
   * before this class's field initializers run (NOTE(review): Hadoop's
   * Configured(Configuration) constructor is understood to call setConf() -
   * confirm), in which case an initializer would overwrite the loaded rules.
   */
  private List<Rule> defaultRules;

  /**
   * Returns the scope-to-rules cache of the calling thread.
   *
   * @return the mutable map held in this instance's thread-local
   */
  public HashMap<String, List<Rule>> getScopedRules() {
    return scopedRulesThreadLocal.get();
  }

  /**
   * The default constructor which is called from UrlNormalizerFactory
   * (normalizerClass.newInstance()) in method: getNormalizer().
   * No rules are loaded until {@link #setConf(Configuration)} is called with
   * a non-null configuration.
   */
  public RegexURLNormalizer() {
    super(null);
  }

  /**
   * Creates a normalizer backed by the given configuration.
   *
   * @param conf A populated {@link Configuration}
   */
  public RegexURLNormalizer(Configuration conf) {
    super(conf);
  }

  /**
   * Constructor which can be passed the configuration file name,
   * so it doesn't look in other configuration files for it.
   * Problems reading or parsing the file are logged and result in an empty
   * default rule set; the declared exceptions are kept for source
   * compatibility with existing callers.
   *
   * @param conf A populated {@link Configuration}
   * @param filename A specific configuration file
   * @throws IOException if there is an error locating the specified input file
   * @throws PatternSyntaxException If there is an error whilst interpreting
   * rule patterns.
   */
  public RegexURLNormalizer(Configuration conf, String filename)
      throws IOException, PatternSyntaxException {
    super(conf);
    // readConfigurationFile never returns null, it yields EMPTY_RULES on error.
    defaultRules = readConfigurationFile(filename);
  }

  /**
   * Stores the configuration and (re)loads the default rules from it. Rules
   * given inline through <code>urlnormalizer.regex.rules</code> take
   * precedence over the resource named by
   * <code>urlnormalizer.regex.file</code>. A null configuration is stored
   * but leaves the current default rules untouched.
   *
   * @param conf the configuration to use, may be null
   */
  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null) {
      return;
    }
    String filename = conf.get("urlnormalizer.regex.file");
    String stringRules = conf.get("urlnormalizer.regex.rules");
    List<Rule> rules = EMPTY_RULES;
    // try-with-resources tolerates a null resource, so the "not found" case
    // can be handled inside the block while the reader is always closed.
    try (Reader reader = stringRules != null ? new StringReader(stringRules)
        : conf.getConfResourceAsReader(filename)) {
      if (reader == null) {
        LOG.warn("Can't load the default rules! ");
      } else {
        rules = readConfiguration(reader);
      }
    } catch (Exception e) {
      LOG.warn("Couldn't read default config: {}", e.toString());
      rules = EMPTY_RULES;
    }
    defaultRules = rules;
  }

  /**
   * Installs the rules read from <code>reader</code> for the given scope in
   * the calling thread's cache. Used in JUnit test. The reader is left for
   * the caller to close.
   */
  void setConfiguration(Reader reader, String scope) {
    List<Rule> rules = readConfiguration(reader);
    getScopedRules().put(scope, rules);
    LOG.debug("Set config for scope '{}': {} rules.", scope, rules.size());
  }

  /**
   * This function does the replacements by iterating through all the regex
   * patterns. It accepts a string url as input and returns the altered string.
   * Rules for a scope are looked up once per thread and cached; a scope
   * without rules of its own uses the default rules.
   *
   * @param urlString A url string to process
   * @param scope The identifier for a specific scoped rule
   * @return The altered string
   */
  public String regexNormalize(String urlString, String scope) {
    HashMap<String, List<Rule>> scopedRules = getScopedRules();
    List<Rule> curRules = scopedRules.get(scope);
    if (curRules == null) {
      // first request for this scope on this thread: try to populate
      curRules = loadScopedRules(scope);
      if (curRules == EMPTY_RULES) {
        LOG.info("can't find rules for scope '{}', using default", scope);
      }
      // cache the outcome (including the sentinel) so the lookup is not retried
      scopedRules.put(scope, curRules);
    }
    if (curRules == EMPTY_RULES) {
      curRules = defaultRules;
    }
    if (curRules == null) {
      // no configuration has been applied yet, so there is nothing to do
      return urlString;
    }
    for (Rule rule : curRules) {
      urlString = rule.pattern.matcher(urlString).replaceAll(rule.substitution);
    }
    return urlString;
  }

  /**
   * Loads the rules configured under
   * <code>urlnormalizer.regex.file.&lt;scope&gt;</code>.
   *
   * @return the parsed rules, or EMPTY_RULES when no configuration is set,
   *         the property is missing, or the resource cannot be read
   */
  private List<Rule> loadScopedRules(String scope) {
    Configuration conf = getConf();
    String configFile = conf == null ? null
        : conf.get("urlnormalizer.regex.file." + scope);
    if (configFile == null) {
      return EMPTY_RULES;
    }
    LOG.debug("resource for scope '{}': {}", scope, configFile);
    try (Reader reader = conf.getConfResourceAsReader(configFile)) {
      if (reader == null) {
        LOG.warn("Couldn't load resource '{}': resource not found", configFile);
        return EMPTY_RULES;
      }
      return readConfiguration(reader);
    } catch (Exception e) {
      LOG.warn("Couldn't load resource '{}': {}", configFile, e.toString());
      return EMPTY_RULES;
    }
  }

  /**
   * Normalizes the URL by delegating to
   * {@link #regexNormalize(String, String)}.
   *
   * @param urlString A url string to process
   * @param scope The identifier for a specific scoped rule
   * @return The altered string
   * @throws MalformedURLException declared by the interface; not thrown by
   *         this implementation's own code
   */
  @Override
  public String normalize(String urlString, String scope)
      throws MalformedURLException {
    return regexNormalize(urlString, scope);
  }

  /**
   * Reads the configuration file and populates a List of Rules. Any failure
   * is logged and yields EMPTY_RULES; the file reader is always closed.
   */
  private List<Rule> readConfigurationFile(String filename) {
    LOG.info("loading {}", filename);
    try (Reader reader = new FileReader(filename)) {
      return readConfiguration(reader);
    } catch (Exception e) {
      LOG.error("Error loading rules from '{}': {}", filename, e.toString());
      return EMPTY_RULES;
    }
  }

  /**
   * Parses a <code>&lt;regex-normalize&gt;</code> document into rules, one per
   * child element that carries both a pattern and a substitution. The reader
   * is not closed here; that is the caller's responsibility.
   *
   * @return the parsed rules, or EMPTY_RULES when the document cannot be
   *         parsed or contains no usable rule
   */
  private List<Rule> readConfiguration(Reader reader) {
    List<Rule> rules = new ArrayList<>();
    try {
      // borrowed heavily from code in Configuration.java
      Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
          .parse(new InputSource(reader));
      Element root = doc.getDocumentElement();
      if (!"regex-normalize".equals(root.getTagName())) {
        LOG.error("bad conf file: top-level element not <regex-normalize>");
      }
      NodeList regexes = root.getChildNodes();
      for (int i = 0; i < regexes.getLength(); i++) {
        Node regexNode = regexes.item(i);
        if (!(regexNode instanceof Element)) {
          continue;
        }
        Rule rule = parseRule((Element) regexNode);
        if (rule != null) {
          rules.add(rule);
        }
      }
    } catch (Exception e) {
      LOG.error("error parsing conf file: {}", e.toString());
      return EMPTY_RULES;
    }
    return rules.isEmpty() ? EMPTY_RULES : rules;
  }

  /**
   * Builds a rule from one rule element of the configuration document.
   *
   * @return the rule, or null when the element lacks a pattern or a
   *         substitution, or when the pattern is not a valid regular
   *         expression
   */
  private static Rule parseRule(Element regex) {
    if (!"regex".equals(regex.getTagName())) {
      // only a warning: the element is still processed
      LOG.warn("bad conf file: element not <regex>");
    }
    String patternValue = null;
    String subValue = null;
    NodeList fields = regex.getChildNodes();
    for (int j = 0; j < fields.getLength(); j++) {
      Node fieldNode = fields.item(j);
      if (!(fieldNode instanceof Element)) {
        continue;
      }
      Element field = (Element) fieldNode;
      String tag = field.getTagName();
      if ("pattern".equals(tag)) {
        if (field.hasChildNodes()) {
          patternValue = ((Text) field.getFirstChild()).getData();
        }
      } else if ("substitution".equals(tag)) {
        // An empty <substitution> means "replace with nothing". Only this
        // element may set the substitution, so an empty element of another
        // name cannot overwrite a value that was already parsed.
        subValue = field.hasChildNodes()
            ? ((Text) field.getFirstChild()).getData()
            : "";
      }
    }
    if (patternValue == null || subValue == null) {
      return null;
    }
    try {
      return new Rule(Pattern.compile(patternValue), subValue);
    } catch (PatternSyntaxException e) {
      LOG.error("skipped rule: {} -> {} : invalid regular expression pattern: {}",
          patternValue, subValue, e.toString());
      return null;
    }
  }

  /** Prints one line per rule in the form "pattern -> substitution". */
  private static void printRules(List<Rule> rules) {
    for (Rule rule : rules) {
      System.out.print(" " + rule.pattern.pattern() + " -> ");
      System.out.println(rule.substitution);
    }
  }

  /**
   * Spits out patterns and substitutions that are in the configuration file
   * and, when a URL is given, the result of normalizing it.
   *
   * @param args optional arguments: <code>args[0]</code> is a URL to
   *        normalize, <code>args[1]</code> is the scope to load and use
   * @throws IOException Can be thrown by {@link RegexURLNormalizer#normalize(String, String)}
   * @throws PatternSyntaxException If there is an error with the provided scope
   * rule pattern.
   */
  public static void main(String[] args) throws PatternSyntaxException,
      IOException {
    RegexURLNormalizer normalizer = new RegexURLNormalizer();
    normalizer.setConf(NutchConfiguration.create());
    HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules();
    System.out.println("* Rules for 'DEFAULT' scope:");
    printRules(normalizer.defaultRules);
    // load the scope
    if (args.length > 1) {
      normalizer.normalize("http://test.com", args[1]);
    }
    // The default rules are not stored in this map, so a single entry is
    // already a scope worth listing.
    if (!scopedRules.isEmpty()) {
      for (String scope : scopedRules.keySet()) {
        if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) {
          continue;
        }
        System.out.println("* Rules for '" + scope + "' scope:");
        printRules(scopedRules.get(scope));
      }
    }
    if (args.length > 0) {
      System.out.println("\n---------- Normalizer test -----------");
      String scope = URLNormalizers.SCOPE_DEFAULT;
      if (args.length > 1) {
        scope = args[1];
      }
      System.out.println("Scope: " + scope);
      System.out.println("Input url: '" + args[0] + "'");
      System.out.println("Output url: '" + normalizer.normalize(args[0], scope)
          + "'");
    }
    System.exit(0);
  }
}