| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.net.urlnormalizer.regex; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.FileReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.net.MalformedURLException; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| import java.util.regex.PatternSyntaxException; |
| |
| import javax.xml.parsers.DocumentBuilderFactory; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.conf.Configured; |
| import org.apache.nutch.net.URLNormalizer; |
| import org.apache.nutch.net.URLNormalizers; |
| import org.apache.nutch.util.NutchConfiguration; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| import org.w3c.dom.Text; |
| import org.xml.sax.InputSource; |
| |
| /** |
| * Allows users to do regex substitutions on all/any URLs that are encountered, |
| * which is useful for stripping session IDs from URLs. |
| * |
| * <p> |
| * This class uses the <tt>urlnormalizer.regex.file</tt> property. It should be |
| * set to the file name of an xml file which should contain the patterns and |
| * substitutions to be done on encountered URLs. |
| * </p> |
| * <p> |
| * This class also supports different rules depending on the scope. Please see |
| * the javadoc in {@link org.apache.nutch.net.URLNormalizers} for more details. |
| * </p> |
| * |
| * @author Luke Baker |
| * @author Andrzej Bialecki |
| */ |
| public class RegexURLNormalizer extends Configured implements URLNormalizer { |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| /** |
| * Class which holds a compiled pattern and its corresponding substition |
| * string. |
| */ |
| private static class Rule { |
| public Pattern pattern; |
| |
| public String substitution; |
| } |
| |
| private ThreadLocal<HashMap<String, List<Rule>>> scopedRulesThreadLocal = new ThreadLocal<HashMap<String, List<Rule>>>() { |
| protected java.util.HashMap<String, java.util.List<Rule>> initialValue() { |
| return new HashMap<String, List<Rule>>(); |
| }; |
| }; |
| |
| public HashMap<String, List<Rule>> getScopedRules() { |
| return scopedRulesThreadLocal.get(); |
| } |
| |
| private List<Rule> defaultRules; |
| |
| private static final List<Rule> EMPTY_RULES = Collections.emptyList(); |
| |
| /** |
| * The default constructor which is called from UrlNormalizerFactory |
| * (normalizerClass.newInstance()) in method: getNormalizer()* |
| */ |
| public RegexURLNormalizer() { |
| super(null); |
| } |
| |
| public RegexURLNormalizer(Configuration conf) { |
| super(conf); |
| } |
| |
| /** |
| * Constructor which can be passed the file name, so it doesn't look in the |
| * configuration files for it. |
| */ |
| public RegexURLNormalizer(Configuration conf, String filename) |
| throws IOException, PatternSyntaxException { |
| super(conf); |
| List<Rule> rules = readConfigurationFile(filename); |
| if (rules != null) { |
| defaultRules = rules; |
| } |
| } |
| |
| public void setConf(Configuration conf) { |
| super.setConf(conf); |
| if (conf == null) |
| return; |
| // the default constructor was called |
| |
| String filename = getConf().get("urlnormalizer.regex.file"); |
| String stringRules = getConf().get("urlnormalizer.regex.rules"); |
| Reader reader = null; |
| if (stringRules != null) { |
| reader = new StringReader(stringRules); |
| } else { |
| reader = getConf().getConfResourceAsReader(filename); |
| } |
| List<Rule> rules = null; |
| if (reader == null) { |
| LOG.warn("Can't load the default rules! "); |
| rules = EMPTY_RULES; |
| } else { |
| try { |
| rules = readConfiguration(reader); |
| } catch (Exception e) { |
| LOG.warn("Couldn't read default config: " + e); |
| rules = EMPTY_RULES; |
| } |
| } |
| defaultRules = rules; |
| } |
| |
| // used in JUnit test. |
| void setConfiguration(Reader reader, String scope) { |
| List<Rule> rules = readConfiguration(reader); |
| getScopedRules().put(scope, rules); |
| LOG.debug("Set config for scope '" + scope + "': " + rules.size() |
| + " rules."); |
| } |
| |
| /** |
| * This function does the replacements by iterating through all the regex |
| * patterns. It accepts a string url as input and returns the altered string. |
| */ |
| public String regexNormalize(String urlString, String scope) { |
| HashMap<String, List<Rule>> scopedRules = getScopedRules(); |
| List<Rule> curRules = scopedRules.get(scope); |
| if (curRules == null) { |
| // try to populate |
| String configFile = getConf().get("urlnormalizer.regex.file." + scope); |
| if (configFile != null) { |
| LOG.debug("resource for scope '" + scope + "': " + configFile); |
| try { |
| Reader reader = getConf().getConfResourceAsReader(configFile); |
| curRules = readConfiguration(reader); |
| scopedRules.put(scope, curRules); |
| } catch (Exception e) { |
| LOG.warn("Couldn't load resource '" + configFile + "': " + e); |
| } |
| } |
| if (curRules == EMPTY_RULES || curRules == null) { |
| LOG.info("can't find rules for scope '" + scope + "', using default"); |
| scopedRules.put(scope, EMPTY_RULES); |
| } |
| } |
| if (curRules == EMPTY_RULES || curRules == null) { |
| curRules = defaultRules; |
| } |
| Iterator<Rule> i = curRules.iterator(); |
| while (i.hasNext()) { |
| Rule r = (Rule) i.next(); |
| |
| Matcher matcher = r.pattern.matcher(urlString); |
| |
| urlString = matcher.replaceAll(r.substitution); |
| } |
| return urlString; |
| } |
| |
| public String normalize(String urlString, String scope) |
| throws MalformedURLException { |
| return regexNormalize(urlString, scope); |
| } |
| |
| /** Reads the configuration file and populates a List of Rules. */ |
| private List<Rule> readConfigurationFile(String filename) { |
| if (LOG.isInfoEnabled()) { |
| LOG.info("loading " + filename); |
| } |
| try { |
| FileReader reader = new FileReader(filename); |
| return readConfiguration(reader); |
| } catch (Exception e) { |
| LOG.error("Error loading rules from '" + filename + "': " + e); |
| return EMPTY_RULES; |
| } |
| } |
| |
| private List<Rule> readConfiguration(Reader reader) { |
| List<Rule> rules = new ArrayList<Rule>(); |
| try { |
| |
| // borrowed heavily from code in Configuration.java |
| Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder() |
| .parse(new InputSource(reader)); |
| Element root = doc.getDocumentElement(); |
| if ((!"regex-normalize".equals(root.getTagName())) |
| && (LOG.isErrorEnabled())) { |
| LOG.error("bad conf file: top-level element not <regex-normalize>"); |
| } |
| NodeList regexes = root.getChildNodes(); |
| for (int i = 0; i < regexes.getLength(); i++) { |
| Node regexNode = regexes.item(i); |
| if (!(regexNode instanceof Element)) |
| continue; |
| Element regex = (Element) regexNode; |
| if ((!"regex".equals(regex.getTagName())) && (LOG.isWarnEnabled())) { |
| LOG.warn("bad conf file: element not <regex>"); |
| } |
| NodeList fields = regex.getChildNodes(); |
| String patternValue = null; |
| String subValue = null; |
| for (int j = 0; j < fields.getLength(); j++) { |
| Node fieldNode = fields.item(j); |
| if (!(fieldNode instanceof Element)) |
| continue; |
| Element field = (Element) fieldNode; |
| if ("pattern".equals(field.getTagName()) && field.hasChildNodes()) |
| patternValue = ((Text) field.getFirstChild()).getData(); |
| if ("substitution".equals(field.getTagName()) |
| && field.hasChildNodes()) |
| subValue = ((Text) field.getFirstChild()).getData(); |
| if (!field.hasChildNodes()) |
| subValue = ""; |
| } |
| if (patternValue != null && subValue != null) { |
| Rule rule = new Rule(); |
| try { |
| rule.pattern = Pattern.compile(patternValue); |
| } catch (PatternSyntaxException e) { |
| if (LOG.isErrorEnabled()) { |
| LOG.error("skipped rule: " + patternValue + " -> " + subValue |
| + " : invalid regular expression pattern: " + e); |
| } |
| continue; |
| } |
| rule.substitution = subValue; |
| rules.add(rule); |
| } |
| } |
| } catch (Exception e) { |
| if (LOG.isErrorEnabled()) { |
| LOG.error("error parsing conf file: " + e); |
| } |
| return EMPTY_RULES; |
| } |
| if (rules.size() == 0) |
| return EMPTY_RULES; |
| return rules; |
| } |
| |
| /** Spits out patterns and substitutions that are in the configuration file. */ |
| public static void main(String args[]) throws PatternSyntaxException, |
| IOException { |
| RegexURLNormalizer normalizer = new RegexURLNormalizer(); |
| normalizer.setConf(NutchConfiguration.create()); |
| HashMap<String, List<Rule>> scopedRules = normalizer.getScopedRules(); |
| Iterator<Rule> i = normalizer.defaultRules.iterator(); |
| System.out.println("* Rules for 'DEFAULT' scope:"); |
| while (i.hasNext()) { |
| Rule r = i.next(); |
| System.out.print(" " + r.pattern.pattern() + " -> "); |
| System.out.println(r.substitution); |
| } |
| // load the scope |
| if (args.length > 1) { |
| normalizer.normalize("http://test.com", args[1]); |
| } |
| if (scopedRules.size() > 1) { |
| Iterator<String> it = scopedRules.keySet().iterator(); |
| while (it.hasNext()) { |
| String scope = it.next(); |
| if (URLNormalizers.SCOPE_DEFAULT.equals(scope)) |
| continue; |
| System.out.println("* Rules for '" + scope + "' scope:"); |
| i = ((List<Rule>) scopedRules.get(scope)).iterator(); |
| while (i.hasNext()) { |
| Rule r = (Rule) i.next(); |
| System.out.print(" " + r.pattern.pattern() + " -> "); |
| System.out.println(r.substitution); |
| } |
| } |
| } |
| if (args.length > 0) { |
| System.out.println("\n---------- Normalizer test -----------"); |
| String scope = URLNormalizers.SCOPE_DEFAULT; |
| if (args.length > 1) |
| scope = args[1]; |
| System.out.println("Scope: " + scope); |
| System.out.println("Input url: '" + args[0] + "'"); |
| System.out.println("Output url: '" + normalizer.normalize(args[0], scope) |
| + "'"); |
| } |
| System.exit(0); |
| } |
| |
| } |