| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.urlfilter.fast; |
| |
| import com.google.common.collect.LinkedHashMultimap; |
| import com.google.common.collect.Multimap; |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.nutch.net.URLFilter; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.net.URL; |
| import java.util.regex.Pattern; |
| import java.util.regex.PatternSyntaxException; |
| |
| /** |
| * Filters URLs based on a file of regular expressions using host/domains |
| * matching first. The default policy is to accept a URL if no matches are |
| * found. |
| * |
| * Rule Format: |
| * |
| * <pre> |
| * Host www.example.org |
| * DenyPath /path/to/be/excluded |
| * DenyPath /some/other/path/excluded |
| * |
| * # Deny everything from *.example.com and example.com |
| * Domain example.com |
| * DenyPath .* |
| * |
| * Domain example.org |
| * DenyPathQuery /resource/.*?action=exclude |
| * </pre> |
| * |
| * <code>Host</code> rules are evaluated before <code>Domain</code> rules. For |
| * <code>Host</code> rules the entire host name of a URL must match while the |
| * domain names in <code>Domain</code> rules are considered as matches if the |
| * domain is a suffix of the host name (consisting of complete host name parts). |
| * Longer domain suffixes are checked first; a single dot |
| * "<code>.</code>" as "domain name" can be used to specify |
| * global rules applied to every URL, these are checked last. |
| * |
| * E.g., for "www.example.com" the rules given above are looked up in the |
| * following order: |
| * <ol> |
| * <li>check "www.example.com" whether host-based rules exist and whether one of |
| * them matches</li> |
| * <li>check "www.example.com" for domain-based rules</li> |
| * <li>check "example.com" for domain-based rules</li> |
| * <li>check "com" for domain-based rules</li> |
| * <li>check for global rules ("<code>Domain .</code>")</li> |
| * </ol> |
| * The first matching rule will reject the URL and no further rules are checked. |
| * If no rule matches, the URL is accepted. URLs without a host name (e.g., |
| * <code>file:/path/file.txt</code>) are checked for global rules only. URLs |
| * which fail to be parsed as {@link java.net.URL} are always rejected. |
| * |
| * For rules either the URL path (<code>DenyPath</code>) or path and query |
| * (<code>DenyPathQuery</code>) are checked whether the given |
| * {@link java.util.regex Java Regular expression} is found (see |
| * {@link java.util.regex.Matcher#find()}) in the URL path (and query). |
| * |
| * Rules are applied in the order of their definition. For better performance, |
| * regular expressions which are simpler/faster or match more URLs should be |
| * defined earlier. |
| * |
| * Comments in the rule file start with the <code>#</code> character and reach |
| * until the end of the line. |
| * |
| * The rules file is defined via the property <code>urlfilter.fast.file</code>, |
| * the default name is <code>fast-urlfilter.txt</code>. |
| */ |
| public class FastURLFilter implements URLFilter { |
| |
| protected static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private Configuration conf; |
| public static final String URLFILTER_FAST_FILE = "urlfilter.fast.file"; |
| private Multimap<String, Rule> hostRules = LinkedHashMultimap.create(); |
| private Multimap<String, Rule> domainRules = LinkedHashMultimap.create(); |
| |
| private static final Pattern CATCH_ALL_RULE = Pattern |
| .compile("^\\s*DenyPath(?:Query)?\\s+\\.[*?]\\s*$"); |
| |
/**
 * Creates a filter without any rules. Rules are loaded once a configuration
 * is passed to {@link #setConf(Configuration)}.
 */
public FastURLFilter() {
}

/**
 * Creates a filter and loads its rules directly from the given reader.
 *
 * @param rules
 *          reader providing the rules
 * @throws IOException
 *           if reading the rules fails
 */
FastURLFilter(Reader rules) throws IOException, PatternSyntaxException {
  this.reloadRules(rules);
}
| |
| @Override |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| try { |
| reloadRules(); |
| } catch (Exception e) { |
| LOG.error(e.getMessage()); |
| throw new RuntimeException(e.getMessage(), e); |
| } |
| } |
| |
| @Override |
| public Configuration getConf() { |
| return this.conf; |
| } |
| |
| @Override |
| public String filter(String url) { |
| |
| URL u; |
| |
| try { |
| u = new URL(url); |
| } catch (Exception e) { |
| LOG.debug("Rejected {} because failed to parse as URL: {}", url, |
| e.getMessage()); |
| return null; |
| } |
| |
| String hostname = u.getHost(); |
| |
| // first check for host-specific rules |
| for (Rule rule : hostRules.get(hostname)) { |
| if (rule.match(u)) { |
| return null; |
| } |
| } |
| |
| // also look up domain rules for host name |
| for (Rule rule : domainRules.get(hostname)) { |
| if (rule.match(u)) { |
| return null; |
| } |
| } |
| |
| // check suffixes of host name from longer to shorter: |
| // subdomains, domain, top-level domain |
| int start = 0; |
| int pos; |
| while ((pos = hostname.indexOf('.', start)) != -1) { |
| start = pos + 1; |
| String domain = hostname.substring(start); |
| for (Rule rule : domainRules.get(domain)) { |
| if (rule.match(u)) { |
| return null; |
| } |
| } |
| } |
| |
| // finally check "global" rules defined for `Domain .` |
| for (Rule rule : domainRules.get(".")) { |
| if (rule.match(u)) { |
| return null; |
| } |
| } |
| |
| // no reject rules found |
| return url; |
| } |
| |
| public void reloadRules() throws IOException { |
| String fileRules = conf.get(URLFILTER_FAST_FILE); |
| try (Reader reader = conf.getConfResourceAsReader(fileRules)) { |
| reloadRules(reader); |
| } |
| } |
| |
| private void reloadRules(Reader rules) throws IOException { |
| domainRules.clear(); |
| hostRules.clear(); |
| |
| BufferedReader reader = new BufferedReader(rules); |
| |
| String current = null; |
| boolean host = false; |
| int lineno = 0; |
| |
| String line; |
| try { |
| while((line = reader.readLine()) != null) { |
| lineno++; |
| line = line.trim(); |
| |
| if (line.indexOf("#") != -1) { |
| // strip comments |
| line = line.substring(0, line.indexOf("#")).trim(); |
| } |
| |
| if (StringUtils.isBlank(line)) { |
| continue; |
| } |
| |
| if (line.startsWith("Host")) { |
| host = true; |
| current = line.split("\\s+")[1]; |
| } else if (line.startsWith("Domain")) { |
| host = false; |
| current = line.split("\\s+")[1]; |
| } else { |
| if (current == null) { |
| continue; |
| } |
| |
| Rule rule = null; |
| try { |
| if (CATCH_ALL_RULE.matcher(line).matches()) { |
| rule = DenyAllRule.getInstance(); |
| } else if (line.startsWith("DenyPathQuery")) { |
| rule = new DenyPathQueryRule(line.split("\\s+")[1]); |
| } else if (line.startsWith("DenyPath")) { |
| rule = new DenyPathRule(line.split("\\s+")[1]); |
| } else { |
| LOG.warn("Problem reading rule on line {}: {}", lineno, line); |
| continue; |
| } |
| } catch (Exception e) { |
| LOG.warn("Problem reading rule on line {}: {} - {}", lineno, line, e.getMessage()); |
| continue; |
| } |
| |
| if (host) { |
| LOG.trace("Adding host rule [{}] [{}]", current, rule); |
| hostRules.put(current, rule); |
| } else { |
| LOG.trace("Adding domain rule [{}] [{}]", current, rule); |
| domainRules.put(current, rule); |
| } |
| } |
| } |
| } catch (IOException e) { |
| LOG.warn("Caught exception while reading rules file at line {}: {}", |
| lineno, e.getMessage()); |
| throw e; |
| } |
| } |
| |
| public static class Rule { |
| protected Pattern pattern; |
| |
| Rule() {} |
| |
| public Rule(String regex) { |
| pattern = Pattern.compile(regex); |
| } |
| |
| public boolean match(URL url) { |
| return pattern.matcher(url.toString()).find(); |
| } |
| |
| public String toString() { |
| return pattern.toString(); |
| } |
| } |
| |
| public static class DenyPathRule extends Rule { |
| public DenyPathRule(String regex) { |
| super(regex); |
| } |
| |
| public boolean match(URL url) { |
| String haystack = url.getPath(); |
| return pattern.matcher(haystack).find(); |
| } |
| } |
| |
| /** Rule for <code>DenyPath .*</code> or <code>DenyPath .?</code> */ |
| public static class DenyAllRule extends Rule { |
| |
| private static Rule instance = new DenyAllRule("."); |
| |
| private DenyAllRule(String regex) { |
| super(regex); |
| } |
| |
| public static Rule getInstance() { |
| return instance; |
| } |
| |
| public boolean match(URL url) { |
| return true; |
| } |
| } |
| |
| public static class DenyPathQueryRule extends Rule { |
| public DenyPathQueryRule(String regex) { |
| super(regex); |
| } |
| |
| public boolean match(URL url) { |
| String haystack = url.getFile(); |
| return pattern.matcher(haystack).find(); |
| } |
| } |
| } |