blob: 9e2e2e7d807d163b3e255712d0e75431e7abb364 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.urlfilter.domain;
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
entries in the domain file. The domain file would be set up as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
* overridden using the:
*
* <ul>
* <li>
* property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
* </li>
* <li>
* attribute "file" in plugin.xml of this plugin
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter implements URLFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
private void readConfiguration(Reader configReader) throws IOException {
// read the configuration file, line by line
BufferedReader reader = new BufferedReader(configReader);
String line = null;
while ((line = reader.readLine()) != null) {
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
// add non-blank lines and non-commented lines
domainSet.add(StringUtils.lowerCase(line.trim()));
}
}
}
/**
* Default constructor.
*/
public DomainURLFilter() {
}
/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domain-urlfilter.text default.
*/
public DomainURLFilter(String domainFile) {
this.domainFile = domainFile;
}
/**
* Sets the configuration.
*/
public void setConf(Configuration conf) {
this.conf = conf;
// get the extensions for domain urlfilter
String pluginName = "urlfilter-domain";
Extension[] extensions = PluginRepository.get(conf)
.getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
attributeFile = extension.getAttribute("file");
break;
}
}
// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
attributeFile = null;
}
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
}
// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
public Configuration getConf() {
return this.conf;
}
public String filter(String url) {
// https://issues.apache.org/jira/browse/NUTCH-2189
if (domainSet.size() == 0) return url;
try {
// match for suffix, domain, and host in that order. more general will
// override more specific
String domain = URLUtil.getDomainName(url).toLowerCase().trim();
String host = URLUtil.getHost(url);
String suffix = null;
DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
if (domainSuffix != null) {
suffix = domainSuffix.getDomain();
}
if (domainSet.contains(suffix) || domainSet.contains(domain)
|| domainSet.contains(host)) {
return url;
}
// doesn't match, don't allow
return null;
} catch (Exception e) {
// if an error happens, allow the url to pass
LOG.error("Could not apply filter on url: " + url + "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return null;
}
}
}