blob: 9e2e2e7d807d163b3e255712d0e75431e7abb364 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.urlfilter.domain;
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
* </p>
*
* <p>
* Urls are checked in order of domain suffix, domain name, and hostname against
entries in the domain file. The domain file would be set up as follows with
* one entry per line:
*
* <pre>
* com apache.org www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
* The second line allows all urls from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only urls from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
* overridden using the:
*
* <ul>
* <li>
* property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
* </li>
* <li>
* attribute "file" in plugin.xml of this plugin
* </li>
* </ul>
*
* the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter implements URLFilter {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
private void readConfiguration(Reader configReader) throws IOException {
// read the configuration file, line by line
BufferedReader reader = new BufferedReader(configReader);
String line = null;
while ((line = reader.readLine()) != null) {
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
// add non-blank lines and non-commented lines
domainSet.add(StringUtils.lowerCase(line.trim()));
}
}
}
/**
* Default constructor.
*/
public DomainURLFilter() {
}
/**
* Constructor that specifies the domain file to use.
*
* @param domainFile
* The domain file, overrides domain-urlfilter.text default.
*/
public DomainURLFilter(String domainFile) {
this.domainFile = domainFile;
}
/**
* Sets the configuration.
*/
public void setConf(Configuration conf) {
this.conf = conf;
// get the extensions for domain urlfilter
String pluginName = "urlfilter-domain";
Extension[] extensions = PluginRepository.get(conf)
.getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
attributeFile = extension.getAttribute("file");
break;
}
}
// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
attributeFile = null;
}
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ " as " + attributeFile);
}
} else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+ pluginName);
}
}
// domain file and attribute "file" take precedence if defined
String file = conf.get("urlfilter.domain.file");
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
} else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
public Configuration getConf() {
return this.conf;
}
public String filter(String url) {
// https://issues.apache.org/jira/browse/NUTCH-2189
if (domainSet.size() == 0) return url;
try {
// match for suffix, domain, and host in that order. more general will
// override more specific
String domain = URLUtil.getDomainName(url).toLowerCase().trim();
String host = URLUtil.getHost(url);
String suffix = null;
DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
if (domainSuffix != null) {
suffix = domainSuffix.getDomain();
}
if (domainSet.contains(suffix) || domainSet.contains(domain)
|| domainSet.contains(host)) {
return url;
}
// doesn't match, don't allow
return null;
} catch (Exception e) {
// if an error happens, allow the url to pass
LOG.error("Could not apply filter on url: " + url + "\n"
+ org.apache.hadoop.util.StringUtils.stringifyException(e));
return null;
}
}
}