/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.urlfilter.domainblacklist;

import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.PluginRepository;
import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;

/**
 * <p>
 * Filters URLs based on a file containing domain suffixes, domain names, and
 * hostnames. A URL that matches one of the suffixes, domains, or hosts present
 * in the file is filtered out.
 * </p>
 *
 * <p>
 * URLs are checked against the entries in the domain file in the order of
 * domain suffix, domain name, and hostname. The domain file should be set up
 * with one entry per line, for example:
 * </p>
 *
 * <pre>
 * com
 * apache.org
 * www.apache.org
 * </pre>
 *
 * <p>
 * The first line excludes all URLs in the .com domain suffix. The second line
 * excludes all URLs from apache.org and all of its subdomains, such as
 * lucene.apache.org and hadoop.apache.org. The third line excludes only URLs
 * from the host www.apache.org. Entries may appear in any order. Because any
 * match excludes a URL, a more general entry (such as a domain suffix)
 * effectively overrides a more specific one (such as a hostname).
 * </p>
 *
 * <p>
 * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
 * but can be overridden using:
 * </p>
*
* <ul>
* <li>
* property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and
* </li>
* <li>
* attribute "file" in plugin.xml of this plugin
* </li>
* </ul>
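 *
 * <p>
 * A minimal usage sketch (NutchConfiguration refers to
 * org.apache.nutch.util.NutchConfiguration; the rule value and sample URL are
 * illustrative, not part of the plugin):
 * </p>
 *
 * <pre>
 * Configuration conf = NutchConfiguration.create();
 * // inline rules take precedence over any rule file
 * conf.set("urlfilter.domainblacklist.rules", "example.com");
 * DomainBlacklistURLFilter filter = new DomainBlacklistURLFilter();
 * filter.setConf(conf);
 * // null: www.example.com is within the blacklisted domain example.com
 * String result = filter.filter("http://www.example.com/");
 * </pre>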
*
*/
public class DomainBlacklistURLFilter implements URLFilter {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  // value of the plugin attribute "file", read in setConf()
  private static String attributeFile = null;
  private Configuration conf;
  private Set<String> domainSet = new LinkedHashSet<String>();
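
  /**
   * Reads the rule file or rule string line by line: blank lines and comment
   * lines (starting with '#') are skipped; every other line is trimmed,
   * lower-cased, and added to the domain set.
   */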
  private void readConfiguration(Reader configReader) throws IOException {
    // read the configuration file, line by line
    BufferedReader reader = new BufferedReader(configReader);
    String line = null;
    while ((line = reader.readLine()) != null) {
      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
        // add non-blank lines and non-commented lines
        domainSet.add(StringUtils.lowerCase(line.trim()));
      }
    }
  }

  /**
   * Sets the configuration and loads the filter rules.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    // get the extensions for domain urlfilter
    String pluginName = "urlfilter-domainblacklist";
    Extension[] extensions = PluginRepository.get(conf)
        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
    for (int i = 0; i < extensions.length; i++) {
      Extension extension = extensions[i];
      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
        attributeFile = extension.getAttribute("file");
        break;
      }
    }
    if (attributeFile != null && attributeFile.trim().isEmpty()) {
      attributeFile = null;
    }
    if (attributeFile != null) {
      LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
          attributeFile);
    }

    // precedence hierarchy for definition of filter rules
    // (first non-empty definition takes precedence):
    // 1. string rules defined by `urlfilter.domainblacklist.rules`
    // 2. rule file name defined by `urlfilter.domainblacklist.file`
    // 3. rule file name defined in plugin.xml (`attributeFile`)
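    // e.g. conf.set("urlfilter.domainblacklist.rules", "example.com") defines
    // the rules inline and overrides both file-based definitions
    // (the rule value is illustrative)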
    String stringRules = conf.get("urlfilter.domainblacklist.rules");
    String file = conf.get("urlfilter.domainblacklist.file");
    Reader reader = null;
    if (stringRules != null) { // takes precedence over files
      reader = new StringReader(stringRules);
    } else {
      if (file == null && attributeFile != null) {
        file = attributeFile;
      }
      LOG.info("Reading {} rules file {}", pluginName, file);
      reader = conf.getConfResourceAsReader(file);
    }
    try {
      if (reader == null) {
        // not found on the classpath: try as a local file
        reader = new FileReader(file);
      }
      readConfiguration(reader);
    } catch (IOException e) {
      LOG.error("Error reading " + pluginName + " rule file " + file, e);
    }
  }

  public Configuration getConf() {
    return this.conf;
  }
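
  /**
   * Filters a URL: returns null (i.e. the URL is excluded) if its domain
   * suffix, domain name, or hostname is contained in the domain set,
   * otherwise returns the URL unchanged.
   */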
  public String filter(String url) {
    try {
      // match for suffix, domain, and host in that order. more general will
      // override more specific
      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
      String host = URLUtil.getHost(url);
      String suffix = null;
      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
      if (domainSuffix != null) {
        suffix = domainSuffix.getDomain();
      }
      if (domainSet.contains(suffix) || domainSet.contains(domain)
          || domainSet.contains(host)) {
        // matches an entry: filter the URL out
        return null;
      }
      // doesn't match, allow
      return url;
    } catch (Exception e) {
      // if an error happens, reject the url to be safe: a blacklist match
      // cannot be ruled out
      LOG.error("Could not apply filter on url: " + url + "\n"
          + org.apache.hadoop.util.StringUtils.stringifyException(e));
      return null;
    }
  }
}