src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.urlfilter.suffix;

 import org.apache.hadoop.conf.Configuration;

 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.SuffixStringMatcher;
 import org.apache.nutch.net.URLFilter;
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.PluginRepository;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.lang.invoke.MethodHandles;
 import java.io.Reader;
 import java.io.FileReader;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.io.IOException;
 import java.io.StringReader;

 import java.util.List;
 import java.util.ArrayList;

 import java.net.URL;
 import java.net.MalformedURLException;

 /**
  * Filters URLs based on a file of URL suffixes. The file is named by
  * <ol>
  * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
  * <li>attribute "file" in plugin.xml of this plugin</li>
  * </ol>
  * If the config file is missing, all URLs will be rejected.
  *
  * <p>
  * This filter can be configured to work in one of two modes:
  * <ul>
  * <li><b>default to reject</b> ('-'): in this mode, only URLs that match
  * suffixes specified in the config file will be accepted, all other URLs will
  * be rejected.</li>
  * <li><b>default to accept</b> ('+'): in this mode, only URLs that match
  * suffixes specified in the config file will be rejected, all other URLs will
  * be accepted.</li>
  * </ul>
  * <p>
  * The format of this config file is one URL suffix per line, with no preceding
  * whitespace. Order, in which suffixes are specified, doesn't matter. Blank
  * lines and comments (#) are allowed.
  * </p>
  * <p>
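  * A single '+' or '-' sign not followed by any suffix must be used once, to
  * signify the mode this plugin operates in. An optional single 'I' can be
  * appended, to signify that suffix matches should be case-insensitive. The
  * default, if not specified, is to use case-sensitive matches, i.e. suffix
  * '.JPG' does not match '.jpg'. An optional single 'P' can also be appended
  * (e.g. '+P' or '-IP'), to signify that suffixes are matched against the path
  * component of the URL only, instead of the whole URL string.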
  * </p>
  * <p>
  * NOTE: the format of this file is different from urlfilter-prefix, because
  * that plugin doesn't support allowed/prohibited prefixes (only supports
  * allowed prefixes). Please note that this plugin does not support regular
  * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most
  * probably wrong, you should use "+.jpg" instead.
  * </p>
  * <h3>Example 1</h3>
  * <p>
  * The configuration shown below will accept all URLs with '.html' or '.htm'
  * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
  * all other suffixes.
  * <p>
  *
  * <pre>
  *  # this is a comment
  *
  *  # prohibit all unknown, case-sensitive matching
  *  -
  *
  *  # collect only HTML files.
  *  .html
  *  .htm
  * </pre>
  *
  * <h3>Example 2</h3>
  * <p>
  * The configuration shown below will accept all URLs except common graphical
  * formats.
  * <p>
  *
  * <pre>
  *  # this is a comment
  *
  *  # allow all unknown, case-insensitive matching
  *  +I
  *
  *  # prohibited suffixes
  *  .gif
  *  .png
  *  .jpg
  *  .jpeg
  *  .bmp
  * </pre>
  *
  * @author Andrzej Bialecki
  */
 public class SuffixURLFilter implements URLFilter {

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Value of attribute "file" of this plugin (from plugin.xml), or null. */
   private String attributeFile = null;

   /** Matcher over the configured suffixes (lower-cased when ignoreCase is on). */
   private SuffixStringMatcher suffixes;
   /** true = default to accept ('+'): URLs matching a suffix are rejected. */
   private boolean modeAccept = false;
   /** true = match suffixes against the URL path only ('P' flag). */
   private boolean filterFromPath = false;
   /** true = case-insensitive matching ('I' flag). */
   private boolean ignoreCase = false;

   private Configuration conf;

   /**
    * Creates an unconfigured filter. Rules must be loaded afterwards with
    * {@link #setConf(Configuration)} or {@link #readConfiguration(Reader)}
    * before {@link #filter(String)} is called.
    */
   public SuffixURLFilter() throws IOException {

   }

   /**
    * Creates a filter configured from the given rules.
    *
    * @param reader
    *          source of the suffix rules; not closed by this constructor. If
    *          null, all URLs will be rejected.
    * @throws IOException
    *           if reading the rules fails
    */
   public SuffixURLFilter(Reader reader) throws IOException {
     readConfiguration(reader);
   }

   /**
    * Applies the suffix rules to a URL.
    *
    * @param url
    *          URL to check, may be null
    * @return the unmodified URL if it is accepted, null if it is rejected or
    *         if <code>url</code> is null
    */
   public String filter(String url) {
     if (url == null) {
       return null;
     }

     // Locale.ROOT keeps lower-casing independent of the JVM default locale
     // (e.g. a Turkish locale maps 'I' to a dotless 'ı'), so URLs are folded
     // the same way as the suffixes in readConfiguration().
     String candidate = ignoreCase ? url.toLowerCase(java.util.Locale.ROOT)
         : url;

     if (filterFromPath) {
       try {
         candidate = new URL(candidate).getPath();
       } catch (MalformedURLException e) {
         // deliberately ignored: an unparsable URL is matched as a whole
       }
     }

     boolean matched = suffixes.shortestMatch(candidate) != null;
     // accept mode: matching URLs are rejected, everything else passes;
     // reject mode: only matching URLs pass.
     return (matched != modeAccept) ? url : null;
   }

   /**
    * Reads the suffix rules and replaces the current configuration of this
    * filter (suffixes, mode, case-sensitivity and path-only matching).
    *
    * <p>
    * Lines are trimmed; empty lines and lines starting with '#' are skipped. A
    * line starting with '+' or '-' selects the mode and may contain the flags
    * 'I' (ignore case) and 'P' (match against URL path only). Every other line
    * is taken as a literal suffix.
    * </p>
    *
    * @param reader
    *          source of the rules; it is read to the end but not closed, the
    *          caller remains responsible for closing it. If null, a warning
    *          is logged and the filter is set up to reject all URLs.
    * @throws IOException
    *           if reading from <code>reader</code> fails
    */
   public void readConfiguration(Reader reader) throws IOException {

     // handle missing config file
     if (reader == null) {
       LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
       suffixes = new SuffixStringMatcher(new String[0]);
       modeAccept = false;
       ignoreCase = false;
       filterFromPath = false;
       return;
     }

     BufferedReader in = new BufferedReader(reader);
     List<String> aSuffixes = new ArrayList<String>();
     boolean allow = false;
     boolean ignore = false;
     // parsed into a local (like the other flags) so that a 'P' from an
     // earlier configuration does not survive a re-read
     boolean fromPath = false;
     String line;

     while ((line = in.readLine()) != null) {
       line = line.trim();
       if (line.isEmpty()) {
         continue;
       }

       char first = line.charAt(0);
       switch (first) {
       case '#': // skip comment lines
         break;
       case '-':
       case '+':
         allow = (first == '+');
         if (line.contains("P")) {
           fromPath = true;
         }
         if (line.contains("I")) {
           ignore = true;
         }
         break;
       default:
         aSuffixes.add(line);
       }
     }

     if (ignore) {
       // the 'I' flag may follow the suffixes in the file, hence the
       // lower-casing is done only after everything has been read
       for (int i = 0; i < aSuffixes.size(); i++) {
         aSuffixes.set(i, aSuffixes.get(i).toLowerCase(java.util.Locale.ROOT));
       }
     }
     suffixes = new SuffixStringMatcher(aSuffixes);
     modeAccept = allow;
     ignoreCase = ignore;
     filterFromPath = fromPath;
   }

   /**
    * Command-line check of a rule set: reads URLs from standard input, one per
    * line, and prints for each whether it is accepted or rejected.
    *
    * @param args
    *          optional: path of a suffix rules file as first argument. Without
    *          arguments the filter is configured from the Nutch configuration.
    * @throws IOException
    *           if the rules file or standard input cannot be read
    */
   public static void main(String args[]) throws IOException {

     SuffixURLFilter filter = new SuffixURLFilter();
     if (args.length >= 1) {
       // close the rules file once it has been read
       try (Reader rules = new FileReader(args[0])) {
         filter.readConfiguration(rules);
       }
     } else {
       filter.setConf(NutchConfiguration.create());
     }

     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
     while ((line = in.readLine()) != null) {
       String out = filter.filter(line);
       if (out != null) {
         System.out.println("ACCEPTED " + out);
       } else {
         // "out" is null here: print the input URL, not "null"
         System.out.println("REJECTED " + line);
       }
     }
   }

   /**
    * Stores the configuration and (re)loads the suffix rules from it.
    *
    * @param conf
    *          configuration to read the rules, or the name of the rules file,
    *          from
    */
   public void setConf(Configuration conf) {
     this.conf = conf;

     String pluginName = "urlfilter-suffix";
     Extension[] extensions = PluginRepository.get(conf)
         .getExtensionPoint(URLFilter.class.getName()).getExtensions();
     for (Extension extension : extensions) {
       if (extension.getDescriptor().getPluginId().equals(pluginName)) {
         attributeFile = extension.getAttribute("file");
         break;
       }
     }

     if (attributeFile != null && attributeFile.trim().isEmpty()) {
       attributeFile = null;
     }

     if (attributeFile != null) {
       LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
     }

     // precedence hierarchy for definition of filter rules
     // (first non-empty definition takes precedence):
     // 1. string rules defined by `urlfilter.suffix.rules`
     // 2. rule file name defined by `urlfilter.suffix.file`
     // 3. rule file name defined in plugin.xml (`attributeFile`)
     String stringRules = conf.get("urlfilter.suffix.rules");
     String file = conf.get("urlfilter.suffix.file", attributeFile);
     if (file != null && file.trim().isEmpty()) {
       // a blank property value counts as "not defined"
       file = attributeFile;
     }

     Reader reader = null;
     if (stringRules != null && !stringRules.trim().isEmpty()) {
       // takes precedence over files
       reader = new StringReader(stringRules);
     } else if (file != null) {
       LOG.info("Reading {} rules file {}", pluginName, file);
       reader = conf.getConfResourceAsReader(file);
     }

     // try-with-resources closes the reader after the rules are read; a null
     // resource is permitted and makes readConfiguration() reject all URLs
     try (Reader rules = reader) {
       readConfiguration(rules);
     } catch (IOException e) {
       LOG.error("Error reading " + pluginName + " rule file " + file, e);
     }
   }

   /** @return the configuration passed to {@link #setConf(Configuration)} */
   public Configuration getConf() {
     return this.conf;
   }

   /** @return true if the filter operates in "default to accept" mode */
   public boolean isModeAccept() {
     return modeAccept;
   }

   /**
    * @param modeAccept
    *          true for "default to accept", false for "default to reject"
    */
   public void setModeAccept(boolean modeAccept) {
     this.modeAccept = modeAccept;
   }

   /** @return true if URLs are lower-cased before matching */
   public boolean isIgnoreCase() {
     return ignoreCase;
   }

   /**
    * Switches lower-casing of URLs before matching on or off. Suffixes which
    * have already been loaded are not changed by this call.
    *
    * @param ignoreCase
    *          true to lower-case URLs before matching
    */
   public void setIgnoreCase(boolean ignoreCase) {
     this.ignoreCase = ignoreCase;
   }

   /**
    * @param filterFromPath
    *          true to match suffixes against the URL path only, false to match
    *          against the whole URL string
    */
   public void setFilterFromPath(boolean filterFromPath) {
     this.filterFromPath = filterFromPath;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.urlfilter.suffix;

	import org.apache.hadoop.conf.Configuration;

	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.nutch.util.SuffixStringMatcher;
	import org.apache.nutch.net.URLFilter;
	import org.apache.nutch.plugin.Extension;
	import org.apache.nutch.plugin.PluginRepository;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import java.lang.invoke.MethodHandles;
	import java.io.Reader;
	import java.io.FileReader;
	import java.io.BufferedReader;
	import java.io.InputStreamReader;
	import java.io.IOException;
	import java.io.StringReader;

	import java.util.List;
	import java.util.ArrayList;

	import java.net.URL;
	import java.net.MalformedURLException;

	/**
	* Filters URLs based on a file of URL suffixes. The file is named by
	* <ol>
	* <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
	* <li>attribute "file" in plugin.xml of this plugin</li>
	* </ol>
	* If the config file is missing, all URLs will be rejected.
	*
	* <p>
	* This filter can be configured to work in one of two modes:
	* <ul>
	* <li><b>default to reject</b> ('-'): in this mode, only URLs that match
	* suffixes specified in the config file will be accepted, all other URLs will
	* be rejected.</li>
	* <li><b>default to accept</b> ('+'): in this mode, only URLs that match
	* suffixes specified in the config file will be rejected, all other URLs will
	* be accepted.</li>
	* </ul>
	* <p>
	* The format of this config file is one URL suffix per line, with no preceding
	* whitespace. Order, in which suffixes are specified, doesn't matter. Blank
	* lines and comments (#) are allowed.
	* </p>
	* <p>
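	* A single '+' or '-' sign not followed by any suffix must be used once, to
	* signify the mode this plugin operates in. An optional single 'I' can be
	* appended, to signify that suffix matches should be case-insensitive. The
	* default, if not specified, is to use case-sensitive matches, i.e. suffix
	* '.JPG' does not match '.jpg'. An optional single 'P' can also be appended
	* (e.g. '+P' or '-IP'), to signify that suffixes are matched against the path
	* component of the URL only, instead of the whole URL string.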
	* </p>
	* <p>
	* NOTE: the format of this file is different from urlfilter-prefix, because
	* that plugin doesn't support allowed/prohibited prefixes (only supports
	* allowed prefixes). Please note that this plugin does not support regular
	* expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most
	* probably wrong, you should use "+.jpg" instead.
	* </p>
	* <h3>Example 1</h3>
	* <p>
	* The configuration shown below will accept all URLs with '.html' or '.htm'
	* suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
	* all other suffixes.
	* <p>
	*
	* <pre>
	* # this is a comment
	*
	* # prohibit all unknown, case-sensitive matching
	* -
	*
	* # collect only HTML files.
	* .html
	* .htm
	* </pre>
	*
	* <h3>Example 2</h3>
	* <p>
	* The configuration shown below will accept all URLs except common graphical
	* formats.
	* <p>
	*
	* <pre>
	* # this is a comment
	*
	* # allow all unknown, case-insensitive matching
	* +I
	*
	* # prohibited suffixes
	* .gif
	* .png
	* .jpg
	* .jpeg
	* .bmp
	* </pre>
	*
	* @author Andrzej Bialecki
	*/
	public class SuffixURLFilter implements URLFilter {

	  private static final Logger LOG = LoggerFactory
	      .getLogger(MethodHandles.lookup().lookupClass());

	  /** Value of attribute "file" of this plugin (from plugin.xml), or null. */
	  private String attributeFile = null;

	  /** Matcher over the configured suffixes (lower-cased when ignoreCase is on). */
	  private SuffixStringMatcher suffixes;
	  /** true = default to accept ('+'): URLs matching a suffix are rejected. */
	  private boolean modeAccept = false;
	  /** true = match suffixes against the URL path only ('P' flag). */
	  private boolean filterFromPath = false;
	  /** true = case-insensitive matching ('I' flag). */
	  private boolean ignoreCase = false;

	  private Configuration conf;

	  /**
	   * Creates an unconfigured filter. Rules must be loaded afterwards with
	   * {@link #setConf(Configuration)} or {@link #readConfiguration(Reader)}
	   * before {@link #filter(String)} is called.
	   */
	  public SuffixURLFilter() throws IOException {

	  }

	  /**
	   * Creates a filter configured from the given rules.
	   *
	   * @param reader
	   *          source of the suffix rules; not closed by this constructor. If
	   *          null, all URLs will be rejected.
	   * @throws IOException
	   *           if reading the rules fails
	   */
	  public SuffixURLFilter(Reader reader) throws IOException {
	    readConfiguration(reader);
	  }

	  /**
	   * Applies the suffix rules to a URL.
	   *
	   * @param url
	   *          URL to check, may be null
	   * @return the unmodified URL if it is accepted, null if it is rejected or
	   *         if <code>url</code> is null
	   */
	  public String filter(String url) {
	    if (url == null) {
	      return null;
	    }

	    // Locale.ROOT keeps lower-casing independent of the JVM default locale
	    // (e.g. a Turkish locale maps 'I' to a dotless 'ı'), so URLs are folded
	    // the same way as the suffixes in readConfiguration().
	    String candidate = ignoreCase ? url.toLowerCase(java.util.Locale.ROOT)
	        : url;

	    if (filterFromPath) {
	      try {
	        candidate = new URL(candidate).getPath();
	      } catch (MalformedURLException e) {
	        // deliberately ignored: an unparsable URL is matched as a whole
	      }
	    }

	    boolean matched = suffixes.shortestMatch(candidate) != null;
	    // accept mode: matching URLs are rejected, everything else passes;
	    // reject mode: only matching URLs pass.
	    return (matched != modeAccept) ? url : null;
	  }

	  /**
	   * Reads the suffix rules and replaces the current configuration of this
	   * filter (suffixes, mode, case-sensitivity and path-only matching).
	   *
	   * <p>
	   * Lines are trimmed; empty lines and lines starting with '#' are skipped. A
	   * line starting with '+' or '-' selects the mode and may contain the flags
	   * 'I' (ignore case) and 'P' (match against URL path only). Every other line
	   * is taken as a literal suffix.
	   * </p>
	   *
	   * @param reader
	   *          source of the rules; it is read to the end but not closed, the
	   *          caller remains responsible for closing it. If null, a warning
	   *          is logged and the filter is set up to reject all URLs.
	   * @throws IOException
	   *           if reading from <code>reader</code> fails
	   */
	  public void readConfiguration(Reader reader) throws IOException {

	    // handle missing config file
	    if (reader == null) {
	      LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
	      suffixes = new SuffixStringMatcher(new String[0]);
	      modeAccept = false;
	      ignoreCase = false;
	      filterFromPath = false;
	      return;
	    }

	    BufferedReader in = new BufferedReader(reader);
	    List<String> aSuffixes = new ArrayList<String>();
	    boolean allow = false;
	    boolean ignore = false;
	    // parsed into a local (like the other flags) so that a 'P' from an
	    // earlier configuration does not survive a re-read
	    boolean fromPath = false;
	    String line;

	    while ((line = in.readLine()) != null) {
	      line = line.trim();
	      if (line.isEmpty()) {
	        continue;
	      }

	      char first = line.charAt(0);
	      switch (first) {
	      case '#': // skip comment lines
	        break;
	      case '-':
	      case '+':
	        allow = (first == '+');
	        if (line.contains("P")) {
	          fromPath = true;
	        }
	        if (line.contains("I")) {
	          ignore = true;
	        }
	        break;
	      default:
	        aSuffixes.add(line);
	      }
	    }

	    if (ignore) {
	      // the 'I' flag may follow the suffixes in the file, hence the
	      // lower-casing is done only after everything has been read
	      for (int i = 0; i < aSuffixes.size(); i++) {
	        aSuffixes.set(i, aSuffixes.get(i).toLowerCase(java.util.Locale.ROOT));
	      }
	    }
	    suffixes = new SuffixStringMatcher(aSuffixes);
	    modeAccept = allow;
	    ignoreCase = ignore;
	    filterFromPath = fromPath;
	  }

	  /**
	   * Command-line check of a rule set: reads URLs from standard input, one per
	   * line, and prints for each whether it is accepted or rejected.
	   *
	   * @param args
	   *          optional: path of a suffix rules file as first argument. Without
	   *          arguments the filter is configured from the Nutch configuration.
	   * @throws IOException
	   *           if the rules file or standard input cannot be read
	   */
	  public static void main(String args[]) throws IOException {

	    SuffixURLFilter filter = new SuffixURLFilter();
	    if (args.length >= 1) {
	      // close the rules file once it has been read
	      try (Reader rules = new FileReader(args[0])) {
	        filter.readConfiguration(rules);
	      }
	    } else {
	      filter.setConf(NutchConfiguration.create());
	    }

	    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
	    String line;
	    while ((line = in.readLine()) != null) {
	      String out = filter.filter(line);
	      if (out != null) {
	        System.out.println("ACCEPTED " + out);
	      } else {
	        // "out" is null here: print the input URL, not "null"
	        System.out.println("REJECTED " + line);
	      }
	    }
	  }

	  /**
	   * Stores the configuration and (re)loads the suffix rules from it.
	   *
	   * @param conf
	   *          configuration to read the rules, or the name of the rules file,
	   *          from
	   */
	  public void setConf(Configuration conf) {
	    this.conf = conf;

	    String pluginName = "urlfilter-suffix";
	    Extension[] extensions = PluginRepository.get(conf)
	        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
	    for (Extension extension : extensions) {
	      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
	        attributeFile = extension.getAttribute("file");
	        break;
	      }
	    }

	    if (attributeFile != null && attributeFile.trim().isEmpty()) {
	      attributeFile = null;
	    }

	    if (attributeFile != null) {
	      LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
	    }

	    // precedence hierarchy for definition of filter rules
	    // (first non-empty definition takes precedence):
	    // 1. string rules defined by `urlfilter.suffix.rules`
	    // 2. rule file name defined by `urlfilter.suffix.file`
	    // 3. rule file name defined in plugin.xml (`attributeFile`)
	    String stringRules = conf.get("urlfilter.suffix.rules");
	    String file = conf.get("urlfilter.suffix.file", attributeFile);
	    if (file != null && file.trim().isEmpty()) {
	      // a blank property value counts as "not defined"
	      file = attributeFile;
	    }

	    Reader reader = null;
	    if (stringRules != null && !stringRules.trim().isEmpty()) {
	      // takes precedence over files
	      reader = new StringReader(stringRules);
	    } else if (file != null) {
	      LOG.info("Reading {} rules file {}", pluginName, file);
	      reader = conf.getConfResourceAsReader(file);
	    }

	    // try-with-resources closes the reader after the rules are read; a null
	    // resource is permitted and makes readConfiguration() reject all URLs
	    try (Reader rules = reader) {
	      readConfiguration(rules);
	    } catch (IOException e) {
	      LOG.error("Error reading " + pluginName + " rule file " + file, e);
	    }
	  }

	  /** @return the configuration passed to {@link #setConf(Configuration)} */
	  public Configuration getConf() {
	    return this.conf;
	  }

	  /** @return true if the filter operates in "default to accept" mode */
	  public boolean isModeAccept() {
	    return modeAccept;
	  }

	  /**
	   * @param modeAccept
	   *          true for "default to accept", false for "default to reject"
	   */
	  public void setModeAccept(boolean modeAccept) {
	    this.modeAccept = modeAccept;
	  }

	  /** @return true if URLs are lower-cased before matching */
	  public boolean isIgnoreCase() {
	    return ignoreCase;
	  }

	  /**
	   * Switches lower-casing of URLs before matching on or off. Suffixes which
	   * have already been loaded are not changed by this call.
	   *
	   * @param ignoreCase
	   *          true to lower-case URLs before matching
	   */
	  public void setIgnoreCase(boolean ignoreCase) {
	    this.ignoreCase = ignoreCase;
	  }

	  /**
	   * @param filterFromPath
	   *          true to match suffixes against the URL path only, false to match
	   *          against the whole URL string
	   */
	  public void setFilterFromPath(boolean filterFromPath) {
	    this.filterFromPath = filterFromPath;
	  }
	}