src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.indexer.filter;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.GnuParser;
 import org.apache.commons.cli.UnrecognizedOptionException;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;

 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;

 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;

 import org.apache.nutch.net.protocols.Response;

 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseStatus;

 import org.apache.nutch.metadata.Metadata;

 import org.apache.nutch.util.MimeUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.PrefixStringMatcher;
 import org.apache.nutch.util.TrieStringMatcher;
 import org.apache.tika.Tika;

 import java.lang.invoke.MethodHandles;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;

 /**
  * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
  * of documents based on their MIME type.
  *
  * <p>
  * The type is read from the {@link Response#CONTENT_TYPE} entry of the
  * {@link CrawlDatum} metadata or, if absent there, of the parse metadata, and
  * is then passed through {@link MimeUtil}. When no usable type is available it
  * is detected by Tika from the URL string.
  * </p>
  *
  * <p>
  * Rules are loaded from the configuration resource named by the
  * {@code mimetype.filter.file} property. Each line is handled according to
  * its first character:
  * </p>
  * <ul>
  * <li>empty lines and lines starting with a space or {@code #} are
  * ignored;</li>
  * <li>{@code +} selects accept mode (the default): documents whose type is
  * matched by a rule are dropped, all others are kept;</li>
  * <li>{@code -} selects deny mode: only documents whose type is matched by a
  * rule are kept;</li>
  * <li>any other line is added as a rule to a {@link PrefixStringMatcher}.</li>
  * </ul>
  */
 public class MimeTypeIndexingFilter implements IndexingFilter {

   /** Configuration property naming the rules resource. */
   public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";

   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Created in {@link #setConf(Configuration)}. */
   private MimeUtil mimeUtil;
   private final Tika tika = new Tika();

   /** Matcher built from the rules file; {@code null} until rules are loaded. */
   private TrieStringMatcher trie;

   private Configuration conf;

   /** {@code true}: matched types are dropped; {@code false}: only matched types are kept. */
   private boolean acceptMode = true;

   /**
    * Keeps or drops {@code doc} depending on its MIME type and the loaded
    * rules.
    *
    * @param doc
    *          the document being indexed
    * @param parse
    *          parse result, consulted for the content type when the datum
    *          metadata has none
    * @param url
    *          URL of the document, used for logging and as detection fallback
    * @param datum
    *          crawl datum whose metadata is checked first for the content type
    * @param inlinks
    *          not used by this filter
    * @return {@code doc} when no rules are loaded or the rules let it pass,
    *         {@code null} when the rules reject it
    * @throws IndexingException
    *           declared by the interface; this implementation does not throw
    *           it itself
    */
   @Override
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       CrawlDatum datum, Inlinks inlinks) throws IndexingException {

     String mimeType = resolveMimeType(parse, url, datum);

     LOG.info("[{}] {}", mimeType, url);

     if (trie == null) {
       // no rules loaded: nothing is filtered out
       return doc;
     }

     boolean listed = trie.shortestMatch(mimeType) != null;
     // accept mode drops the listed types, deny mode keeps only the listed ones
     boolean keep = acceptMode ? !listed : listed;
     return keep ? doc : null;
   }

   /**
    * Determines the MIME type used for matching: datum metadata first, then
    * parse metadata, then Tika detection on the URL string.
    */
   private String resolveMimeType(Parse parse, Text url, CrawlDatum datum) {
     Writable fromDatum = datum.getMetaData()
         .get(new Text(Response.CONTENT_TYPE));
     String contentType = (fromDatum != null) ? fromDatum.toString()
         : parse.getData().getMeta(Response.CONTENT_TYPE);

     String mimeType = null;
     if (contentType != null) {
       mimeType = mimeUtil.forName(MimeUtil.cleanMimeType(contentType));
     }
     if (mimeType == null) {
       // Either no content type was recorded or MimeUtil could not turn it
       // into a type name. A null value must not reach the matcher, so fall
       // back to guessing from the URL.
       mimeType = tika.detect(url.toString());
     }
     return mimeType;
   }

   /*
    * -----------------------------
    * <implementation:Configurable> *
    * -----------------------------
    */

   /**
    * Stores the configuration and loads the rules resource named by
    * {@link #MIMEFILTER_REGEX_FILE}. When the property is unset or empty a
    * warning is logged and no rules are loaded.
    *
    * @param conf
    *          configuration to read the property and the rules resource from
    * @throws IllegalArgumentException
    *           if the configured rules resource cannot be found
    * @throws RuntimeException
    *           wrapping the {@link IOException} if reading the rules fails
    */
   @Override
   public void setConf(Configuration conf) {
     this.conf = conf;
     mimeUtil = new MimeUtil(conf);

     // load the file of the values
     String file = conf.get(MIMEFILTER_REGEX_FILE, "");
     if (file == null || file.isEmpty()) {
       LOG.warn("Missing {} property, ALL mimetypes will be allowed",
           MIMEFILTER_REGEX_FILE);
       return;
     }

     Reader reader = conf.getConfResourceAsReader(file);
     if (reader == null) {
       // Fail with a message naming the resource instead of a bare
       // NullPointerException from the BufferedReader constructor.
       throw new IllegalArgumentException(String.format(
           "Rules file '%s' configured in %s was not found", file,
           MIMEFILTER_REGEX_FILE));
     }

     try {
       readConfiguration(reader);
     } catch (IOException e) {
       LOG.error("Unable to read MIME type rules from {}", file, e);
       throw new RuntimeException(e.getMessage(), e);
     }
   }

   /**
    * Parses the rules and replaces the current matcher and mode. The reader is
    * closed before returning. The mode starts from the default (accept) on
    * every load; for a line starting with {@code +} or {@code -} only the
    * first character is evaluated.
    *
    * @param reader
    *          source of the rules, one per line
    * @throws IOException
    *           if reading from {@code reader} fails
    */
   private void readConfiguration(Reader reader) throws IOException {
     List<String> rules = new ArrayList<String>();
     boolean accept = true;

     try (BufferedReader in = new BufferedReader(reader)) {
       String line;
       while ((line = in.readLine()) != null) {
         if (line.isEmpty()) {
           continue;
         }

         switch (line.charAt(0)) {
         case ' ':
         case '#': // skip blank & comment lines
           break;
         case '+':
           accept = true;
           break;
         case '-':
           accept = false;
           break;
         default:
           rules.add(line);
           break;
         }
       }
     }

     acceptMode = accept;
     trie = new PrefixStringMatcher(rules);
   }

   /**
    * @return the configuration passed to {@link #setConf(Configuration)}, or
    *         {@code null} if it has not been called
    */
   @Override
   public Configuration getConf() {
     return this.conf;
   }

   /**
    * Command-line harness for trying out a rules file. Loads the file given
    * with {@code -rules}, then reads content types from standard input, one
    * per line, until an empty line or end of input. Each value is echoed
    * prefixed with {@code "+ "} if the filter returned the document and
    * {@code "- "} if it returned {@code null}.
    *
    * @param args
    *          command-line arguments; {@code -rules <file>} is required
    * @throws IOException
    *           if reading standard input fails
    * @throws IndexingException
    *           if the filter reports an error
    */
   public static void main(String[] args) throws IOException, IndexingException {
     String toolName = MimeTypeIndexingFilter.class.getName();

     Option helpOpt = new Option("h", "help", false, "show this help message");
     @SuppressWarnings("static-access")
     Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
         .withDescription(
             "Rules file to be used in the tests relative to the conf directory")
         .isRequired().create("rules");

     Options options = new Options();
     options.addOption(helpOpt).addOption(rulesOpt);

     CommandLineParser parser = new GnuParser();
     HelpFormatter formatter = new HelpFormatter();
     String rulesFile;

     try {
       CommandLine line = parser.parse(options, args);

       if (line.hasOption("help") || !line.hasOption("rules")) {
         formatter.printHelp(toolName, options, true);
         return;
       }

       rulesFile = line.getOptionValue("rules");
     } catch (org.apache.commons.cli.ParseException e) {
       // Unknown option, or the mandatory -rules option/argument is missing.
       // The latter also happens for a lone -h, so the help text has to be
       // printed from here as well.
       System.err.println(e.getMessage());
       formatter.printHelp(toolName, options, true);
       return;
     } catch (Exception e) {
       LOG.error("Failed to parse the command line", e);
       return;
     }

     MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
     Configuration conf = NutchConfiguration.create();
     conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
     filter.setConf(conf);

     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;

     while ((line = in.readLine()) != null && !line.isEmpty()) {
       // Wrap the typed value as parse metadata; the empty CrawlDatum makes
       // the filter fall through to it.
       Metadata metadata = new Metadata();
       metadata.set(Response.CONTENT_TYPE, line);
       ParseImpl parse = new ParseImpl("text",
           new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));

       NutchDocument doc = filter.filter(new NutchDocument(), parse,
           new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

       System.out.println((doc != null ? "+ " : "- ") + line);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.indexer.filter;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.commons.cli.Option;
	import org.apache.commons.cli.Options;
	import org.apache.commons.cli.OptionBuilder;
	import org.apache.commons.cli.CommandLine;
	import org.apache.commons.cli.CommandLineParser;
	import org.apache.commons.cli.HelpFormatter;
	import org.apache.commons.cli.GnuParser;
	import org.apache.commons.cli.UnrecognizedOptionException;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.util.StringUtils;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.io.Writable;

	import org.apache.nutch.crawl.CrawlDatum;
	import org.apache.nutch.crawl.Inlinks;

	import org.apache.nutch.indexer.IndexingException;
	import org.apache.nutch.indexer.IndexingFilter;
	import org.apache.nutch.indexer.NutchDocument;

	import org.apache.nutch.net.protocols.Response;

	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseData;
	import org.apache.nutch.parse.ParseImpl;
	import org.apache.nutch.parse.ParseStatus;

	import org.apache.nutch.metadata.Metadata;

	import org.apache.nutch.util.MimeUtil;
	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.nutch.util.PrefixStringMatcher;
	import org.apache.nutch.util.TrieStringMatcher;
	import org.apache.tika.Tika;

	import java.lang.invoke.MethodHandles;
	import java.io.BufferedReader;
	import java.io.IOException;
	import java.io.InputStreamReader;
	import java.io.Reader;
	import java.util.ArrayList;
	import java.util.List;

	/**
	* An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering
	* of documents based on the MIME Type detected by Tika
	*
	*/
	public class MimeTypeIndexingFilter implements IndexingFilter {

	public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file";

	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	private MimeUtil MIME;
	private Tika tika = new Tika();

	private TrieStringMatcher trie;

	private Configuration conf;

	private boolean acceptMode = true;

	// Inherited JavaDoc
	@Override
	public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
	CrawlDatum datum, Inlinks inlinks) throws IndexingException {

	String mimeType;
	String contentType;

	Writable tcontentType = datum.getMetaData()
	.get(new Text(Response.CONTENT_TYPE));

	if (tcontentType != null) {
	contentType = tcontentType.toString();
	} else {
	contentType = parse.getData().getMeta(Response.CONTENT_TYPE);
	}

	if (contentType == null) {
	mimeType = tika.detect(url.toString());
	} else {
	mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));
	}

	contentType = mimeType;

	if (LOG.isInfoEnabled()) {
	LOG.info(String.format("[%s] %s", contentType, url));
	}

	if (trie != null) {
	if (trie.shortestMatch(contentType) == null) {
	// no match, but
	if (acceptMode) {
	return doc;
	}
	return null;
	} else {
	// matched, but we are blocking
	if (acceptMode) {
	return null;
	}
	}
	}

	return doc;
	}

	/*
	* -----------------------------
	* <implementation:Configurable> *
	* -----------------------------
	*/
	@Override
	public void setConf(Configuration conf) {
	this.conf = conf;
	MIME = new MimeUtil(conf);

	// load the file of the values
	String file = conf.get(MIMEFILTER_REGEX_FILE, "");

	if (file != null) {
	if (file.isEmpty()) {
	LOG.warn(String
	.format("Missing %s property, ALL mimetypes will be allowed",
	MIMEFILTER_REGEX_FILE));
	} else {
	Reader reader = conf.getConfResourceAsReader(file);

	try {
	readConfiguration(reader);
	} catch (IOException e) {
	if (LOG.isErrorEnabled()) {
	LOG.error(e.getMessage());
	}

	throw new RuntimeException(e.getMessage(), e);
	}
	}
	}
	}

	private void readConfiguration(Reader reader) throws IOException {
	BufferedReader in = new BufferedReader(reader);
	String line;
	List<String> rules = new ArrayList<String>();

	while (null != (line = in.readLine())) {
	if (line.length() == 0) {
	continue;
	}

	char first = line.charAt(0);
	switch (first) {
	case ' ':
	case '\n':
	case '#': // skip blank & comment lines
	break;
	case '+':
	acceptMode = true;
	break;
	case '-':
	acceptMode = false;
	break;
	default:
	rules.add(line);
	break;
	}
	}

	trie = new PrefixStringMatcher(rules);
	}

	@Override
	public Configuration getConf() {
	return this.conf;
	}

	/**
	* Main method for invoking this tool
	*
	* @throws IOException
	* @throws IndexingException
	*/
	public static void main(String[] args) throws IOException, IndexingException {
	Option helpOpt = new Option("h", "help", false, "show this help message");
	@SuppressWarnings("static-access")
	Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
	.withDescription(
	"Rules file to be used in the tests relative to the conf directory")
	.isRequired().create("rules");

	Options options = new Options();
	options.addOption(helpOpt).addOption(rulesOpt);

	CommandLineParser parser = new GnuParser();
	HelpFormatter formatter = new HelpFormatter();
	String rulesFile;

	try {
	CommandLine line = parser.parse(options, args);

	if (line.hasOption("help") \|\| !line.hasOption("rules")) {
	formatter
	.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
	options, true);
	return;
	}

	rulesFile = line.getOptionValue("rules");
	} catch (UnrecognizedOptionException e) {
	formatter
	.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter",
	options, true);
	return;
	} catch (Exception e) {
	LOG.error(StringUtils.stringifyException(e));
	e.printStackTrace();
	return;
	}

	MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
	Configuration conf = NutchConfiguration.create();
	conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
	filter.setConf(conf);

	BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
	String line;

	while ((line = in.readLine()) != null && !line.isEmpty()) {
	Metadata metadata = new Metadata();
	metadata.set(Response.CONTENT_TYPE, line);
	ParseImpl parse = new ParseImpl("text",
	new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));

	NutchDocument doc = filter.filter(new NutchDocument(), parse,
	new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

	if (doc != null) {
	System.out.print("+ ");
	System.out.println(line);
	} else {
	System.out.print("- ");
	System.out.println(line);
	}
	}
	}
	}