/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * This class provides a way to separate the URL normalization and filtering
 * steps from the rest of the CrawlDb manipulation code (a usage sketch is
 * given at the end of this file).
 *
 * @author Andrzej Bialecki
 */
public class CrawlDbFilter extends
    Mapper<Text, CrawlDatum, Text, CrawlDatum> {

  // Names of the configuration properties that switch this mapper's
  // behavior on or off
  public static final String URL_FILTERING = "crawldb.url.filters";
  public static final String URL_NORMALIZING = "crawldb.url.normalizers";
  public static final String URL_NORMALIZING_SCOPE = "crawldb.url.normalizers.scope";
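
  // A minimal sketch of how these properties are typically enabled, assuming
  // a job Configuration `conf` is at hand (illustrative; the CrawlDb update
  // tool, for example, sets them from its -filter and -normalize
  // command-line switches):
  //
  //   conf.setBoolean(CrawlDbFilter.URL_FILTERING, true);
  //   conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);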

  private boolean urlFiltering;
  private boolean urlNormalizers;
  private boolean url404Purging;
  private boolean purgeOrphans;
  private URLFilters filters;
  private URLNormalizers normalizers;
  private String scope;

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Reads the job configuration and instantiates URL filters and normalizers
   * only when the corresponding switches are enabled.
   */
  @Override
  public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    urlFiltering = conf.getBoolean(URL_FILTERING, false);
    urlNormalizers = conf.getBoolean(URL_NORMALIZING, false);
    url404Purging = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);
    purgeOrphans = conf.getBoolean(CrawlDb.CRAWLDB_PURGE_ORPHANS, false);
    if (urlFiltering) {
      filters = new URLFilters(conf);
    }
    if (urlNormalizers) {
      scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
      normalizers = new URLNormalizers(conf, scope);
    }
  }

  // No-op retained from the old mapred-style API; the mapreduce framework
  // calls cleanup(Context) instead and never invokes this method.
  public void close() {
  }

  // Reused output key, to avoid allocating a new Text object for every record
  private Text newKey = new Text();

  @Override
  public void map(Text key, CrawlDatum value, Context context)
      throws IOException, InterruptedException {
String url = key.toString();

    // Check the status first, since this is cheaper than normalizing or
    // filtering; see https://issues.apache.org/jira/browse/NUTCH-1101
    if (url404Purging && CrawlDatum.STATUS_DB_GONE == value.getStatus()) {
      context.getCounter("CrawlDB filter", "Gone records removed").increment(1);
      return;
    }

    // Optionally remove orphaned pages;
    // see https://issues.apache.org/jira/browse/NUTCH-1932
    if (purgeOrphans && CrawlDatum.STATUS_DB_ORPHAN == value.getStatus()) {
      context.getCounter("CrawlDB filter", "Orphan records removed").increment(1);
      return;
    }
if (url != null && urlNormalizers) {
try {
url = normalizers.normalize(url, scope); // normalize the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
url = null;
}
}
if (url != null && urlFiltering) {
try {
url = filters.filter(url); // filter the url
} catch (Exception e) {
LOG.warn("Skipping " + url + ":" + e);
url = null;
}
}
    if (url == null) {
      // the URL was dropped by a normalizer or a filter
      context.getCounter("CrawlDB filter", "URLs filtered").increment(1);
    } else {
      // the URL passed all checks: emit it under its (possibly rewritten) key
      newKey.set(url);
      context.write(newKey, value);
    }
}
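
  // Illustrative example (an assumption about the configured plugins, not a
  // guarantee): with the default normalizers a key such as
  // "http://example.com/page.html#anchor" may lose its fragment and be
  // re-emitted under the rewritten key, while a URL rejected by a filter
  // plugin (e.g. regex-urlfilter) is dropped and counted as "URLs filtered".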
}
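
/*
 * Usage sketch: a minimal, illustrative wiring of this mapper into a Hadoop
 * job; the driver code and job name below are assumptions, not part of this
 * class.
 *
 *   Configuration conf = new Configuration();
 *   conf.setBoolean(CrawlDbFilter.URL_FILTERING, true);
 *   conf.setBoolean(CrawlDbFilter.URL_NORMALIZING, true);
 *   Job job = Job.getInstance(conf, "crawldb-filter");
 *   job.setJarByClass(CrawlDbFilter.class);
 *   job.setMapperClass(CrawlDbFilter.class);
 *   job.setMapOutputKeyClass(Text.class);
 *   job.setMapOutputValueClass(CrawlDatum.class);
 */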