/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.hostdb;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Mapper ingesting HostDB and CrawlDB entries. It can also read host score
 * information from a plain-text key/value file generated by the WebGraph
 * NodeDumper tool.
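 *
 * <p>A minimal sketch of how this mapper could be wired into a job. This is
 * illustrative only; the real job setup lives in {@link UpdateHostDb}, and
 * {@code conf} and the input path variables below are placeholders:</p>
 *
 * <pre>{@code
 * Job job = Job.getInstance(conf, "UpdateHostDb");
 * job.setJarByClass(UpdateHostDb.class);
 * // HostDb and CrawlDb entries are sequence files, host scores are plain text
 * MultipleInputs.addInputPath(job, hostDbCurrentPath,
 *     SequenceFileInputFormat.class, UpdateHostDbMapper.class);
 * MultipleInputs.addInputPath(job, crawlDbCurrentPath,
 *     SequenceFileInputFormat.class, UpdateHostDbMapper.class);
 * MultipleInputs.addInputPath(job, hostScoresPath,
 *     KeyValueTextInputFormat.class, UpdateHostDbMapper.class);
 * job.setMapOutputKeyClass(Text.class);
 * job.setMapOutputValueClass(NutchWritable.class);
 * }</pre>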
*/
public class UpdateHostDbMapper
extends Mapper<Text, Writable, Text, NutchWritable> {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
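
  // Reused key/value objects plus per-task settings initialized in setup()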
protected Text host = new Text();
protected HostDatum hostDatum = null;
protected CrawlDatum crawlDatum = null;
protected String reprUrl = null;
protected String buffer = null;
protected String[] args = null;
protected boolean filter = false;
protected boolean normalize = false;
protected boolean readingCrawlDb = false;
protected URLFilters filters = null;
protected URLNormalizers normalizers = null;

  @Override
public void setup(Mapper<Text, Writable, Text, NutchWritable>.Context context) {
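    // Read job settings: whether CrawlDb records are read in the same job
    // (host-level db_* statistics are then aggregated from the CrawlDb) and
    // whether URL filtering and normalization should be applied to host names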
Configuration conf = context.getConfiguration();
readingCrawlDb = conf.getBoolean("hostdb.reading.crawldb", false);
filter = conf.getBoolean(UpdateHostDb.HOSTDB_URL_FILTERING, false);
normalize = conf.getBoolean(UpdateHostDb.HOSTDB_URL_NORMALIZING, false);
if (filter)
filters = new URLFilters(conf);
if (normalize)
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
}

  /**
   * Filters and/or normalizes the input host name.
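   *
   * <p>For example, with the default URL normalizers enabled and no URL filter
   * rejecting the host, {@code filterNormalize("WWW.Example.ORG")} would
   * typically return {@code "www.example.org"}; the exact result depends on
   * the configured plugins, and a rejected host yields {@code null}.</p>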
   *
   * @param url the host name to filter and normalize
   * @return the filtered and normalized host name, or {@code null} if the URL
   *         was filtered out or could not be processed
*/
protected String filterNormalize(String url) {
// We actually receive a hostname here so let's make a URL
// TODO: we force shop.fcgroningen to be https, how do we know that here?
// http://issues.openindex.io/browse/SPIDER-40
url = "http://" + url + "/";
try {
if (normalize)
url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
if (filter)
url = filters.filter(url);
if (url == null)
return null;
} catch (Exception e) {
return null;
}
// Turn back to host
return URLUtil.getHost(url);
}

  /**
   * Maps records from the HostDB, CrawlDB and plain-text host scores file.
   * Statistics and scores are passed on to the reducer.
   *
   * @param key URL (CrawlDb record) or host name (HostDb record, host score)
   * @param value a {@link CrawlDatum}, {@link HostDatum} or textual host score
   * @param context the mapper context used to emit output and update counters
   * @throws IOException if a record cannot be written
   * @throws InterruptedException if the task is interrupted
   */
@Override
public void map(Text key, Writable value,
Context context)
throws IOException, InterruptedException {
// Get the key!
String keyStr = key.toString();
// Check if we process records from the CrawlDB
if (key instanceof Text && value instanceof CrawlDatum) {
// Get the normalized and filtered host of this URL
buffer = filterNormalize(URLUtil.getHost(keyStr));
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
return;
}
// Set the host of this URL
host.set(buffer);
crawlDatum = (CrawlDatum)value;
hostDatum = new HostDatum();
      /*
       * TODO: fix multi redirects: host_a => host_b/page => host_c/page/whatever
       * http://www.ferienwohnung-armbruster.de/
       * http://www.ferienwohnung-armbruster.de/website/
       * http://www.ferienwohnung-armbruster.de/website/willkommen.php
       *
       * We cannot re-resolve redirects for host objects because CrawlDatum
       * metadata is not available. We also cannot reliably use the reducer in
       * all cases, since redirects may cross hosts or even domains. The example
       * above has redirects that will end up in the same reducer. During that
       * phase, however, we do not know which URL redirects to the next URL.
       */
// Do not resolve homepages when the root URL is unfetched
if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
// Get the protocol
String protocol = URLUtil.getProtocol(keyStr);
// Get the proposed homepage URL
String homepage = protocol + "://" + buffer + "/";
        // Check whether the current key is the homepage URL of this host
if (keyStr.equals(homepage)) {
// Check if this is a redirect to the real home page
if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
// Obtain the repr url for this redirect via protocolstatus from the metadata
ProtocolStatus z = (ProtocolStatus)crawlDatum.getMetaData().
get(Nutch.WRITABLE_PROTO_STATUS_KEY);
// Get the protocol status' arguments
args = z.getArgs();
// ..and the possible redirect URL
reprUrl = args[0];
            // Did we get a redirect target?
            if (reprUrl != null) {
              LOG.info("UpdateHostDb: homepage: {} redirects to: {}",
                  keyStr, args[0]);
              hostDatum.setHomepageUrl(reprUrl);
              context.write(host, new NutchWritable(hostDatum));
            } else {
              LOG.info("UpdateHostDb: homepage: {} redirects, but no redirect"
                  + " target is available", keyStr);
            }
} else {
hostDatum.setHomepageUrl(homepage);
context.write(host, new NutchWritable(hostDatum));
LOG.info("UpdateHostDb: homepage: " + homepage);
}
}
}
// Always emit crawl datum
context.write(host, new NutchWritable(crawlDatum));
}
// Check if we got a record from the hostdb
if (key instanceof Text && value instanceof HostDatum) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
return;
}
// Get a HostDatum
hostDatum = (HostDatum)value;
key.set(buffer);
// If we're also reading CrawlDb entries, reset db_* statistics because
// we're aggregating them from CrawlDB anyway
if (readingCrawlDb) {
hostDatum.resetStatistics();
}
context.write(key, new NutchWritable(hostDatum));
}
// Check if we got a record with host scores
if (key instanceof Text && value instanceof Text) {
buffer = filterNormalize(keyStr);
// Filtered out?
if (buffer == null) {
context.getCounter("UpdateHostDb", "filtered_records").increment(1);
LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
return;
}
key.set(buffer);
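      // Parse the plain-text score and emit it as a FloatWritable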
context.write(key,
new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
}
}
}