/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Extension of {@link AdaptiveFetchSchedule} that allows for more flexible
* configuration of DEC and INC factors for various MIME-types.
*
* This class can typically be used in cases where a recrawl consists of many
* different MIME-types. It is not very common for MIME-types other than
* text/html to change frequently. Using this class you can configure different
* factors per MIME-type so as to prefer frequently changing MIME-types over
* others.
*
* For this to work, the class relies on the Content-Type MetaData key being
* present in the CrawlDB. This can either be set when injecting new URLs or
* by adding "Content-Type" to the db.parsemeta.to.crawldb configuration
* setting to force the MIME-types of newly discovered URLs to be added to the
* CrawlDB.
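*
* The per MIME-type factors are read from the plain text file configured with
* db.fetch.schedule.mime.file (adaptive-mimetypes.txt by default). Each
* non-blank, non-comment line holds a MIME-type followed by its INC and DEC
* factors, separated by tabs. The factors below are examples only; tune them
* for your own crawl:
*
* <pre>
* # mime-type&lt;TAB&gt;inc_rate&lt;TAB&gt;dec_rate
* text/html	0.2	0.2
* application/pdf	0.1	0.4
* </pre>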
*
* @author markus
*/
public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
// Logger
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
// Conf directives
public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file";
// Default values for DEC and INC rate
private float defaultIncRate;
private float defaultDecRate;
// Structure to store inc and dec rates per MIME-type
private class AdaptiveRate {
public float inc;
public float dec;
public AdaptiveRate(Float inc, Float dec) {
this.inc = inc;
this.dec = dec;
}
}
// Here we store the MIME-types and their INC/DEC rates
private HashMap<String, AdaptiveRate> mimeMap;
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf == null)
return;
// Read and set the default INC and DEC rates in case we cannot set values
// based on MIME-type
defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
// Where's the mime/factor file?
Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE,
"adaptive-mimetypes.txt"));
try {
readMimeFile(mimeFile);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime,
long modifiedTime, int state) {
// Set defaults
INC_RATE = defaultIncRate;
DEC_RATE = defaultDecRate;
// Check if the Content-Type field is available in the CrawlDatum
if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
// Get the MIME-type of the current URL
String currentMime = MimeUtil.cleanMimeType(datum.getMetaData()
.get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString());
// Check if this MIME-type exists in our map
if (mimeMap.containsKey(currentMime)) {
// Yes, set the INC and DEC rates for this MIME-type
INC_RATE = mimeMap.get(currentMime).inc;
DEC_RATE = mimeMap.get(currentMime).dec;
}
}
return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
}
/**
* Reads the MIME-types and their associated INC/DEC factors into a HashMap.
*
* @param mimeFile
*          Reader for the tab-separated MIME-type/factor file
* @throws IOException
*           if the file cannot be read
*/
private void readMimeFile(Reader mimeFile) throws IOException {
// Instance of our mime/factor map
mimeMap = new HashMap<>();
// Open a reader
BufferedReader reader = new BufferedReader(mimeFile);
String line = null;
String[] splits = null;
// Read all lines
while ((line = reader.readLine()) != null) {
// Skip blank lines and comments
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
// Split the line by TAB
splits = line.split("\t");
// Sanity check, we need exactly three items
if (splits.length == 3) {
// Add a lower cased MIME-type and the factor to the map
mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(
Float.valueOf(splits[1]), Float.valueOf(splits[2])));
} else {
LOG.warn("Invalid configuration line: " + line);
}
}
}
// Close the reader now that the whole file has been processed
reader.close();
}
public static void main(String[] args) throws Exception {
FetchSchedule fs = new MimeAdaptiveFetchSchedule();
fs.setConf(NutchConfiguration.create());
// we start the time at 0, for simplicity
long curTime = 0;
long delta = 1000L * 3600L * 24L; // 1 day
// we trigger the update of the page every 30 days
long update = 1000L * 3600L * 24L * 30L; // 30 days
boolean changed = true;
long lastModified = 0;
int miss = 0;
int totalMiss = 0;
int maxMiss = 0;
int fetchCnt = 0;
int changeCnt = 0;
// initial fetchInterval is 30 days
CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
// Set a default MIME-type to test with
org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text(
"text/html; charset=utf-8"));
p.setMetaData(x);
p.setFetchTime(0);
LOG.info(p.toString());
// let's move the timeline a couple of deltas
for (int i = 0; i < 10000; i++) {
if (lastModified + update < curTime) {
// System.out.println("i=" + i + ", lastModified=" + lastModified +
// ", update=" + update + ", curTime=" + curTime);
changed = true;
changeCnt++;
lastModified = curTime;
}
LOG.info(i + ". " + changed + "\twill fetch at "
+ (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+ miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
fs.setFetchSchedule(new Text("http://www.example.com"), p, p
.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
changed ? FetchSchedule.STATUS_MODIFIED
: FetchSchedule.STATUS_NOTMODIFIED);
LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+ (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
if (!changed)
miss++;
if (miss > maxMiss)
maxMiss = miss;
changed = false;
totalMiss += miss;
miss = 0;
}
if (changed)
miss++;
curTime += delta;
}
LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+ " times.");
}
}