src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.crawl;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.nutch.storage.WebPage;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.util.HashSet;
 import java.util.Set;

 /**
  * Base class holding the scheduling logic shared by implementations of
  * {@link FetchSchedule}.
  *
  * <p>
  * Fetch intervals handled here are multiplied by 1000 before being added to
  * or subtracted from fetch times, i.e. intervals are treated as seconds and
  * fetch times as milliseconds.
  *
  * @author Andrzej Bialecki
  */
 public abstract class AbstractFetchSchedule extends Configured implements
     FetchSchedule {
   private static final Logger LOG = LoggerFactory
       .getLogger(AbstractFetchSchedule.class);

   /** Value of <code>db.fetch.interval.default</code>, 0 if not configured. */
   protected int defaultInterval;

   /** Value of <code>db.fetch.interval.max</code>, 0 if not configured. */
   protected int maxInterval;

   /** Page fields read or written by the methods of this class. */
   private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

   static {
     FIELDS.add(WebPage.Field.FETCH_TIME);
     FIELDS.add(WebPage.Field.RETRIES_SINCE_FETCH);
     FIELDS.add(WebPage.Field.FETCH_INTERVAL);
   }

   /** Creates an unconfigured schedule; both intervals stay at 0. */
   public AbstractFetchSchedule() {
     super(null);
   }

   /**
    * Creates a schedule using the given configuration.
    *
    * @param conf
    *          configuration handed to the superclass constructor
    */
   public AbstractFetchSchedule(Configuration conf) {
     super(conf);
   }

   /**
    * Stores the configuration and reads <code>db.fetch.interval.default</code>
    * and <code>db.fetch.interval.max</code> from it (each defaulting to 0). A
    * <code>null</code> configuration is stored but leaves the intervals
    * untouched.
    *
    * @param conf
    *          configuration to read from, may be <code>null</code>
    */
   @Override
   public void setConf(Configuration conf) {
     super.setConf(conf);
     if (conf == null) {
       return;
     }
     defaultInterval = conf.getInt("db.fetch.interval.default", 0);
     maxInterval = conf.getInt("db.fetch.interval.max", 0);
     LOG.info("defaultInterval={}", defaultInterval);
     LOG.info("maxInterval={}", maxInterval);
   }

   /**
    * Initialize fetch schedule related data. Implementations should at least
    * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
    * implementation sets the <code>fetchTime</code> to now, the
    * <code>fetchInterval</code> to the default interval and the retry counter
    * to 0.
    *
    * @param url
    *          URL of the page (not used by this implementation)
    * @param page
    *          page whose schedule fields are initialized
    */
   @Override
   public void initializeSchedule(String url, WebPage page) {
     page.setFetchTime(System.currentTimeMillis());
     page.setFetchInterval(defaultInterval);
     page.setRetriesSinceFetch(0);
   }

   /**
    * Hook for updating the schedule of a successfully fetched page. This
    * implementation only resets the retry counter and ignores every other
    * argument; extending classes that set <code>fetchInterval</code> and
    * <code>fetchTime</code> should call super.setFetchSchedule() to preserve
    * that reset.
    *
    * @param url
    *          URL of the page
    * @param page
    *          page whose retry counter is reset
    * @param prevFetchTime
    *          previous fetch time
    * @param prevModifiedTime
    *          previous modified time
    * @param fetchTime
    *          current fetch time
    * @param modifiedTime
    *          current modified time
    * @param state
    *          change state of the page
    */
   @Override
   public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
       long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
     page.setRetriesSinceFetch(0);
   }

   /**
    * This method specifies how to schedule refetching of pages marked as GONE.
    * The default implementation increases <code>fetchInterval</code> by 50%;
    * if the increased value would reach <code>maxInterval</code>, the interval
    * is set to 90% of <code>maxInterval</code> instead. The next
    * <code>fetchTime</code> is then <code>fetchTime</code> plus the new
    * interval.
    *
    * @param url
    *          URL of the page
    * @param page
    *          page whose interval and fetch time are updated in place
    * @param prevFetchTime
    *          previous fetch time (not used by this implementation)
    * @param prevModifiedTime
    *          previous modified time (not used by this implementation)
    * @param fetchTime
    *          current fetch time
    */
   @Override
   public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
       long prevModifiedTime, long fetchTime) {
     // no page is truly GONE ... just increase the interval by 50%
     // and try much later.
     float increased = page.getFetchInterval() * 1.5f;
     if (increased < maxInterval) {
       page.setFetchInterval((int) increased);
     } else {
       page.setFetchInterval((int) (maxInterval * 0.9f));
     }
     page.setFetchTime(fetchTime + page.getFetchInterval() * 1000L);
   }

   /**
    * This method adjusts the fetch schedule if fetching needs to be re-tried
    * due to transient errors. The default implementation sets the next fetch
    * time <code>SECONDS_PER_DAY</code> seconds after <code>fetchTime</code>
    * and increases the retry counter by one.
    *
    * @param url
    *          URL of the page
    * @param page
    *          page whose fetch time and retry counter are updated in place
    * @param prevFetchTime
    *          previous fetch time
    * @param prevModifiedTime
    *          previous modified time
    * @param fetchTime
    *          current fetch time
    */
   @Override
   public void setPageRetrySchedule(String url, WebPage page,
       long prevFetchTime, long prevModifiedTime, long fetchTime) {
     page.setFetchTime(fetchTime + SECONDS_PER_DAY * 1000L);
     page.setRetriesSinceFetch(page.getRetriesSinceFetch() + 1);
   }

   /**
    * Derives the last fetch time of the page as its <code>fetchTime</code>
    * minus its <code>fetchInterval</code>.
    *
    * @param page
    *          page to inspect
    * @return the date as a long.
    */
   @Override
   public long calculateLastFetchTime(WebPage page) {
     return page.getFetchTime() - page.getFetchInterval() * 1000L;
   }

   /**
    * This method provides information whether the page is suitable for
    * selection in the current fetchlist. NOTE: a true return value does not
    * guarantee that the page will be fetched, it just allows it to be included
    * in the further selection process based on scores. The default
    * implementation returns false if <code>fetchTime</code> is later than
    * <code>curTime</code>, and true otherwise. It also checks that
    * <code>fetchTime</code> is not too remote (more than
    * <code>maxInterval</code> ahead of <code>curTime</code>), in which case it
    * lowers an oversized interval, resets <code>fetchTime</code> to
    * <code>curTime</code> and returns true.
    *
    * @param url
    *          URL of the page
    * @param page
    *          page to check; may be modified as described above
    * @param curTime
    *          reference time (usually set to the time when the fetchlist
    *          generation process was started).
    * @return true, if the page should be considered for inclusion in the
    *         current fetchlist, otherwise false.
    */
   @Override
   public boolean shouldFetch(String url, WebPage page, long curTime) {
     // pages are never truly GONE - we have to check them from time to time.
     // pages with too long fetchInterval are adjusted so that they fit within
     // maximum fetchInterval (batch retention period).
     long fetchTime = page.getFetchTime();
     if (fetchTime - curTime > maxInterval * 1000L) {
       shrinkOversizedInterval(page);
       page.setFetchTime(curTime);
       // keep the local in sync with the page so that the decision below is
       // based on the adjusted time and the page becomes eligible right away
       fetchTime = curTime;
     }
     return fetchTime <= curTime;
   }

   /**
    * Prepares the page for being fetched again: an oversized
    * <code>fetchInterval</code> is lowered, the status is set to
    * <code>STATUS_UNFETCHED</code>, and <code>retriesSinceFetch</code> and
    * <code>modifiedTime</code> are reset to 0. The page signature is left
    * untouched.
    *
    * @param url
    *          URL of the page
    * @param page
    *          page to modify in place
    * @param asap
    *          if true, force refetch as soon as possible - this sets the
    *          fetchTime to now. If false, fetchTime is left unchanged.
    */
   @Override
   public void forceRefetch(String url, WebPage page, boolean asap) {
     // reduce fetchInterval so that it fits within the max value
     shrinkOversizedInterval(page);
     page.setStatus((int) CrawlStatus.STATUS_UNFETCHED);
     page.setRetriesSinceFetch(0);
     // TODO: row.setSignature(null) ??
     page.setModifiedTime(0L);
     if (asap) {
       page.setFetchTime(System.currentTimeMillis());
     }
   }

   /**
    * Returns the page fields used by this class: <code>FETCH_TIME</code>,
    * <code>RETRIES_SINCE_FETCH</code> and <code>FETCH_INTERVAL</code>.
    *
    * @return a new set on every call, so that callers may modify the result
    *         without affecting the shared static set
    */
   public Set<WebPage.Field> getFields() {
     return new HashSet<WebPage.Field>(FIELDS);
   }

   /**
    * Lowers the page's fetchInterval to 90% of <code>maxInterval</code>
    * (rounded) when it currently exceeds <code>maxInterval</code>.
    */
   private void shrinkOversizedInterval(WebPage page) {
     if (page.getFetchInterval() > maxInterval) {
       page.setFetchInterval(Math.round(maxInterval * 0.9f));
     }
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.crawl;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.conf.Configured;
	import org.apache.nutch.storage.WebPage;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import java.util.HashSet;
	import java.util.Set;

	/**
	 * Base class holding the scheduling logic shared by implementations of
	 * {@link FetchSchedule}.
	 *
	 * <p>
	 * Fetch intervals handled here are multiplied by 1000 before being added to
	 * or subtracted from fetch times, i.e. intervals are treated as seconds and
	 * fetch times as milliseconds.
	 *
	 * @author Andrzej Bialecki
	 */
	public abstract class AbstractFetchSchedule extends Configured implements
	    FetchSchedule {
	  private static final Logger LOG = LoggerFactory
	      .getLogger(AbstractFetchSchedule.class);

	  /** Value of <code>db.fetch.interval.default</code>, 0 if not configured. */
	  protected int defaultInterval;

	  /** Value of <code>db.fetch.interval.max</code>, 0 if not configured. */
	  protected int maxInterval;

	  /** Page fields read or written by the methods of this class. */
	  private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();

	  static {
	    FIELDS.add(WebPage.Field.FETCH_TIME);
	    FIELDS.add(WebPage.Field.RETRIES_SINCE_FETCH);
	    FIELDS.add(WebPage.Field.FETCH_INTERVAL);
	  }

	  /** Creates an unconfigured schedule; both intervals stay at 0. */
	  public AbstractFetchSchedule() {
	    super(null);
	  }

	  /**
	   * Creates a schedule using the given configuration.
	   *
	   * @param conf
	   *          configuration handed to the superclass constructor
	   */
	  public AbstractFetchSchedule(Configuration conf) {
	    super(conf);
	  }

	  /**
	   * Stores the configuration and reads <code>db.fetch.interval.default</code>
	   * and <code>db.fetch.interval.max</code> from it (each defaulting to 0). A
	   * <code>null</code> configuration is stored but leaves the intervals
	   * untouched.
	   *
	   * @param conf
	   *          configuration to read from, may be <code>null</code>
	   */
	  @Override
	  public void setConf(Configuration conf) {
	    super.setConf(conf);
	    if (conf == null) {
	      return;
	    }
	    defaultInterval = conf.getInt("db.fetch.interval.default", 0);
	    maxInterval = conf.getInt("db.fetch.interval.max", 0);
	    LOG.info("defaultInterval={}", defaultInterval);
	    LOG.info("maxInterval={}", maxInterval);
	  }

	  /**
	   * Initialize fetch schedule related data. Implementations should at least
	   * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
	   * implementation sets the <code>fetchTime</code> to now, the
	   * <code>fetchInterval</code> to the default interval and the retry counter
	   * to 0.
	   *
	   * @param url
	   *          URL of the page (not used by this implementation)
	   * @param page
	   *          page whose schedule fields are initialized
	   */
	  @Override
	  public void initializeSchedule(String url, WebPage page) {
	    page.setFetchTime(System.currentTimeMillis());
	    page.setFetchInterval(defaultInterval);
	    page.setRetriesSinceFetch(0);
	  }

	  /**
	   * Hook for updating the schedule of a successfully fetched page. This
	   * implementation only resets the retry counter and ignores every other
	   * argument; extending classes that set <code>fetchInterval</code> and
	   * <code>fetchTime</code> should call super.setFetchSchedule() to preserve
	   * that reset.
	   *
	   * @param url
	   *          URL of the page
	   * @param page
	   *          page whose retry counter is reset
	   * @param prevFetchTime
	   *          previous fetch time
	   * @param prevModifiedTime
	   *          previous modified time
	   * @param fetchTime
	   *          current fetch time
	   * @param modifiedTime
	   *          current modified time
	   * @param state
	   *          change state of the page
	   */
	  @Override
	  public void setFetchSchedule(String url, WebPage page, long prevFetchTime,
	      long prevModifiedTime, long fetchTime, long modifiedTime, int state) {
	    page.setRetriesSinceFetch(0);
	  }

	  /**
	   * This method specifies how to schedule refetching of pages marked as GONE.
	   * The default implementation increases <code>fetchInterval</code> by 50%;
	   * if the increased value would reach <code>maxInterval</code>, the interval
	   * is set to 90% of <code>maxInterval</code> instead. The next
	   * <code>fetchTime</code> is then <code>fetchTime</code> plus the new
	   * interval.
	   *
	   * @param url
	   *          URL of the page
	   * @param page
	   *          page whose interval and fetch time are updated in place
	   * @param prevFetchTime
	   *          previous fetch time (not used by this implementation)
	   * @param prevModifiedTime
	   *          previous modified time (not used by this implementation)
	   * @param fetchTime
	   *          current fetch time
	   */
	  @Override
	  public void setPageGoneSchedule(String url, WebPage page, long prevFetchTime,
	      long prevModifiedTime, long fetchTime) {
	    // no page is truly GONE ... just increase the interval by 50%
	    // and try much later.
	    float increased = page.getFetchInterval() * 1.5f;
	    if (increased < maxInterval) {
	      page.setFetchInterval((int) increased);
	    } else {
	      page.setFetchInterval((int) (maxInterval * 0.9f));
	    }
	    page.setFetchTime(fetchTime + page.getFetchInterval() * 1000L);
	  }

	  /**
	   * This method adjusts the fetch schedule if fetching needs to be re-tried
	   * due to transient errors. The default implementation sets the next fetch
	   * time <code>SECONDS_PER_DAY</code> seconds after <code>fetchTime</code>
	   * and increases the retry counter by one.
	   *
	   * @param url
	   *          URL of the page
	   * @param page
	   *          page whose fetch time and retry counter are updated in place
	   * @param prevFetchTime
	   *          previous fetch time
	   * @param prevModifiedTime
	   *          previous modified time
	   * @param fetchTime
	   *          current fetch time
	   */
	  @Override
	  public void setPageRetrySchedule(String url, WebPage page,
	      long prevFetchTime, long prevModifiedTime, long fetchTime) {
	    page.setFetchTime(fetchTime + SECONDS_PER_DAY * 1000L);
	    page.setRetriesSinceFetch(page.getRetriesSinceFetch() + 1);
	  }

	  /**
	   * Derives the last fetch time of the page as its <code>fetchTime</code>
	   * minus its <code>fetchInterval</code>.
	   *
	   * @param page
	   *          page to inspect
	   * @return the date as a long.
	   */
	  @Override
	  public long calculateLastFetchTime(WebPage page) {
	    return page.getFetchTime() - page.getFetchInterval() * 1000L;
	  }

	  /**
	   * This method provides information whether the page is suitable for
	   * selection in the current fetchlist. NOTE: a true return value does not
	   * guarantee that the page will be fetched, it just allows it to be included
	   * in the further selection process based on scores. The default
	   * implementation returns false if <code>fetchTime</code> is later than
	   * <code>curTime</code>, and true otherwise. It also checks that
	   * <code>fetchTime</code> is not too remote (more than
	   * <code>maxInterval</code> ahead of <code>curTime</code>), in which case it
	   * lowers an oversized interval, resets <code>fetchTime</code> to
	   * <code>curTime</code> and returns true.
	   *
	   * @param url
	   *          URL of the page
	   * @param page
	   *          page to check; may be modified as described above
	   * @param curTime
	   *          reference time (usually set to the time when the fetchlist
	   *          generation process was started).
	   * @return true, if the page should be considered for inclusion in the
	   *         current fetchlist, otherwise false.
	   */
	  @Override
	  public boolean shouldFetch(String url, WebPage page, long curTime) {
	    // pages are never truly GONE - we have to check them from time to time.
	    // pages with too long fetchInterval are adjusted so that they fit within
	    // maximum fetchInterval (batch retention period).
	    long fetchTime = page.getFetchTime();
	    if (fetchTime - curTime > maxInterval * 1000L) {
	      shrinkOversizedInterval(page);
	      page.setFetchTime(curTime);
	      // keep the local in sync with the page so that the decision below is
	      // based on the adjusted time and the page becomes eligible right away
	      fetchTime = curTime;
	    }
	    return fetchTime <= curTime;
	  }

	  /**
	   * Prepares the page for being fetched again: an oversized
	   * <code>fetchInterval</code> is lowered, the status is set to
	   * <code>STATUS_UNFETCHED</code>, and <code>retriesSinceFetch</code> and
	   * <code>modifiedTime</code> are reset to 0. The page signature is left
	   * untouched.
	   *
	   * @param url
	   *          URL of the page
	   * @param page
	   *          page to modify in place
	   * @param asap
	   *          if true, force refetch as soon as possible - this sets the
	   *          fetchTime to now. If false, fetchTime is left unchanged.
	   */
	  @Override
	  public void forceRefetch(String url, WebPage page, boolean asap) {
	    // reduce fetchInterval so that it fits within the max value
	    shrinkOversizedInterval(page);
	    page.setStatus((int) CrawlStatus.STATUS_UNFETCHED);
	    page.setRetriesSinceFetch(0);
	    // TODO: row.setSignature(null) ??
	    page.setModifiedTime(0L);
	    if (asap) {
	      page.setFetchTime(System.currentTimeMillis());
	    }
	  }

	  /**
	   * Returns the page fields used by this class: <code>FETCH_TIME</code>,
	   * <code>RETRIES_SINCE_FETCH</code> and <code>FETCH_INTERVAL</code>.
	   *
	   * @return a new set on every call, so that callers may modify the result
	   *         without affecting the shared static set
	   */
	  public Set<WebPage.Field> getFields() {
	    return new HashSet<WebPage.Field>(FIELDS);
	  }

	  /**
	   * Lowers the page's fetchInterval to 90% of <code>maxInterval</code>
	   * (rounded) when it currently exceeds <code>maxInterval</code>.
	   */
	  private void shrinkOversizedInterval(WebPage page) {
	    if (page.getFetchInterval() > maxInterval) {
	      page.setFetchInterval(Math.round(maxInterval * 0.9f));
	    }
	  }
	}