/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;
/**
 * This interface defines the contract for implementations that manipulate fetch
 * times and re-fetch intervals.
 *
 * @author Andrzej Bialecki
 */
public interface FetchSchedule extends Configurable {

  /** It is unknown whether page was changed since our last visit. */
  public static final int STATUS_UNKNOWN = 0;

  /** Page is known to have been modified since our last visit. */
  public static final int STATUS_MODIFIED = 1;

  /** Page is known to remain unmodified since our last visit. */
  public static final int STATUS_NOTMODIFIED = 2;

  /** Number of seconds in one day, for converting day-based intervals. */
  public static final int SECONDS_PER_DAY = 3600 * 24;

  /**
   * Initialize fetch schedule related data. Implementations should at least set
   * the <code>fetchTime</code> and <code>fetchInterval</code>. The default
   * implementation sets the <code>fetchTime</code> to now, using the default
   * <code>fetchInterval</code>.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance to be initialized.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the passed
   *         <code>datum</code>, but implementations should make sure that it
   *         contains at least all information from <code>datum</code>.
   */
  public CrawlDatum initializeSchedule(Text url, CrawlDatum datum);

  /**
   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
   * successfully fetched page. Implementations may use supplied arguments to
   * support different re-fetching schedules.
   *
   * @param url
   *          url of the page
   * @param datum
   *          page description to be adjusted. NOTE: this instance, passed by
   *          reference, may be modified inside the method.
   * @param prevFetchTime
   *          previous value of fetch time, or 0 if not available.
   * @param prevModifiedTime
   *          previous value of modifiedTime, or 0 if not available.
   * @param fetchTime
   *          the latest time, when the page was recently re-fetched. Most
   *          FetchSchedule implementations should update the value in
   *          {@link CrawlDatum} to something greater than this value.
   * @param modifiedTime
   *          last time the content was modified. This information comes from
   *          the protocol implementations, or is set to &lt; 0 if not
   *          available. Most FetchSchedule implementations should update the
   *          value in {@link CrawlDatum} to this value.
   * @param state
   *          if {@link #STATUS_MODIFIED}, then the content is considered to be
   *          "changed" before the <code>fetchTime</code>, if
   *          {@link #STATUS_NOTMODIFIED} then the content is known to be
   *          unchanged. This information may be obtained by comparing page
   *          signatures before and after fetching. If this is set to
   *          {@link #STATUS_UNKNOWN}, then it is unknown whether the page was
   *          changed; implementations are free to follow a sensible default
   *          behavior.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the passed
   *         <code>datum</code>, but implementations should make sure that it
   *         contains at least all information from <code>datum</code>.
   */
  public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime,
      long modifiedTime, int state);

  /**
   * This method specifies how to schedule refetching of pages marked as GONE.
   * Default implementation increases fetchInterval by 50%, and if it exceeds
   * the <code>maxInterval</code> it calls
   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
   *
   * @param url
   *          URL of the page
   * @param datum
   *          datum instance to be adjusted.
   * @param prevFetchTime
   *          previous value of fetch time, or 0 if not available.
   * @param prevModifiedTime
   *          previous value of modifiedTime, or 0 if not available.
   * @param fetchTime
   *          the current fetch time.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the passed
   *         <code>datum</code>, but implementations should make sure that it
   *         contains at least all information from <code>datum</code>.
   */
  public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * This method adjusts the fetch schedule if fetching needs to be re-tried due
   * to transient errors. The default implementation sets the next fetch time 1
   * day in the future and increases the retry counter.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          page information.
   * @param prevFetchTime
   *          previous fetch time.
   * @param prevModifiedTime
   *          previous modified time.
   * @param fetchTime
   *          current fetch time.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the passed
   *         <code>datum</code>, but implementations should make sure that it
   *         contains at least all information from <code>datum</code>.
   */
  public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * Calculates last fetch time of the given CrawlDatum.
   *
   * @param datum
   *          datum instance to read the fetch time from.
   * @return the date as a long.
   */
  public long calculateLastFetchTime(CrawlDatum datum);

  /**
   * This method provides information whether the page is suitable for selection
   * in the current fetchlist. NOTE: a true return value does not guarantee that
   * the page will be fetched, it just allows it to be included in the further
   * selection process based on scores. The default implementation checks
   * <code>fetchTime</code>, if it is higher than the curTime it returns false,
   * and true otherwise. It will also check that fetchTime is not too remote
   * (more than <code>maxInterval</code>), in which case it lowers the interval
   * and returns true.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance.
   * @param curTime
   *          reference time (usually set to the time when the fetchlist
   *          generation process was started).
   * @return true, if the page should be considered for inclusion in the current
   *         fetchlist, otherwise false.
   */
  public boolean shouldFetch(Text url, CrawlDatum datum, long curTime);

  /**
   * This method resets fetchTime, fetchInterval, modifiedTime and page
   * signature, so that it forces refetching.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance.
   * @param asap
   *          if true, force refetch as soon as possible - this sets the
   *          fetchTime to now. If false, force refetch whenever the next fetch
   *          time is set.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the passed
   *         <code>datum</code>, but implementations should make sure that it
   *         contains at least all information from <code>datum</code>.
   */
  public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
}