/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.nutch.crawl; |
| |
| import org.apache.hadoop.conf.Configurable; |
| import org.apache.hadoop.io.Text; |
| |
| /** |
| * This interface defines the contract for implementations that manipulate fetch |
| * times and re-fetch intervals. |
| * |
| * @author Andrzej Bialecki |
| */ |
| public interface FetchSchedule extends Configurable { |
| |
| /** It is unknown whether page was changed since our last visit. */ |
| public static final int STATUS_UNKNOWN = 0; |
| /** Page is known to have been modified since our last visit. */ |
| public static final int STATUS_MODIFIED = 1; |
| /** Page is known to remain unmodified since our last visit. */ |
| public static final int STATUS_NOTMODIFIED = 2; |
| |
| public static final int SECONDS_PER_DAY = 3600 * 24; |
| |
| /** |
| * Initialize fetch schedule related data. Implementations should at least set |
| * the <code>fetchTime</code> and <code>fetchInterval</code>. The default |
| * implementation set the <code>fetchTime</code> to now, using the default |
| * <code>fetchInterval</code>. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance to be initialized. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum initializeSchedule(Text url, CrawlDatum datum); |
| |
| /** |
| * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a |
| * successfully fetched page. Implementations may use supplied arguments to |
| * support different re-fetching schedules. |
| * |
| * @param url |
| * url of the page |
| * |
| * @param datum |
| * page description to be adjusted. NOTE: this instance, passed by |
| * reference, may be modified inside the method. |
| * |
| * @param prevFetchTime |
| * previous value of fetch time, or 0 if not available. |
| * |
| * @param prevModifiedTime |
| * previous value of modifiedTime, or 0 if not available. |
| * |
| * @param fetchTime |
| * the latest time, when the page was recently re-fetched. Most |
| * FetchSchedule implementations should update the value in @see |
| * CrawlDatum to something greater than this value. |
| * |
| * @param modifiedTime |
| * last time the content was modified. This information comes from |
| * the protocol implementations, or is set to < 0 if not available. |
| * Most FetchSchedule implementations should update the value in @see |
| * CrawlDatum to this value. |
| * |
| * @param state |
| * if {@link #STATUS_MODIFIED}, then the content is considered to be |
| * "changed" before the <code>fetchTime</code>, if |
| * {@link #STATUS_NOTMODIFIED} then the content is known to be |
| * unchanged. This information may be obtained by comparing page |
| * signatures before and after fetching. If this is set to |
| * {@link #STATUS_UNKNOWN}, then it is unknown whether the page was |
| * changed; implementations are free to follow a sensible default |
| * behavior. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum}. |
| */ |
| public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime, |
| long modifiedTime, int state); |
| |
| /** |
| * This method specifies how to schedule refetching of pages marked as GONE. |
| * Default implementation increases fetchInterval by 50%, and if it exceeds |
| * the <code>maxInterval</code> it calls |
| * {@link #forceRefetch(Text, CrawlDatum, boolean)}. |
| * |
| * @param url |
| * URL of the page |
| * |
| * @param datum |
| * datum instance to be adjusted. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime); |
| |
| /** |
| * This method adjusts the fetch schedule if fetching needs to be re-tried due |
| * to transient errors. The default implementation sets the next fetch time 1 |
| * day in the future and increases the retry counter. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * page information. |
| * |
| * @param prevFetchTime |
| * previous fetch time. |
| * |
| * @param prevModifiedTime |
| * previous modified time. |
| * |
| * @param fetchTime |
| * current fetch time. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime); |
| |
| /** |
| * Calculates last fetch time of the given CrawlDatum. |
| * |
| * @return the date as a long. |
| */ |
| public long calculateLastFetchTime(CrawlDatum datum); |
| |
| /** |
| * This method provides information whether the page is suitable for selection |
| * in the current fetchlist. NOTE: a true return value does not guarantee that |
| * the page will be fetched, it just allows it to be included in the further |
| * selection process based on scores. The default implementation checks |
| * <code>fetchTime</code>, if it is higher than the curTime it returns false, |
| * and true otherwise. It will also check that fetchTime is not too remote |
| * (more than <code>maxInterval</code>), in which case it lowers the interval |
| * and returns true. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance. |
| * |
| * @param curTime |
| * reference time (usually set to the time when the fetchlist |
| * generation process was started). |
| * |
| * @return true, if the page should be considered for inclusion in the current |
| * fetchlist, otherwise false. |
| */ |
| public boolean shouldFetch(Text url, CrawlDatum datum, long curTime); |
| |
| /** |
| * This method resets fetchTime, fetchInterval, modifiedTime and page |
| * signature, so that it forces refetching. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance. |
| * |
| * @param asap |
| * if true, force refetch as soon as possible - this sets the |
| * fetchTime to now. If false, force refetch whenever the next fetch |
| * time is set. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap); |
| } |