| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.crawl; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.conf.Configured; |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.lang.invoke.MethodHandles; |
| |
| /** |
| * This class provides common methods for implementations of |
| * <code>FetchSchedule</code>. |
| * |
| * @author Andrzej Bialecki |
| */ |
| public abstract class AbstractFetchSchedule extends Configured implements |
| FetchSchedule { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| protected int defaultInterval; |
| protected int maxInterval; |
| |
| public AbstractFetchSchedule() { |
| super(null); |
| } |
| |
| public AbstractFetchSchedule(Configuration conf) { |
| super(conf); |
| } |
| |
| public void setConf(Configuration conf) { |
| super.setConf(conf); |
| if (conf == null) |
| return; |
| defaultInterval = conf.getInt("db.fetch.interval.default", 0); |
| maxInterval = conf.getInt("db.fetch.interval.max", 0); |
| LOG.info("defaultInterval=" + defaultInterval); |
| LOG.info("maxInterval=" + maxInterval); |
| } |
| |
| /** |
| * Initialize fetch schedule related data. Implementations should at least set |
| * the <code>fetchTime</code> and <code>fetchInterval</code>. The default |
| * implementation sets the <code>fetchTime</code> to now, using the default |
| * <code>fetchInterval</code>. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance to be initialized (modified in place). |
| */ |
| public CrawlDatum initializeSchedule(Text url, CrawlDatum datum) { |
| datum.setFetchTime(System.currentTimeMillis()); |
| datum.setFetchInterval(defaultInterval); |
| datum.setRetriesSinceFetch(0); |
| return datum; |
| } |
| |
| /** |
| * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a |
| * successfully fetched page. NOTE: this implementation resets the retry |
| * counter - extending classes should call super.setFetchSchedule() to |
| * preserve this behavior. |
| */ |
| public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime, |
| long modifiedTime, int state) { |
| datum.setRetriesSinceFetch(0); |
| return datum; |
| } |
| |
| /** |
| * This method specifies how to schedule refetching of pages marked as GONE. |
| * Default implementation increases fetchInterval by 50% but the value may |
| * never exceed <code>maxInterval</code>. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance to be adjusted. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime) { |
| // no page is truly GONE ... just increase the interval by 50% |
| // and try much later. |
| if ((datum.getFetchInterval() * 1.5f) < maxInterval) |
| datum.setFetchInterval(datum.getFetchInterval() * 1.5f); |
| else |
| datum.setFetchInterval(maxInterval * 0.9f); |
| datum.setFetchTime(fetchTime + (long) datum.getFetchInterval() * 1000); |
| return datum; |
| } |
| |
| /** |
| * This method adjusts the fetch schedule if fetching needs to be re-tried due |
| * to transient errors. The default implementation sets the next fetch time 1 |
| * day in the future and increases the retry counter. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * page information. |
| * |
| * @param prevFetchTime |
| * previous fetch time. |
| * |
| * @param prevModifiedTime |
| * previous modified time. |
| * |
| * @param fetchTime |
| * current fetch time. |
| * |
| * @return adjusted page information, including all original information. |
| * NOTE: this may be a different instance than @see CrawlDatum, but |
| * implementations should make sure that it contains at least all |
| * information from @see CrawlDatum. |
| */ |
| public CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime) { |
| datum.setFetchTime(fetchTime + (long) SECONDS_PER_DAY * 1000); |
| datum.setRetriesSinceFetch(datum.getRetriesSinceFetch() + 1); |
| return datum; |
| } |
| |
| /** |
| * This method return the last fetch time of the CrawlDatum |
| * |
| * @return the date as a long. |
| */ |
| public long calculateLastFetchTime(CrawlDatum datum) { |
| if (datum.getStatus() == CrawlDatum.STATUS_DB_UNFETCHED) { |
| return 0L; |
| } else { |
| return datum.getFetchTime() - (long) datum.getFetchInterval() * 1000; |
| } |
| } |
| |
| /** |
| * This method provides information whether the page is suitable for selection |
| * in the current fetchlist. NOTE: a true return value does not guarantee that |
| * the page will be fetched, it just allows it to be included in the further |
| * selection process based on scores. The default implementation checks |
| * <code>fetchTime</code>, if it is higher than the <code>curTime</code> it |
| * returns false, and true otherwise. It will also check that fetchTime is not |
| * too remote (more than <code>maxInterval</code>, in which case it lowers the |
| * interval and returns true. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance. |
| * |
| * @param curTime |
| * reference time (usually set to the time when the fetchlist |
| * generation process was started). |
| * |
| * @return true, if the page should be considered for inclusion in the current |
| * fetchlist, otherwise false. |
| */ |
| public boolean shouldFetch(Text url, CrawlDatum datum, long curTime) { |
| // pages are never truly GONE - we have to check them from time to time. |
| // pages with too long a fetchInterval are adjusted so that they fit within |
| // a maximum fetchInterval (segment retention period). |
| if (datum.getFetchTime() - curTime > (long) maxInterval * 1000) { |
| if (datum.getFetchInterval() > maxInterval) { |
| datum.setFetchInterval(maxInterval * 0.9f); |
| } |
| datum.setFetchTime(curTime); |
| } |
| if (datum.getFetchTime() > curTime) { |
| return false; // not time yet |
| } |
| return true; |
| } |
| |
| /** |
| * This method resets fetchTime, fetchInterval, modifiedTime, |
| * retriesSinceFetch and page signature, so that it forces refetching. |
| * |
| * @param url |
| * URL of the page. |
| * |
| * @param datum |
| * datum instance. |
| * |
| * @param asap |
| * if true, force refetch as soon as possible - this sets the |
| * fetchTime to now. If false, force refetch whenever the next fetch |
| * time is set. |
| */ |
| public CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap) { |
| // reduce fetchInterval so that it fits within the max value |
| if (datum.getFetchInterval() > maxInterval) |
| datum.setFetchInterval(maxInterval * 0.9f); |
| datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); |
| datum.setRetriesSinceFetch(0); |
| datum.setSignature(null); |
| datum.setModifiedTime(0L); |
| if (asap) |
| datum.setFetchTime(System.currentTimeMillis()); |
| return datum; |
| } |
| |
| } |