| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.crawl; |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.lang.invoke.MethodHandles; |
| import java.util.HashMap; |
| |
| import org.apache.commons.lang.StringUtils; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.metadata.HttpHeaders; |
| import org.apache.nutch.util.MimeUtil; |
| import org.apache.nutch.util.NutchConfiguration; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
 * Extension of {@link AdaptiveFetchSchedule} that allows for more flexible
| * configuration of DEC and INC factors for various MIME-types. |
| * |
| * This class can be typically used in cases where a recrawl consists of many |
| * different MIME-types. It's not very common for MIME-types other than |
| * text/html to change frequently. Using this class you can configure different |
| * factors per MIME-type so to prefer frequently changing MIME-types over |
| * others. |
| * |
| * For it to work this class relies on the Content-Type MetaData key being |
| * present in the CrawlDB. This can either be done when injecting new URL's or |
| * by adding "Content-Type" to the db.parsemeta.to.crawldb configuration setting |
| * to force MIME-types of newly discovered URL's to be added to the CrawlDB. |
| * |
| * @author markus |
| */ |
| public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule { |
| // Loggg |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| // Conf directives |
| public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate"; |
| public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate"; |
| public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file"; |
| |
| // Default values for DEC and INC rate |
| private float defaultIncRate; |
| private float defaultDecRate; |
| |
| // Structure to store inc and dec rates per MIME-type |
| private class AdaptiveRate { |
| public float inc; |
| public float dec; |
| |
| public AdaptiveRate(Float inc, Float dec) { |
| this.inc = inc; |
| this.dec = dec; |
| } |
| } |
| |
| // Here we store the mime's and their delta's |
| private HashMap<String, AdaptiveRate> mimeMap; |
| |
| public void setConf(Configuration conf) { |
| super.setConf(conf); |
| if (conf == null) |
| return; |
| |
| // Read and set the default INC and DEC rates in case we cannot set values |
| // based on MIME-type |
| defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f); |
| defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f); |
| |
| // Where's the mime/factor file? |
| Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE, |
| "adaptive-mimetypes.txt")); |
| |
| try { |
| readMimeFile(mimeFile); |
| } catch (IOException e) { |
| LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e)); |
| } |
| } |
| |
| @Override |
| public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, |
| long prevFetchTime, long prevModifiedTime, long fetchTime, |
| long modifiedTime, int state) { |
| |
| // Set defaults |
| INC_RATE = defaultIncRate; |
| DEC_RATE = defaultDecRate; |
| |
| // Check if the Content-Type field is available in the CrawlDatum |
| if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) { |
| // Get the MIME-type of the current URL |
| String currentMime = MimeUtil.cleanMimeType(datum.getMetaData() |
| .get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString()); |
| |
| // Check if this MIME-type exists in our map |
| if (mimeMap.containsKey(currentMime)) { |
| // Yes, set the INC and DEC rates for this MIME-type |
| INC_RATE = mimeMap.get(currentMime).inc; |
| DEC_RATE = mimeMap.get(currentMime).dec; |
| } |
| } |
| |
| return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime, |
| fetchTime, modifiedTime, state); |
| } |
| |
| /** |
| * Reads the mime types and their associated INC/DEC factors in a HashMap |
| * |
| * @param mimeFile |
| * Reader |
| * @return void |
| */ |
| private void readMimeFile(Reader mimeFile) throws IOException { |
| // Instance of our mime/factor map |
| mimeMap = new HashMap<>(); |
| |
| // Open a reader |
| BufferedReader reader = new BufferedReader(mimeFile); |
| |
| String line = null; |
| String[] splits = null; |
| |
| // Read all lines |
| while ((line = reader.readLine()) != null) { |
| // Skip blank lines and comments |
| if (StringUtils.isNotBlank(line) && !line.startsWith("#")) { |
| // Split the line by TAB |
| splits = line.split("\t"); |
| |
| // Sanity check, we need two or three items |
| if (splits.length == 3) { |
| // Add a lower cased MIME-type and the factor to the map |
| mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate( |
| Float.valueOf(splits[1]), Float.valueOf(splits[2]))); |
| } else { |
| LOG.warn("Invalid configuration line in: " + line); |
| } |
| } |
| } |
| } |
| |
| public static void main(String[] args) throws Exception { |
| FetchSchedule fs = new MimeAdaptiveFetchSchedule(); |
| fs.setConf(NutchConfiguration.create()); |
| // we start the time at 0, for simplicity |
| long curTime = 0; |
| long delta = 1000L * 3600L * 24L; // 2 hours |
| // we trigger the update of the page every 30 days |
| long update = 1000L * 3600L * 24L * 30L; // 30 days |
| boolean changed = true; |
| long lastModified = 0; |
| int miss = 0; |
| int totalMiss = 0; |
| int maxMiss = 0; |
| int fetchCnt = 0; |
| int changeCnt = 0; |
| |
| // initial fetchInterval is 10 days |
| CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f); |
| |
| // Set a default MIME-type to test with |
| org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable(); |
| x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text( |
| "text/html; charset=utf-8")); |
| p.setMetaData(x); |
| |
| p.setFetchTime(0); |
| LOG.info(p.toString()); |
| |
| // let's move the timeline a couple of deltas |
| for (int i = 0; i < 10000; i++) { |
| if (lastModified + update < curTime) { |
| // System.out.println("i=" + i + ", lastModified=" + lastModified + |
| // ", update=" + update + ", curTime=" + curTime); |
| changed = true; |
| changeCnt++; |
| lastModified = curTime; |
| } |
| |
| LOG.info(i + ". " + changed + "\twill fetch at " |
| + (p.getFetchTime() / delta) + "\tinterval " |
| + (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed " |
| + miss); |
| |
| if (p.getFetchTime() <= curTime) { |
| fetchCnt++; |
| fs.setFetchSchedule(new Text("http://www.example.com"), p, p |
| .getFetchTime(), p.getModifiedTime(), curTime, lastModified, |
| changed ? FetchSchedule.STATUS_MODIFIED |
| : FetchSchedule.STATUS_NOTMODIFIED); |
| |
| LOG.info("\tfetched & adjusted: " + "\twill fetch at " |
| + (p.getFetchTime() / delta) + "\tinterval " |
| + (p.getFetchInterval() / SECONDS_PER_DAY) + " days"); |
| |
| if (!changed) |
| miss++; |
| if (miss > maxMiss) |
| maxMiss = miss; |
| changed = false; |
| totalMiss += miss; |
| miss = 0; |
| } |
| |
| if (changed) |
| miss++; |
| curTime += delta; |
| } |
| LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss); |
| LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt |
| + " times."); |
| } |
| |
| } |