/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Extension of {@link AdaptiveFetchSchedule} that allows for more flexible
* configuration of DEC and INC factors for various MIME-types.
*
* This class can typically be used in cases where a recrawl consists of many
* different MIME-types. It is not very common for MIME-types other than
* text/html to change frequently. Using this class you can configure different
* factors per MIME-type so as to prefer frequently changing MIME-types over
* others.
*
* For this to work, the class relies on the Content-Type MetaData key being
* present in the CrawlDB. This can either be set when injecting new URLs or
* by adding "Content-Type" to the db.parsemeta.to.crawldb configuration
* setting to force the MIME-types of newly discovered URLs to be added to the
* CrawlDB.
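*
* The per MIME-type factors are read from the plain text file configured with
* db.fetch.schedule.mime.file (adaptive-mimetypes.txt by default). Each
* non-blank, non-comment line holds a MIME-type followed by its INC and DEC
* factors, separated by tabs. The factors below are examples only; tune them
* for your own crawl:
*
* <pre>
* # mime-type&lt;TAB&gt;inc_rate&lt;TAB&gt;dec_rate
* text/html	0.2	0.2
* application/pdf	0.1	0.4
* </pre>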
*
* @author markus
*/
public class MimeAdaptiveFetchSchedule extends AdaptiveFetchSchedule {
// Logger
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
// Conf directives
public static final String SCHEDULE_INC_RATE = "db.fetch.schedule.adaptive.inc_rate";
public static final String SCHEDULE_DEC_RATE = "db.fetch.schedule.adaptive.dec_rate";
public static final String SCHEDULE_MIME_FILE = "db.fetch.schedule.mime.file";
// Default values for DEC and INC rate
private float defaultIncRate;
private float defaultDecRate;
// Structure to store inc and dec rates per MIME-type
private class AdaptiveRate {
public float inc;
public float dec;
public AdaptiveRate(Float inc, Float dec) {
this.inc = inc;
this.dec = dec;
}
}
// Here we store the MIME-types and their INC/DEC rates
private HashMap<String, AdaptiveRate> mimeMap;
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf == null)
return;
// Read and set the default INC and DEC rates in case we cannot set values
// based on MIME-type
defaultIncRate = conf.getFloat(SCHEDULE_INC_RATE, 0.2f);
defaultDecRate = conf.getFloat(SCHEDULE_DEC_RATE, 0.2f);
// Where's the mime/factor file?
Reader mimeFile = conf.getConfResourceAsReader(conf.get(SCHEDULE_MIME_FILE,
"adaptive-mimetypes.txt"));
try {
readMimeFile(mimeFile);
} catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
@Override
public CrawlDatum setFetchSchedule(Text url, CrawlDatum datum,
long prevFetchTime, long prevModifiedTime, long fetchTime,
long modifiedTime, int state) {
// Set defaults
INC_RATE = defaultIncRate;
DEC_RATE = defaultDecRate;
// Check if the Content-Type field is available in the CrawlDatum
if (datum.getMetaData().containsKey(HttpHeaders.WRITABLE_CONTENT_TYPE)) {
// Get the MIME-type of the current URL
String currentMime = MimeUtil.cleanMimeType(datum.getMetaData()
.get(HttpHeaders.WRITABLE_CONTENT_TYPE).toString());
// Check if this MIME-type exists in our map
if (mimeMap.containsKey(currentMime)) {
// Yes, set the INC and DEC rates for this MIME-type
INC_RATE = mimeMap.get(currentMime).inc;
DEC_RATE = mimeMap.get(currentMime).dec;
}
}
return super.setFetchSchedule(url, datum, prevFetchTime, prevModifiedTime,
fetchTime, modifiedTime, state);
}
/**
* Reads the MIME-types and their associated INC/DEC factors into a HashMap.
*
* @param mimeFile
*          Reader for the tab-separated MIME-type/factor file
* @throws IOException
*           if the file cannot be read
*/
private void readMimeFile(Reader mimeFile) throws IOException {
// Instance of our mime/factor map
mimeMap = new HashMap<>();
// Open a reader
BufferedReader reader = new BufferedReader(mimeFile);
String line = null;
String[] splits = null;
// Read all lines
while ((line = reader.readLine()) != null) {
// Skip blank lines and comments
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
// Split the line by TAB
splits = line.split("\t");
// Sanity check, we need exactly three items
if (splits.length == 3) {
// Add a lower cased MIME-type and the factor to the map
mimeMap.put(StringUtils.lowerCase(splits[0]), new AdaptiveRate(
Float.valueOf(splits[1]), Float.valueOf(splits[2])));
} else {
LOG.warn("Invalid configuration line: " + line);
}
}
}
// Close the reader now that the whole file has been processed
reader.close();
}
public static void main(String[] args) throws Exception {
FetchSchedule fs = new MimeAdaptiveFetchSchedule();
fs.setConf(NutchConfiguration.create());
// we start the time at 0, for simplicity
long curTime = 0;
long delta = 1000L * 3600L * 24L; // 1 day
// we trigger the update of the page every 30 days
long update = 1000L * 3600L * 24L * 30L; // 30 days
boolean changed = true;
long lastModified = 0;
int miss = 0;
int totalMiss = 0;
int maxMiss = 0;
int fetchCnt = 0;
int changeCnt = 0;
// initial fetchInterval is 30 days
CrawlDatum p = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
// Set a default MIME-type to test with
org.apache.hadoop.io.MapWritable x = new org.apache.hadoop.io.MapWritable();
x.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text(
"text/html; charset=utf-8"));
p.setMetaData(x);
p.setFetchTime(0);
LOG.info(p.toString());
// let's move the timeline a couple of deltas
for (int i = 0; i < 10000; i++) {
if (lastModified + update < curTime) {
// System.out.println("i=" + i + ", lastModified=" + lastModified +
// ", update=" + update + ", curTime=" + curTime);
changed = true;
changeCnt++;
lastModified = curTime;
}
LOG.info(i + ". " + changed + "\twill fetch at "
+ (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY) + " days" + "\t missed "
+ miss);
if (p.getFetchTime() <= curTime) {
fetchCnt++;
fs.setFetchSchedule(new Text("http://www.example.com"), p, p
.getFetchTime(), p.getModifiedTime(), curTime, lastModified,
changed ? FetchSchedule.STATUS_MODIFIED
: FetchSchedule.STATUS_NOTMODIFIED);
LOG.info("\tfetched & adjusted: " + "\twill fetch at "
+ (p.getFetchTime() / delta) + "\tinterval "
+ (p.getFetchInterval() / SECONDS_PER_DAY) + " days");
if (!changed)
miss++;
if (miss > maxMiss)
maxMiss = miss;
changed = false;
totalMiss += miss;
miss = 0;
}
if (changed)
miss++;
curTime += delta;
}
LOG.info("Total missed: " + totalMiss + ", max miss: " + maxMiss);
LOG.info("Page changed " + changeCnt + " times, fetched " + fetchCnt
+ " times.");
}
}