// blob: bcbacdd7bc51555005f46af4422b1e01ee77cf06 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.util;
import java.io.IOException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.MultithreadedMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.hostdb.HostDatum;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
/**
 * <p>Performs sitemap processing by fetching sitemap links, parsing their content and merging
 * the URLs found in the sitemaps (together with their metadata) into the existing CrawlDb.</p>
 *
 * <p>There are two use cases supported in Nutch's sitemap processing:</p>
 * <ol>
 *  <li>Sitemaps are considered as "remote seed lists". Crawl administrators can prepare a
 *      list of sitemap links and fetch only those sitemap pages. This is well suited to a
 *      targeted crawl of specific hosts.</li>
 *  <li>For an open web crawl, it is not possible to track each host and collect the sitemap
 *      links manually. Nutch automatically looks up the sitemaps for all the hosts seen in
 *      the crawls and injects the URLs from those sitemaps into the CrawlDb.</li>
 * </ol>
 *
 * <p>For more details see the SitemapFeature page of the Apache Nutch wiki.</p>
 */
public class SitemapProcessor extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(SitemapProcessor.class);
public static final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
public static final String CURRENT_NAME = "current";
public static final String LOCK_NAME = ".locked";
public static final String SITEMAP_STRICT_PARSING = "sitemap.strict.parsing";
public static final String SITEMAP_URL_FILTERING = "sitemap.url.filter";
public static final String SITEMAP_URL_NORMALIZING = "sitemap.url.normalize";
public static final String SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT = "sitemap.url.default.sitemap.xml";
public static final String SITEMAP_OVERWRITE_EXISTING = "sitemap.url.overwrite.existing";
public static final String SITEMAP_REDIR_MAX = "sitemap.redir.max";
public static final String SITEMAP_SIZE_MAX = "sitemap.size.max";
private static class SitemapMapper extends Mapper<Text, Writable, Text, CrawlDatum> {
private ProtocolFactory protocolFactory = null;
private boolean strict = true;
private boolean filter = true;
private boolean normalize = true;
private boolean tryDefaultSitemapXml = true;
private int maxRedir = 3;
private URLFilters filters = null;
private URLNormalizers normalizers = null;
private CrawlDatum datum = new CrawlDatum();
private SiteMapParser parser = null;
public void setup(Context context) {
Configuration conf = context.getConfiguration();
int maxSize = conf.getInt(SITEMAP_SIZE_MAX, SiteMapParser.MAX_BYTES_ALLOWED);
conf.setInt("http.content.limit", maxSize);
conf.setInt("file.content.limit", maxSize);
this.protocolFactory = new ProtocolFactory(conf);
this.filter = conf.getBoolean(SITEMAP_URL_FILTERING, true);
this.normalize = conf.getBoolean(SITEMAP_URL_NORMALIZING, true);
this.strict = conf.getBoolean(SITEMAP_STRICT_PARSING, true);
this.tryDefaultSitemapXml = conf.getBoolean(SITEMAP_ALWAYS_TRY_SITEMAPXML_ON_ROOT, true);
this.maxRedir = conf.getInt(SITEMAP_REDIR_MAX, 3);
this.parser = new SiteMapParser(strict);
if (filter) {
filters = new URLFilters(conf);
if (normalize) {
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_DEFAULT);
public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
String url;
try {
if (value instanceof CrawlDatum) {
// If its an entry from CrawlDb, emit it. It will be merged in the reducer
context.write(key, (CrawlDatum) value);
else if (value instanceof HostDatum) {
generateSitemapsFromHostname(key.toString(), context);
else if (value instanceof Text) {
// Input can be sitemap URL or hostname
url = key.toString();
if (url.startsWith("http://") ||
url.startsWith("https://") ||
url.startsWith("ftp://") ||
url.startsWith("file:/")) {
// For entry from sitemap urls file, fetch the sitemap, extract urls and emit those
if((url = filterNormalize(url)) == null) {
context.getCounter("Sitemap", "filtered_records").increment(1);
context.getCounter("Sitemap", "sitemap_seeds").increment(1);
generateSitemapUrlDatum(protocolFactory.getProtocol(url), url, context);
} else {"generateSitemapsFromHostname: " + key.toString());
generateSitemapsFromHostname(key.toString(), context);
} catch (Exception e) {
LOG.warn("Exception for record {} : {}", key.toString(), StringUtils.stringifyException(e));
/* Filters and or normalizes the input URL */
private String filterNormalize(String url) {
try {
if (normalizers != null)
url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
if (filters != null)
url = filters.filter(url);
} catch (Exception e) {
return null;
return url;
private void generateSitemapsFromHostname(String host, Context context) {
try {
// For entry from hostdb, get sitemap url(s) from robots.txt, fetch the sitemap,
// extract urls and emit those
// try different combinations of schemes one by one till we get rejection in all cases
String url;
if((url = filterNormalize("http://" + host + "/")) == null &&
(url = filterNormalize("https://" + host + "/")) == null &&
(url = filterNormalize("ftp://" + host + "/")) == null &&
(url = filterNormalize("file:/" + host + "/")) == null) {
context.getCounter("Sitemap", "filtered_records").increment(1);
// We may wish to use the robots.txt content as the third parameter for .getRobotRules
BaseRobotRules rules = protocolFactory.getProtocol(url).getRobotRules(new Text(url), datum, null);
List<String> sitemaps = rules.getSitemaps();
if (tryDefaultSitemapXml && sitemaps.size() == 0) {
sitemaps.add(url + "sitemap.xml");
for (String sitemap : sitemaps) {
context.getCounter("Sitemap", "sitemaps_from_hostname").increment(1);
sitemap = filterNormalize(sitemap);
if (sitemap == null) {
context.getCounter("Sitemap", "filtered_sitemaps_from_hostname")
} else {
sitemap, context);
} catch (Exception e) {
LOG.warn("Exception for record {} : {}", host, StringUtils.stringifyException(e));
private void generateSitemapUrlDatum(Protocol protocol, String url, Context context) throws Exception {
ProtocolOutput output = protocol.getProtocolOutput(new Text(url), datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
// Following redirects http > https and what else
int maxRedir = this.maxRedir;
while (!output.getStatus().isSuccess() && output.getStatus().isRedirect() && maxRedir > 0) {
String[] stuff = output.getStatus().getArgs();
url = filterNormalize(stuff[0]);
// get out!
if (url == null) {
output = protocol.getProtocolOutput(new Text(url), datum);
status = output.getStatus();
content = output.getContent();
if(status.getCode() != ProtocolStatus.SUCCESS) {
// If there were any problems fetching the sitemap, log the error and let it go. Not sure how often
// sitemaps are redirected. In future we might have to handle redirects.
context.getCounter("Sitemap", "failed_fetches").increment(1);
LOG.error("Error while fetching the sitemap. Status code: {} for {}", status.getCode(), url);
AbstractSiteMap asm = parser.parseSiteMap(content.getContentType(), content.getContent(), new URL(url));
if(asm instanceof SiteMap) {"Parsing sitemap file: {}", asm.getUrl().toString());
SiteMap sm = (SiteMap) asm;
Collection<SiteMapURL> sitemapUrls = sm.getSiteMapUrls();
for(SiteMapURL sitemapUrl: sitemapUrls) {
// If 'strict' is ON, only allow valid urls. Else allow all urls
if(!strict || sitemapUrl.isValid()) {
String key = filterNormalize(sitemapUrl.getUrl().toString());
if (key != null) {
CrawlDatum sitemapUrlDatum = new CrawlDatum();
sitemapUrlDatum.setScore((float) sitemapUrl.getPriority());
if(sitemapUrl.getChangeFrequency() != null) {
int fetchInterval = -1;
switch(sitemapUrl.getChangeFrequency()) {
case ALWAYS: fetchInterval = 1; break;
case HOURLY: fetchInterval = 3600; break; // 60*60
case DAILY: fetchInterval = 86400; break; // 60*60*24
case WEEKLY: fetchInterval = 604800; break; // 60*60*24*7
case MONTHLY: fetchInterval = 2592000; break; // 60*60*24*30
case YEARLY: fetchInterval = 31536000; break; // 60*60*24*365
case NEVER: fetchInterval = Integer.MAX_VALUE; break; // Loose "NEVER" contract
if(sitemapUrl.getLastModified() != null) {
context.write(new Text(key), sitemapUrlDatum);
else if (asm instanceof SiteMapIndex) {
SiteMapIndex index = (SiteMapIndex) asm;
Collection<AbstractSiteMap> sitemapUrls = index.getSitemaps(true);
if (sitemapUrls.isEmpty()) {
}"Parsing sitemap index file: {}", index.getUrl().toString());
for (AbstractSiteMap sitemap : sitemapUrls) {
String sitemapUrl = filterNormalize(sitemap.getUrl().toString());
if (sitemapUrl != null) {
generateSitemapUrlDatum(protocol, sitemapUrl, context);
private static class SitemapReducer extends Reducer<Text, CrawlDatum, Text, CrawlDatum> {
CrawlDatum sitemapDatum = null;
CrawlDatum originalDatum = null;
private boolean overwriteExisting = false; // DO NOT ENABLE!!
public void setup(Context context) {
Configuration conf = context.getConfiguration();
this.overwriteExisting = conf.getBoolean(SITEMAP_OVERWRITE_EXISTING, false);
public void reduce(Text key, Iterable<CrawlDatum> values, Context context)
throws IOException, InterruptedException {
sitemapDatum = null;
originalDatum = null;
for (CrawlDatum curr: values) {
if(curr.getStatus() == CrawlDatum.STATUS_INJECTED) {
sitemapDatum = new CrawlDatum();
else {
originalDatum = new CrawlDatum();
if(originalDatum != null) {
// The url was already present in crawldb. If we got the same url from sitemap too, save
// the information from sitemap to the original datum. Emit the original crawl datum
if(sitemapDatum != null && overwriteExisting) {
context.getCounter("Sitemap", "existing_sitemap_entries").increment(1);
context.write(key, originalDatum);
else if(sitemapDatum != null) {
// For the newly discovered links via sitemap, set the status as unfetched and emit
context.getCounter("Sitemap", "new_sitemap_entries").increment(1);
context.write(key, sitemapDatum);
public void sitemap(Path crawldb, Path hostdb, Path sitemapUrlDir, boolean strict, boolean filter,
boolean normalize, int threads) throws Exception {
long start = System.currentTimeMillis();"SitemapProcessor: Starting at {}", sdf.format(start));
FileSystem fs = crawldb.getFileSystem(getConf());
Path old = new Path(crawldb, "old");
Path current = new Path(crawldb, "current");
Path tempCrawlDb = new Path(crawldb, "crawldb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
// lock an existing crawldb to prevent multiple simultaneous updates
Path lock = new Path(crawldb, LOCK_NAME);
if (!fs.exists(current))
LockUtil.createLockFile(fs, lock, false);
Configuration conf = getConf();
conf.setBoolean(SITEMAP_STRICT_PARSING, strict);
conf.setBoolean(SITEMAP_URL_FILTERING, filter);
conf.setBoolean(SITEMAP_URL_NORMALIZING, normalize);
conf.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
Job job = Job.getInstance(conf, "SitemapProcessor_" + crawldb.toString());
// add crawlDb, sitemap url directory and hostDb to input paths
MultipleInputs.addInputPath(job, current, SequenceFileInputFormat.class);
if (sitemapUrlDir != null)
MultipleInputs.addInputPath(job, sitemapUrlDir, KeyValueTextInputFormat.class);
if (hostdb != null)
MultipleInputs.addInputPath(job, new Path(hostdb, CURRENT_NAME), SequenceFileInputFormat.class);
FileOutputFormat.setOutputPath(job, tempCrawlDb);
MultithreadedMapper.setMapperClass(job, SitemapMapper.class);
MultithreadedMapper.setNumberOfThreads(job, threads);
try {
boolean success = job.waitForCompletion(true);
if (!success) {
String message = "SitemapProcessor_" + crawldb.toString()
+ " job did not succeed, job status: " + job.getStatus().getState()
+ ", reason: " + job.getStatus().getFailureInfo();
NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
// throw exception so that calling routine can exit with error
throw new RuntimeException(message);
boolean preserveBackup = conf.getBoolean("db.preserve.backup", true);
if (!preserveBackup && fs.exists(old))
fs.delete(old, true);
FSUtils.replace(fs, old, current, true);
FSUtils.replace(fs, current, tempCrawlDb, true);
LockUtil.removeLockFile(fs, lock);
if (LOG.isInfoEnabled()) {
long filteredRecords = job.getCounters().findCounter("Sitemap", "filtered_records").getValue();
long fromHostname = job.getCounters().findCounter("Sitemap", "sitemaps_from_hostname").getValue();
long fromSeeds = job.getCounters().findCounter("Sitemap", "sitemap_seeds").getValue();
long failedFetches = job.getCounters().findCounter("Sitemap", "failed_fetches").getValue();
long newSitemapEntries = job.getCounters().findCounter("Sitemap", "new_sitemap_entries").getValue();"SitemapProcessor: Total records rejected by filters: {}", filteredRecords);"SitemapProcessor: Total sitemaps from host name: {}", fromHostname);"SitemapProcessor: Total sitemaps from seed urls: {}", fromSeeds);"SitemapProcessor: Total failed sitemap fetches: {}", failedFetches);"SitemapProcessor: Total new sitemap entries added: {}", newSitemapEntries);
long end = System.currentTimeMillis();"SitemapProcessor: Finished at {}, elapsed: {}", sdf.format(end), TimingUtil.elapsedTime(start, end));
} catch (IOException | InterruptedException | ClassNotFoundException e) {
LOG.error("SitemapProcessor_" + crawldb.toString(), e);
NutchJob.cleanupAfterFailure(tempCrawlDb, lock, fs);
throw e;
public static void main(String[] args) throws Exception {
int res =, new SitemapProcessor(), args);
public static void usage() {
System.err.println("Usage:\n SitemapProcessor <crawldb> [-hostdb <hostdb>] [-sitemapUrls <url_dir>] " +
"[-threads <threads>] [-force] [-noStrict] [-noFilter] [-noNormalize]\n");
System.err.println("\t<crawldb>\t\tpath to crawldb where the sitemap urls would be injected");
System.err.println("\t-hostdb <hostdb>\tpath of a hostdb. Sitemap(s) from these hosts would be downloaded");
System.err.println("\t-sitemapUrls <url_dir>\tpath to directory with sitemap urls or hostnames");
System.err.println("\t-threads <threads>\tNumber of threads created per mapper to fetch sitemap urls (default: 8)");
System.err.println("\t-force\t\t\tforce update even if CrawlDb appears to be locked (CAUTION advised)");
System.err.println("\t-noStrict\t\tBy default Sitemap parser rejects invalid urls. '-noStrict' disables that.");
System.err.println("\t-noFilter\t\tturn off URLFilters on urls (optional)");
System.err.println("\t-noNormalize\t\tturn off URLNormalizer on urls (optional)");
public int run(String[] args) throws Exception {
if (args.length < 3) {
return -1;
Path crawlDb = new Path(args[0]);
Path hostDb = null;
Path urlDir = null;
boolean strict = true;
boolean filter = true;
boolean normalize = true;
int threads = 8;
for (int i = 1; i < args.length; i++) {
if (args[i].equals("-hostdb")) {
hostDb = new Path(args[++i]);"SitemapProcessor: hostdb: {}", hostDb);
else if (args[i].equals("-sitemapUrls")) {
urlDir = new Path(args[++i]);"SitemapProcessor: sitemap urls dir: {}", urlDir);
else if (args[i].equals("-threads")) {
threads = Integer.parseInt(args[++i]);"SitemapProcessor: threads: {}", threads);
else if (args[i].equals("-noStrict")) {"SitemapProcessor: 'strict' parsing disabled");
strict = false;
else if (args[i].equals("-noFilter")) {"SitemapProcessor: filtering disabled");
filter = false;
else if (args[i].equals("-noNormalize")) {"SitemapProcessor: normalizing disabled");
normalize = false;
else {"SitemapProcessor: Found invalid argument \"{}\"\n", args[i]);
return -1;
try {
sitemap(crawlDb, hostDb, urlDir, strict, filter, normalize, threads);
return 0;
} catch (Exception e) {
LOG.error("SitemapProcessor: {}", StringUtils.stringifyException(e));
return -1;