blob: a7b2930ba30dcaa486af398963e18b1aad1184b0 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.tools;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.lang.invoke.MethodHandles;
import java.net.InetAddress;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.util.URLUtil;
/**
* A simple tool that spins up multiple threads to resolve URLs to IP
* addresses. It can be used to verify that pages which fail with an
* UnknownHostException during fetching are actually bad, and are not failing
* because of a DNS problem in the fetcher.
*/
public class ResolveUrls {

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  /**
   * Maximum number of seconds to wait for outstanding lookups once every url
   * has been handed to the thread pool.
   */
  private static final long AWAIT_SECONDS = 60L;

  private String urlsFile = null;
  private int numThreads = 100;
  private ExecutorService pool = null;

  // The counters are static, so they are shared by every ResolveUrls
  // instance created in the same JVM.
  private static final AtomicInteger numTotal = new AtomicInteger(0);
  private static final AtomicInteger numErrored = new AtomicInteger(0);
  private static final AtomicInteger numResolved = new AtomicInteger(0);
  private static final AtomicLong totalTime = new AtomicLong(0L);

  /**
   * A task which looks up the ip address of the host of a single url and
   * records the outcome in the shared counters.
   */
  private static class ResolverTask implements Runnable {

    private final String url;

    ResolverTask(String url) {
      this.url = url;
    }

    @Override
    public void run() {
      numTotal.incrementAndGet();
      String host = URLUtil.getHost(url);
      long start = System.currentTimeMillis();

      // null means "resolved"; anything else is the reason for the failure
      String failure = null;
      if (host == null || host.isEmpty()) {
        // InetAddress.getByName returns a loopback address for a null or
        // empty host instead of failing, which would count a url without a
        // usable host as resolved. Treat it as an error explicitly.
        failure = "no host in url " + url;
      } else {
        try {
          // if no exception is thrown the name was resolved successfully
          InetAddress.getByName(host);
        } catch (Exception e) {
          failure = e.toString();
        }
      }

      long total = System.currentTimeMillis() - start;
      totalTime.addAndGet(total);

      // Host, outcome and timing go into a single message: lookups run
      // concurrently, so separate log lines cannot be correlated afterwards.
      if (failure == null) {
        numResolved.incrementAndGet();
        LOG.info("Resolved: {}, {} millis", host, total);
      } else {
        numErrored.incrementAndGet();
        LOG.info("Error Resolving: {} ({}), {} millis", host, failure, total);
      }
    }
  }

  /**
   * Creates a thread pool for resolving urls and reads the urls file from the
   * local filesystem, one url per line (blank lines are skipped). Every url is
   * submitted to the pool; afterwards the pool is shut down and this method
   * waits up to {@value #AWAIT_SECONDS} seconds for the lookups to finish
   * before logging the number of urls processed, resolved and errored and the
   * average lookup time.
   *
   * Errors while reading the file or creating the pool are logged, not
   * rethrown.
   */
  public void resolveUrls() {
    try {
      // create a thread pool with a fixed number of threads
      pool = Executors.newFixedThreadPool(numThreads);

      // read in the urls file and loop through each line, one url per line;
      // try-with-resources closes the reader even if submitting a task fails
      try (BufferedReader buffRead = new BufferedReader(new FileReader(
          new File(urlsFile)))) {
        String urlStr;
        while ((urlStr = buffRead.readLine()) != null) {
          urlStr = urlStr.trim();
          if (urlStr.isEmpty()) {
            continue;
          }
          // queue a resolver task per url
          LOG.info("Starting: {}", urlStr);
          pool.execute(new ResolverTask(urlStr));
        }
      }

      // shutdown() must come before awaitTermination(): a pool that has not
      // been shut down never terminates, so the wait would always run for the
      // full timeout. Already queued tasks still run after shutdown().
      pool.shutdown();
      if (!pool.awaitTermination(AWAIT_SECONDS, TimeUnit.SECONDS)) {
        LOG.warn("Lookups still running after {} seconds, totals below are"
            + " partial", AWAIT_SECONDS);
      }
    } catch (InterruptedException e) {
      // restore the interrupt flag for the caller and stop outstanding work
      Thread.currentThread().interrupt();
      pool.shutdownNow();
      LOG.info(StringUtils.stringifyException(e));
    } catch (Exception e) {
      // on error shutdown the thread pool immediately; the pool is still null
      // if its creation failed (e.g. a non-positive number of threads)
      if (pool != null) {
        pool.shutdownNow();
      }
      LOG.info(StringUtils.stringifyException(e));
    }

    // log totals; guard the average against an empty run
    int total = numTotal.get();
    long averageTime = total > 0 ? totalTime.get() / total : 0L;
    LOG.info("Total: {}, Resolved: {}, Errored: {}, Average Time: {}", total,
        numResolved.get(), numErrored.get(), averageTime);
  }

  /**
   * Create a new ResolveUrls with a file from the local file system.
   *
   * @param urlsFile
   *          The local urls file, one url per line.
   */
  public ResolveUrls(String urlsFile) {
    this(urlsFile, 100);
  }

  /**
   * Create a new ResolveUrls with a urls file and a number of threads for the
   * Thread pool. Number of threads is 100 by default.
   *
   * @param urlsFile
   *          The local urls file, one url per line.
   * @param numThreads
   *          The number of threads used to resolve urls in parallel.
   */
  public ResolveUrls(String urlsFile, int numThreads) {
    this.urlsFile = urlsFile;
    this.numThreads = numThreads;
  }

  /**
   * Runs the resolve urls tool. Recognised options are <code>-help</code>,
   * <code>-urls &lt;file&gt;</code> (required) and
   * <code>-numThreads &lt;n&gt;</code> (defaults to 100). The help text is
   * printed when <code>-help</code> is given or <code>-urls</code> is missing.
   *
   * @param args
   *          The command line arguments.
   */
  public static void main(String[] args) {
    Options options = new Options();

    OptionBuilder.withArgName("help");
    OptionBuilder.withDescription("show this help message");
    Option helpOpts = OptionBuilder.create("help");
    options.addOption(helpOpts);

    OptionBuilder.withArgName("urls");
    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the urls file to check");
    Option urlOpts = OptionBuilder.create("urls");
    options.addOption(urlOpts);

    OptionBuilder.withArgName("numThreads");
    // exactly one value; hasArgs() would accept an unlimited number of values
    OptionBuilder.hasArg();
    OptionBuilder.withDescription("the number of threads to use");
    Option numThreadOpts = OptionBuilder.create("numThreads");
    options.addOption(numThreadOpts);

    CommandLineParser parser = new GnuParser();
    try {
      // parse out command line arguments
      CommandLine line = parser.parse(options, args);
      if (line.hasOption("help") || !line.hasOption("urls")) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("ResolveUrls", options);
        return;
      }

      // get the urls and the number of threads and start the resolver
      String urls = line.getOptionValue("urls");
      int numThreads = 100;
      String numThreadsStr = line.getOptionValue("numThreads");
      if (numThreadsStr != null) {
        numThreads = Integer.parseInt(numThreadsStr);
      }
      ResolveUrls resolve = new ResolveUrls(urls, numThreads);
      resolve.resolveUrls();
    } catch (Exception e) {
      LOG.error("ResolveUrls: " + StringUtils.stringifyException(e));
    }
  }
}