src/java/org/apache/nutch/tools/Benchmark.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.tools;

 import java.io.OutputStream;
 import java.lang.invoke.MethodHandles;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDb;
 import org.apache.nutch.crawl.CrawlDbReader;
 import org.apache.nutch.crawl.Generator;
 import org.apache.nutch.crawl.Injector;
 import org.apache.nutch.crawl.LinkDb;
 import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 public class Benchmark extends Configured implements Tool {

   private static final Logger LOG = LoggerFactory
 	      .getLogger(MethodHandles.lookup().lookupClass());

   public static void main(String[] args) throws Exception {
     Configuration conf = NutchConfiguration.create();
     int res = ToolRunner.run(conf, new Benchmark(), args);
     System.exit(res);
   }

   @SuppressWarnings("unused")
   private static String getDate() {
     return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
         .currentTimeMillis()));
   }

   private void createSeeds(FileSystem fs, Path seedsDir, int count)
       throws Exception {
     OutputStream os = fs.create(new Path(seedsDir, "seeds"));
     for (int i = 0; i < count; i++) {
       String url = "http://www.test-" + i + ".com/\r\n";
       os.write(url.getBytes());
     }
     os.flush();
     os.close();
   }

   public static final class BenchmarkResults {
     Map<String, Map<String, Long>> timings = new HashMap<>();
     List<String> runs = new ArrayList<>();
     List<String> stages = new ArrayList<>();
     int seeds, depth, threads;
     boolean delete;
     long topN;
     long elapsed;
     String plugins;

     public void addTiming(String stage, String run, long timing) {
       if (!runs.contains(run)) {
         runs.add(run);
       }
       if (!stages.contains(stage)) {
         stages.add(stage);
       }
       Map<String, Long> t = timings.get(stage);
       if (t == null) {
         t = new HashMap<>();
         timings.put(stage, t);
       }
       t.put(run, timing);
     }

     public String toString() {
       StringBuilder sb = new StringBuilder();
       sb.append("* Plugins:\t" + plugins + "\n");
       sb.append("* Seeds:\t" + seeds + "\n");
       sb.append("* Depth:\t" + depth + "\n");
       sb.append("* Threads:\t" + threads + "\n");
       sb.append("* TopN:\t" + topN + "\n");
       sb.append("* Delete:\t" + delete + "\n");
       sb.append("* TOTAL ELAPSED:\t" + elapsed + "\n");
       for (String stage : stages) {
         Map<String, Long> timing = timings.get(stage);
         if (timing == null)
           continue;
         sb.append("- stage: " + stage + "\n");
         for (String r : runs) {
           Long Time = timing.get(r);
           if (Time == null) {
             continue;
           }
           sb.append("\trun " + r + "\t" + Time + "\n");
         }
       }
       return sb.toString();
     }

     public List<String> getStages() {
       return stages;
     }

     public List<String> getRuns() {
       return runs;
     }
   }

   @Override
   public int run(String[] args) throws Exception {
     String plugins = "protocol-http|parse-tika|scoring-opic|urlfilter-regex|urlnormalizer-pass";
     int seeds = 1;
     int depth = 10;
     int threads = 10;
     boolean delete = true;
     long topN = Long.MAX_VALUE;

     if (args.length == 0) {
       System.err
           .println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
       System.err
           .println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
       System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
       System.err
           .println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
       System.err
           .println("\t-keep\tkeep segment data (default: delete after updatedb)");
       System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
       System.err.println("\tNOTE: if not specified, this is reset to: "
           + plugins);
       System.err
           .println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
       System.err
           .println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
       return -1;
     }
     int maxPerHost = Integer.MAX_VALUE;
     for (int i = 0; i < args.length; i++) {
       if (args[i].equals("-seeds")) {
         seeds = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-threads")) {
         threads = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-depth")) {
         depth = Integer.parseInt(args[++i]);
       } else if (args[i].equals("-keep")) {
         delete = false;
       } else if (args[i].equals("-plugins")) {
         plugins = args[++i];
       } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
         maxPerHost = Integer.parseInt(args[++i]);
       } else {
         LOG.error("Invalid argument: '" + args[i] + "'");
         return -1;
       }
     }
     BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN,
         delete, plugins);
     System.out.println(res);
     return 0;
   }

   public BenchmarkResults benchmark(int seeds, int depth, int threads,
       int maxPerHost, long topN, boolean delete, String plugins)
       throws Exception {
     Configuration conf = getConf();
     conf.set("http.proxy.host", "localhost");
     conf.setInt("http.proxy.port", 8181);
     conf.set("http.agent.name", "test");
     conf.set("http.robots.agents", "test,*");
     if (!plugins.equals("default")) {
       conf.set("plugin.includes", plugins);
     }
     conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
     conf.set(Generator.GENERATOR_COUNT_MODE,
         Generator.GENERATOR_COUNT_VALUE_HOST);
     @SuppressWarnings("unused")
     Job job = NutchJob.getInstance(getConf());
     FileSystem fs = FileSystem.get(conf);
     Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
         + System.currentTimeMillis());
     fs.mkdirs(dir);
     Path rootUrlDir = new Path(dir, "seed");
     fs.mkdirs(rootUrlDir);
     createSeeds(fs, rootUrlDir, seeds);

     if (LOG.isInfoEnabled()) {
       LOG.info("crawl started in: " + dir);
       LOG.info("rootUrlDir = " + rootUrlDir);
       LOG.info("threads = " + threads);
       LOG.info("depth = " + depth);
     }
     BenchmarkResults res = new BenchmarkResults();
     res.delete = delete;
     res.depth = depth;
     res.plugins = plugins;
     res.seeds = seeds;
     res.threads = threads;
     res.topN = topN;
     Path crawlDb = new Path(dir + "/crawldb");
     Path linkDb = new Path(dir + "/linkdb");
     Path segments = new Path(dir + "/segments");
     res.elapsed = System.currentTimeMillis();
     Injector injector = new Injector(getConf());
     Generator generator = new Generator(getConf());
     Fetcher fetcher = new Fetcher(getConf());
     ParseSegment parseSegment = new ParseSegment(getConf());
     CrawlDb crawlDbTool = new CrawlDb(getConf());
     LinkDb linkDbTool = new LinkDb(getConf());

     // initialize crawlDb
     long start = System.currentTimeMillis();
     injector.inject(crawlDb, rootUrlDir);
     long delta = System.currentTimeMillis() - start;
     res.addTiming("inject", "0", delta);
     int i;
     for (i = 0; i < depth; i++) { // generate new segment
       start = System.currentTimeMillis();
       Path[] segs = generator.generate(crawlDb, segments, -1, topN,
           System.currentTimeMillis());
       delta = System.currentTimeMillis() - start;
       res.addTiming("generate", i + "", delta);
       if (segs == null) {
         LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
         break;
       }
       start = System.currentTimeMillis();
       fetcher.fetch(segs[0], threads); // fetch it
       delta = System.currentTimeMillis() - start;
       res.addTiming("fetch", i + "", delta);
       if (!Fetcher.isParsing(conf)) {
         start = System.currentTimeMillis();
         parseSegment.parse(segs[0]); // parse it, if needed
         delta = System.currentTimeMillis() - start;
         res.addTiming("parse", i + "", delta);
       }
       start = System.currentTimeMillis();
       crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
       delta = System.currentTimeMillis() - start;
       res.addTiming("update", i + "", delta);
       start = System.currentTimeMillis();
       linkDbTool.invert(linkDb, segs, true, true, false); // invert links
       delta = System.currentTimeMillis() - start;
       res.addTiming("invert", i + "", delta);
       // delete data
       if (delete) {
         for (Path p : segs) {
           fs.delete(p, true);
         }
       }
     }
     if (i == 0) {
       LOG.warn("No URLs to fetch - check your seed list and URL filters.");
     }
     if (LOG.isInfoEnabled()) {
       LOG.info("crawl finished: " + dir);
     }
     res.elapsed = System.currentTimeMillis() - res.elapsed;
     @SuppressWarnings("resource")
     CrawlDbReader dbreader = new CrawlDbReader();
     dbreader.processStatJob(crawlDb.toString(), conf, false);
     return res;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.tools;

	import java.io.OutputStream;
	import java.lang.invoke.MethodHandles;
	import java.text.SimpleDateFormat;
	import java.util.ArrayList;
	import java.util.Date;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.conf.Configured;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.util.Tool;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.util.ToolRunner;
	import org.apache.nutch.crawl.CrawlDb;
	import org.apache.nutch.crawl.CrawlDbReader;
	import org.apache.nutch.crawl.Generator;
	import org.apache.nutch.crawl.Injector;
	import org.apache.nutch.crawl.LinkDb;
	import org.apache.nutch.fetcher.Fetcher;
	import org.apache.nutch.parse.ParseSegment;
	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.nutch.util.NutchJob;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	public class Benchmark extends Configured implements Tool {

	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	public static void main(String[] args) throws Exception {
	Configuration conf = NutchConfiguration.create();
	int res = ToolRunner.run(conf, new Benchmark(), args);
	System.exit(res);
	}

	@SuppressWarnings("unused")
	private static String getDate() {
	return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date(System
	.currentTimeMillis()));
	}

	private void createSeeds(FileSystem fs, Path seedsDir, int count)
	throws Exception {
	OutputStream os = fs.create(new Path(seedsDir, "seeds"));
	for (int i = 0; i < count; i++) {
	String url = "http://www.test-" + i + ".com/\r\n";
	os.write(url.getBytes());
	}
	os.flush();
	os.close();
	}

	public static final class BenchmarkResults {
	Map<String, Map<String, Long>> timings = new HashMap<>();
	List<String> runs = new ArrayList<>();
	List<String> stages = new ArrayList<>();
	int seeds, depth, threads;
	boolean delete;
	long topN;
	long elapsed;
	String plugins;

	public void addTiming(String stage, String run, long timing) {
	if (!runs.contains(run)) {
	runs.add(run);
	}
	if (!stages.contains(stage)) {
	stages.add(stage);
	}
	Map<String, Long> t = timings.get(stage);
	if (t == null) {
	t = new HashMap<>();
	timings.put(stage, t);
	}
	t.put(run, timing);
	}

	public String toString() {
	StringBuilder sb = new StringBuilder();
	sb.append("* Plugins:\t" + plugins + "\n");
	sb.append("* Seeds:\t" + seeds + "\n");
	sb.append("* Depth:\t" + depth + "\n");
	sb.append("* Threads:\t" + threads + "\n");
	sb.append("* TopN:\t" + topN + "\n");
	sb.append("* Delete:\t" + delete + "\n");
	sb.append("* TOTAL ELAPSED:\t" + elapsed + "\n");
	for (String stage : stages) {
	Map<String, Long> timing = timings.get(stage);
	if (timing == null)
	continue;
	sb.append("- stage: " + stage + "\n");
	for (String r : runs) {
	Long Time = timing.get(r);
	if (Time == null) {
	continue;
	}
	sb.append("\trun " + r + "\t" + Time + "\n");
	}
	}
	return sb.toString();
	}

	public List<String> getStages() {
	return stages;
	}

	public List<String> getRuns() {
	return runs;
	}
	}

	@Override
	public int run(String[] args) throws Exception {
	String plugins = "protocol-http\|parse-tika\|scoring-opic\|urlfilter-regex\|urlnormalizer-pass";
	int seeds = 1;
	int depth = 10;
	int threads = 10;
	boolean delete = true;
	long topN = Long.MAX_VALUE;

	if (args.length == 0) {
	System.err
	.println("Usage: Benchmark [-seeds NN] [-depth NN] [-threads NN] [-keep] [-maxPerHost NN] [-plugins <regex>]");
	System.err
	.println("\t-seeds NN\tcreate NN unique hosts in a seed list (default: 1)");
	System.err.println("\t-depth NN\tperform NN crawl cycles (default: 10)");
	System.err
	.println("\t-threads NN\tuse NN threads per Fetcher task (default: 10)");
	System.err
	.println("\t-keep\tkeep segment data (default: delete after updatedb)");
	System.err.println("\t-plugins <regex>\toverride 'plugin.includes'.");
	System.err.println("\tNOTE: if not specified, this is reset to: "
	+ plugins);
	System.err
	.println("\tNOTE: if 'default' is specified then a value set in nutch-default/nutch-site is used.");
	System.err
	.println("\t-maxPerHost NN\tmax. # of URLs per host in a fetchlist");
	return -1;
	}
	int maxPerHost = Integer.MAX_VALUE;
	for (int i = 0; i < args.length; i++) {
	if (args[i].equals("-seeds")) {
	seeds = Integer.parseInt(args[++i]);
	} else if (args[i].equals("-threads")) {
	threads = Integer.parseInt(args[++i]);
	} else if (args[i].equals("-depth")) {
	depth = Integer.parseInt(args[++i]);
	} else if (args[i].equals("-keep")) {
	delete = false;
	} else if (args[i].equals("-plugins")) {
	plugins = args[++i];
	} else if (args[i].equalsIgnoreCase("-maxPerHost")) {
	maxPerHost = Integer.parseInt(args[++i]);
	} else {
	LOG.error("Invalid argument: '" + args[i] + "'");
	return -1;
	}
	}
	BenchmarkResults res = benchmark(seeds, depth, threads, maxPerHost, topN,
	delete, plugins);
	System.out.println(res);
	return 0;
	}

	public BenchmarkResults benchmark(int seeds, int depth, int threads,
	int maxPerHost, long topN, boolean delete, String plugins)
	throws Exception {
	Configuration conf = getConf();
	conf.set("http.proxy.host", "localhost");
	conf.setInt("http.proxy.port", 8181);
	conf.set("http.agent.name", "test");
	conf.set("http.robots.agents", "test,*");
	if (!plugins.equals("default")) {
	conf.set("plugin.includes", plugins);
	}
	conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
	conf.set(Generator.GENERATOR_COUNT_MODE,
	Generator.GENERATOR_COUNT_VALUE_HOST);
	@SuppressWarnings("unused")
	Job job = NutchJob.getInstance(getConf());
	FileSystem fs = FileSystem.get(conf);
	Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-"
	+ System.currentTimeMillis());
	fs.mkdirs(dir);
	Path rootUrlDir = new Path(dir, "seed");
	fs.mkdirs(rootUrlDir);
	createSeeds(fs, rootUrlDir, seeds);

	if (LOG.isInfoEnabled()) {
	LOG.info("crawl started in: " + dir);
	LOG.info("rootUrlDir = " + rootUrlDir);
	LOG.info("threads = " + threads);
	LOG.info("depth = " + depth);
	}
	BenchmarkResults res = new BenchmarkResults();
	res.delete = delete;
	res.depth = depth;
	res.plugins = plugins;
	res.seeds = seeds;
	res.threads = threads;
	res.topN = topN;
	Path crawlDb = new Path(dir + "/crawldb");
	Path linkDb = new Path(dir + "/linkdb");
	Path segments = new Path(dir + "/segments");
	res.elapsed = System.currentTimeMillis();
	Injector injector = new Injector(getConf());
	Generator generator = new Generator(getConf());
	Fetcher fetcher = new Fetcher(getConf());
	ParseSegment parseSegment = new ParseSegment(getConf());
	CrawlDb crawlDbTool = new CrawlDb(getConf());
	LinkDb linkDbTool = new LinkDb(getConf());

	// initialize crawlDb
	long start = System.currentTimeMillis();
	injector.inject(crawlDb, rootUrlDir);
	long delta = System.currentTimeMillis() - start;
	res.addTiming("inject", "0", delta);
	int i;
	for (i = 0; i < depth; i++) { // generate new segment
	start = System.currentTimeMillis();
	Path[] segs = generator.generate(crawlDb, segments, -1, topN,
	System.currentTimeMillis());
	delta = System.currentTimeMillis() - start;
	res.addTiming("generate", i + "", delta);
	if (segs == null) {
	LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
	break;
	}
	start = System.currentTimeMillis();
	fetcher.fetch(segs[0], threads); // fetch it
	delta = System.currentTimeMillis() - start;
	res.addTiming("fetch", i + "", delta);
	if (!Fetcher.isParsing(conf)) {
	start = System.currentTimeMillis();
	parseSegment.parse(segs[0]); // parse it, if needed
	delta = System.currentTimeMillis() - start;
	res.addTiming("parse", i + "", delta);
	}
	start = System.currentTimeMillis();
	crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
	delta = System.currentTimeMillis() - start;
	res.addTiming("update", i + "", delta);
	start = System.currentTimeMillis();
	linkDbTool.invert(linkDb, segs, true, true, false); // invert links
	delta = System.currentTimeMillis() - start;
	res.addTiming("invert", i + "", delta);
	// delete data
	if (delete) {
	for (Path p : segs) {
	fs.delete(p, true);
	}
	}
	}
	if (i == 0) {
	LOG.warn("No URLs to fetch - check your seed list and URL filters.");
	}
	if (LOG.isInfoEnabled()) {
	LOG.info("crawl finished: " + dir);
	}
	res.elapsed = System.currentTimeMillis() - res.elapsed;
	@SuppressWarnings("resource")
	CrawlDbReader dbreader = new CrawlDbReader();
	dbreader.processStatJob(crawlDb.toString(), conf, false);
	return res;
	}

	}