/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Runs a job multiple times and takes the average of all runs.
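 *
 * <p>For reference, a typical invocation looks like the following. The test
 * jar that registers the <code>mrbench</code> driver alias varies by Hadoop
 * release, so the jar name below is illustrative:
 * <pre>
 * hadoop jar hadoop-test.jar mrbench -numRuns 50 -maps 10 -reduces 5 -inputLines 1000
 * </pre>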
 */
public class MRBench extends Configured implements Tool {
  private static final Log LOG = LogFactory.getLog(MRBench.class);
  private static Path BASE_DIR =
      new Path(System.getProperty("test.build.data", "/benchmarks/MRBench"));
  private static Path INPUT_DIR = new Path(BASE_DIR, "mr_input");
  private static Path OUTPUT_DIR = new Path(BASE_DIR, "mr_output");

  public enum Order { RANDOM, ASCENDING, DESCENDING }

  /**
   * Takes input as lines of text, applies some processing to each line,
   * and writes the data out as text again.
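   * The default {@link #process(String)} is the identity, so out of the box
   * the benchmark simply copies every input line through the shuffle unchanged.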
   */
  public static class Map extends MapReduceBase
      implements Mapper<WritableComparable, Text, UTF8, UTF8> {
    public void map(WritableComparable key, Text value,
                    OutputCollector<UTF8, UTF8> output,
                    Reporter reporter) throws IOException {
      String line = value.toString();
      output.collect(new UTF8(process(line)), new UTF8(""));
    }

    public String process(String line) {
      return line;
    }
  }

  /**
   * Writes each key, together with all of its values, straight through
   * to the output.
   */
  public static class Reduce extends MapReduceBase
      implements Reducer<UTF8, UTF8, UTF8, UTF8> {
    public void reduce(UTF8 key, Iterator<UTF8> values,
                       OutputCollector<UTF8, UTF8> output,
                       Reporter reporter) throws IOException {
      while (values.hasNext()) {
        output.collect(key, new UTF8(values.next().toString()));
      }
    }
  }

  /**
   * Generate a text file on the given filesystem with the given path name.
   * The text file will contain the given number of lines of generated data.
   * The generated data are string representations of numbers. Each number
   * is padded with leading '0' (zero) characters, so for the ascending and
   * descending orders every line has the same length. The order of the
   * generated data is one of ascending, descending, or random.
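   *
   * For example, with numLines = 1000 the padding width is
   * String.valueOf(1000).length() == 4, so the ASCENDING output starts
   * "0000", "0001", "0002", ...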
   */
  public void generateTextFile(FileSystem fs, Path inputFile,
                               long numLines, Order sortOrder) throws IOException {
    LOG.info("creating control file: " + numLines + " numLines, " + sortOrder + " sortOrder");
    PrintStream output = null;
    try {
      output = new PrintStream(fs.create(inputFile));
      int padding = String.valueOf(numLines).length();
      // reuse a single Random; constructing a new instance per line is
      // wasteful and, with time-based seeding, can repeat values
      Random rand = new Random();
      switch (sortOrder) {
      case RANDOM:
        for (long l = 0; l < numLines; l++) {
          output.println(pad(rand.nextLong(), padding));
        }
        break;
      case ASCENDING:
        for (long l = 0; l < numLines; l++) {
          output.println(pad(l, padding));
        }
        break;
      case DESCENDING:
        for (long l = numLines; l > 0; l--) {
          output.println(pad(l, padding));
        }
        break;
      }
    } finally {
      if (output != null) {
        output.close();
      }
    }
    LOG.info("created control file: " + inputFile);
  }

  /**
   * Convert the given number to a string and pad the number with
   * leading '0' (zero) characters so that the string is exactly
   * the given length.
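   *
   * For example, pad(42, 5) returns "00042". A number whose string form is
   * already at least the given length is returned unpadded.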
   */
  private static String pad(long number, int length) {
    String str = String.valueOf(number);
    StringBuilder value = new StringBuilder();
    for (int i = str.length(); i < length; i++) {
      value.append('0');
    }
    value.append(str);
    return value.toString();
  }

  /**
   * Create the job configuration.
   */
  private JobConf setupJob(int numMaps, int numReduces, String jarFile) {
    JobConf jobConf = new JobConf(getConf());
    jobConf.setJarByClass(MRBench.class);
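
    // All runs share the same generated input under INPUT_DIR. TextInputFormat
    // hands the mapper <byte offset, line text> records, which is why Map is
    // declared with a WritableComparable key rather than a UTF8 one.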
    FileInputFormat.addInputPath(jobConf, INPUT_DIR);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setOutputValueClass(UTF8.class);
    jobConf.setMapOutputKeyClass(UTF8.class);
    jobConf.setMapOutputValueClass(UTF8.class);
    if (null != jarFile) {
      jobConf.setJar(jarFile);
    }
    jobConf.setMapperClass(Map.class);
    jobConf.setReducerClass(Reduce.class);
    jobConf.setNumMapTasks(numMaps);
    jobConf.setNumReduceTasks(numReduces);
    return jobConf;
  }

  /**
   * Runs a MapReduce job the given number of times. The input to each run
   * is the same file.
   */
  private ArrayList<Long> runJobInSequence(JobConf masterJobConf, int numRuns) throws IOException {
    Random rand = new Random();
    ArrayList<Long> execTimes = new ArrayList<Long>();

    for (int i = 0; i < numRuns; i++) {
      // create a new job conf every time; reusing the same object does not work
      JobConf jobConf = new JobConf(masterJobConf);
      // reset the job jar because the copy constructor doesn't
      jobConf.setJar(masterJobConf.getJar());
      // give a new random name to the output of the mapred tasks
      FileOutputFormat.setOutputPath(jobConf,
                                     new Path(OUTPUT_DIR, "output_" + rand.nextInt()));

      LOG.info("Running job " + i + ":" +
               " input=" + FileInputFormat.getInputPaths(jobConf)[0] +
               " output=" + FileOutputFormat.getOutputPath(jobConf));

      // run the mapred task now
      long curTime = System.currentTimeMillis();
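      // JobClient.runJob blocks until the job completes, so the elapsed time
      // recorded below is the end-to-end wall-clock time for the run,
      // including submission and scheduling overhead.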
      JobClient.runJob(jobConf);
      execTimes.add(Long.valueOf(System.currentTimeMillis() - curTime));
    }
    return execTimes;
  }

  /**
   * <pre>
   * Usage: mrbench
   *   [-baseDir <base DFS path for output/input, default is /benchmarks/MRBench>]
   *   [-jar <local path to job jar file containing Mapper and Reducer implementations, default is current jar file>]
   *   [-numRuns <number of times to run the job, default is 1>]
   *   [-maps <number of maps for each run, default is 2>]
   *   [-reduces <number of reduces for each run, default is 1>]
   *   [-inputLines <number of input lines to generate, default is 1>]
   *   [-inputType <type of input to generate, one of ascending (default), descending, random>]
   *   [-verbose]
   * </pre>
   */
  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(new MRBench(), args);
    System.exit(res);
  }

  @Override
  public int run(String[] args) throws Exception {
    String version = "MRBenchmark.0.0.2";
    System.out.println(version);

    String usage =
      "Usage: mrbench " +
      "[-baseDir <base DFS path for output/input, default is /benchmarks/MRBench>] " +
      "[-jar <local path to job jar file containing Mapper and Reducer implementations, default is current jar file>] " +
      "[-numRuns <number of times to run the job, default is 1>] " +
      "[-maps <number of maps for each run, default is 2>] " +
      "[-reduces <number of reduces for each run, default is 1>] " +
      "[-inputLines <number of input lines to generate, default is 1>] " +
      "[-inputType <type of input to generate, one of ascending (default), descending, random>] " +
      "[-verbose]";

    String jarFile = null;
    int inputLines = 1;
    int numRuns = 1;
    int numMaps = 2;
    int numReduces = 1;
    boolean verbose = false;
    Order inputSortOrder = Order.ASCENDING;
    for (int i = 0; i < args.length; i++) { // parse command line
      if (args[i].equals("-jar")) {
        jarFile = args[++i];
      } else if (args[i].equals("-numRuns")) {
        numRuns = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-baseDir")) {
        BASE_DIR = new Path(args[++i]);
        // -baseDir previously had no effect because INPUT_DIR and OUTPUT_DIR
        // were derived from the default BASE_DIR at class-load time; recompute
        // them so the flag actually relocates the benchmark directories
        INPUT_DIR = new Path(BASE_DIR, "mr_input");
        OUTPUT_DIR = new Path(BASE_DIR, "mr_output");
      } else if (args[i].equals("-maps")) {
        numMaps = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-reduces")) {
        numReduces = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-inputLines")) {
        inputLines = Integer.parseInt(args[++i]);
      } else if (args[i].equals("-inputType")) {
        String s = args[++i];
        if (s.equalsIgnoreCase("ascending")) {
          inputSortOrder = Order.ASCENDING;
        } else if (s.equalsIgnoreCase("descending")) {
          inputSortOrder = Order.DESCENDING;
        } else if (s.equalsIgnoreCase("random")) {
          inputSortOrder = Order.RANDOM;
        } else {
          inputSortOrder = null;
        }
      } else if (args[i].equals("-verbose")) {
        verbose = true;
      } else {
        System.err.println(usage);
        System.exit(-1);
      }
    }

    if (numRuns < 1 ||  // verify args
        numMaps < 1 ||
        numReduces < 1 ||
        inputLines < 0 ||
        inputSortOrder == null) {
      System.err.println(usage);
      return -1;
    }

    JobConf jobConf = setupJob(numMaps, numReduces, jarFile);
    FileSystem fs = FileSystem.get(jobConf);
    Path inputFile = new Path(INPUT_DIR, "input_" + (new Random()).nextInt() + ".txt");
    generateTextFile(fs, inputFile, inputLines, inputSortOrder);

    // make sure the base directory exists; each run's output is created beneath it
    fs.mkdirs(BASE_DIR);

    ArrayList<Long> execTimes = new ArrayList<Long>();
    try {
      execTimes = runJobInSequence(jobConf, numRuns);
    } finally {
      // delete output -- should we really do this?
      fs.delete(BASE_DIR, true);
    }

    if (verbose) {
      // Print out a report
      System.out.println("Total MapReduce jobs executed: " + numRuns);
      System.out.println("Total lines of data per job: " + inputLines);
      System.out.println("Maps per job: " + numMaps);
      System.out.println("Reduces per job: " + numReduces);
    }

    int i = 0;
    long totalTime = 0;
    for (Long time : execTimes) {
      totalTime += time.longValue();
      if (verbose) {
        System.out.println("Total milliseconds for task: " + (++i) +
                           " = " + time);
      }
    }
    long avgTime = totalTime / numRuns;
    System.out.println("DataLines\tMaps\tReduces\tAvgTime (milliseconds)");
    System.out.println(inputLines + "\t\t" + numMaps + "\t" +
                       numReduces + "\t" + avgTime);
    return 0;
  }
}