/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred.gridmix;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// TODO can replace with form of GridmixJob
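/**
 * Map-only job that writes {@code gridmix.gendata.bytes} of random data
 * under the supplied output directory, spreading the work across the
 * cluster's task trackers. A minimal usage sketch (wiring inherited from
 * GridmixJob assumed):
 * <pre>
 *   // submits the job and returns the running handle
 *   Job job = new GenerateData(conf, outdir, genbytes).call();
 * </pre>
 */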
class GenerateData extends GridmixJob {

  public GenerateData(Configuration conf, Path outdir, long genbytes)
      throws IOException {
    super(conf, 0L, "GRIDMIX_GENDATA");
    job.getConfiguration().setLong("gridmix.gendata.bytes", genbytes);
    FileOutputFormat.setOutputPath(job, outdir);
  }
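
  /**
   * Configure and submit the generation job, returning the running Job.
   * The input path set below is a placeholder; GenDataFormat synthesizes
   * its own splits and never reads it.
   */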
  @Override
  public Job call() throws IOException, InterruptedException,
      ClassNotFoundException {
    job.setMapperClass(GenDataMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);
    job.setInputFormatClass(GenDataFormat.class);
    job.setOutputFormatClass(RawBytesOutputFormat.class);
    job.setJarByClass(GenerateData.class);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.submit();
    return job;
  }
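
  /**
   * Mapper that interprets each input value as a byte count and emits
   * that many random bytes in fixed-size BytesWritable chunks.
   */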
  public static class GenDataMapper
      extends Mapper<NullWritable,LongWritable,NullWritable,BytesWritable> {

    private BytesWritable val;
    private final Random r = new Random();

    @Override
    protected void setup(Context context)
        throws IOException, InterruptedException {
      val = new BytesWritable(new byte[
          context.getConfiguration().getInt("gendata.val.bytes", 1024 * 1024)]);
    }

    @Override
    public void map(NullWritable key, LongWritable value, Context context)
        throws IOException, InterruptedException {
      // Restore the full buffer size; the final chunk of a previous record
      // may have shrunk it, which would make later records emit undersized
      // chunks.
      val.setSize(val.getCapacity());
      for (long bytes = value.get(); bytes > 0; bytes -= val.getLength()) {
        r.nextBytes(val.getBytes());
        val.setSize((int) Math.min(val.getLength(), bytes));
        context.write(key, val);
      }
    }
  }
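
  /**
   * Synthetic InputFormat producing one split per task tracker, each
   * assigned an equal share of the total bytes to generate.
   */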
  static class GenDataFormat extends InputFormat<NullWritable,LongWritable> {

    @Override
    public List<InputSplit> getSplits(JobContext jobCtxt) throws IOException {
      final JobClient client = new JobClient(jobCtxt.getConfiguration());
      final ClusterStatus stat = client.getClusterStatus(true);
      final long toGen =
        jobCtxt.getConfiguration().getLong("gridmix.gendata.bytes", -1);
      if (toGen < 0) {
        throw new IOException("Invalid/missing generation bytes: " + toGen);
      }
      final int nTrackers = stat.getTaskTrackers();
      if (nTrackers == 0) {
        throw new IOException("No task trackers available for generation");
      }
      final long bytesPerTracker = toGen / nTrackers;
      final ArrayList<InputSplit> splits = new ArrayList<InputSplit>(nTrackers);
      final Pattern trackerPattern = Pattern.compile("tracker_([^:]*):.*");
      final Matcher m = trackerPattern.matcher("");
      for (String tracker : stat.getActiveTrackerNames()) {
        m.reset(tracker);
        if (!m.find()) {
          // Trackers whose names don't match the expected form are skipped;
          // their share of the bytes is simply not generated.
          System.err.println("Skipping node: " + tracker);
          continue;
        }
        final String name = m.group(1);
        splits.add(new GenSplit(bytesPerTracker, new String[] { name }));
      }
      return splits;
    }
    @Override
    public RecordReader<NullWritable,LongWritable> createRecordReader(
        InputSplit split, final TaskAttemptContext taskContext)
        throws IOException {
      return new RecordReader<NullWritable,LongWritable>() {
        long written = 0L;
        long write = 0L;
        long RINTERVAL;
        long toWrite;
        final NullWritable key = NullWritable.get();
        final LongWritable val = new LongWritable();

        @Override
        public void initialize(InputSplit split, TaskAttemptContext ctxt)
            throws IOException, InterruptedException {
          toWrite = split.getLength();
          // Widen before shifting so report intervals of 2048MB or more
          // don't overflow int arithmetic.
          RINTERVAL = ((long) ctxt.getConfiguration().getInt(
              "gendata.report.interval.mb", 10)) << 20;
        }

        @Override
        public boolean nextKeyValue() throws IOException {
          // Hand the mapper at most RINTERVAL bytes per record so task
          // progress advances between records.
          written += write;
          write = Math.min(toWrite - written, RINTERVAL);
          val.set(write);
          return written < toWrite;
        }

        @Override
        public float getProgress() throws IOException {
          return written / ((float)toWrite);
        }

        @Override
        public NullWritable getCurrentKey() { return key; }

        @Override
        public LongWritable getCurrentValue() { return val; }

        @Override
        public void close() throws IOException {
          taskContext.setStatus("Wrote " + toWrite);
        }
      };
    }
  }
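
  /**
   * Writable split carrying one tracker's byte quota and the hostname(s)
   * where the split should be scheduled.
   */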
  static class GenSplit extends InputSplit implements Writable {
    private long bytes;
    private int nLoc;
    private String[] locations;

    // Required for Writable deserialization.
    public GenSplit() { }

    public GenSplit(long bytes, String[] locations) {
      this(bytes, locations.length, locations);
    }

    public GenSplit(long bytes, int nLoc, String[] locations) {
      this.bytes = bytes;
      this.nLoc = nLoc;
      this.locations = Arrays.copyOf(locations, nLoc);
    }

    @Override
    public long getLength() {
      return bytes;
    }

    @Override
    public String[] getLocations() {
      return locations;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
      bytes = in.readLong();
      nLoc = in.readInt();
      // Reallocate on any size mismatch so stale entries from a reused,
      // larger array are never reported by getLocations().
      if (null == locations || locations.length != nLoc) {
        locations = new String[nLoc];
      }
      for (int i = 0; i < nLoc; ++i) {
        locations[i] = Text.readString(in);
      }
    }

    @Override
    public void write(DataOutput out) throws IOException {
      out.writeLong(bytes);
      out.writeInt(nLoc);
      for (int i = 0; i < nLoc; ++i) {
        Text.writeString(out, locations[i]);
      }
    }
  }
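
  /**
   * OutputFormat that streams each value's raw bytes to the task's default
   * work file, ignoring keys and record boundaries.
   */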
  static class RawBytesOutputFormat
      extends FileOutputFormat<NullWritable,BytesWritable> {

    @Override
    public RecordWriter<NullWritable,BytesWritable> getRecordWriter(
        TaskAttemptContext job) throws IOException {
      final Path file = getDefaultWorkFile(job, "");
      final FileSystem fs = file.getFileSystem(job.getConfiguration());
      final FSDataOutputStream fileOut = fs.create(file, false);
      return new RecordWriter<NullWritable,BytesWritable>() {
        @Override
        public void write(NullWritable key, BytesWritable value)
            throws IOException {
          fileOut.write(value.getBytes(), 0, value.getLength());
        }

        @Override
        public void close(TaskAttemptContext ctxt) throws IOException {
          fileOut.close();
        }
      };
    }
  }
}