/*
* Copyright 2014 Stresso authors (see AUTHORS)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package stresso.trie;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collection;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.mapreduce.AccumuloFileOutputFormat;
import org.apache.accumulo.core.client.mapreduce.lib.partition.RangePartitioner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.commons.codec.binary.Base64;
import org.apache.fluo.api.client.FluoClient;
import org.apache.fluo.api.client.FluoFactory;
import org.apache.fluo.api.config.FluoConfiguration;
import org.apache.fluo.core.util.AccumuloUtil;
import org.apache.fluo.mapreduce.FluoKeyValue;
import org.apache.fluo.mapreduce.FluoKeyValueGenerator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
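/**
 * MapReduce tool that bootstraps the stresso trie table. It reads sequence files of
 * {@link LongWritable} numbers, dedups them, computes an initial count for every trie node up to
 * the configured stop level, and bulk-imports the results into the Fluo/Accumulo table.
 *
 * <p>Illustrative invocation (jar name and paths are examples only, not part of this code):
 * <pre>hadoop jar stresso-shaded.jar stresso.trie.Init fluo.properties /stresso/numbers /stresso/init-tmp</pre>
 */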
public class Init extends Configured implements Tool {
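// Hadoop job configuration keys used to pass the trie parameters from the driver to the tasks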
public static final String TRIE_STOP_LEVEL_PROP = FluoConfiguration.FLUO_PREFIX + ".stress.trie.stopLevel";
public static final String TRIE_NODE_SIZE_PROP = FluoConfiguration.FLUO_PREFIX + ".stress.trie.node.size";
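/**
 * Emits each distinct input number exactly once; duplicates collapse because the number itself
 * is the reduce key.
 */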
public static class UniqueReducer extends Reducer<LongWritable,NullWritable,LongWritable,NullWritable> {
@Override
protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
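/**
 * For each unique number, walks from its leaf trie node up to the configured stop level,
 * emitting every node's row id with a count of one.
 */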
public static class InitMapper extends Mapper<LongWritable,NullWritable,Text,LongWritable> {
private int stopLevel;
private int nodeSize;
private static final LongWritable ONE = new LongWritable(1);
private Text outputKey = new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
nodeSize = context.getConfiguration().getInt(TRIE_NODE_SIZE_PROP, 0);
stopLevel = context.getConfiguration().getInt(TRIE_STOP_LEVEL_PROP, 0);
}
@Override
protected void map(LongWritable key, NullWritable val, Context context) throws IOException, InterruptedException {
Node node = new Node(key.get(), 64 / nodeSize, nodeSize);
while (node != null) {
outputKey.set(node.getRowId());
context.write(outputKey, ONE);
if (node.getLevel() <= stopLevel)
node = null;
else
node = node.getParent();
}
}
}
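/** Pre-sums the per-node counts on the map side to cut down the data shuffled to the reducers. */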
public static class InitCombiner extends Reducer<Text,LongWritable,Text,LongWritable> {
private LongWritable outputVal = new LongWritable();
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long sum = 0;
for (LongWritable l : values) {
sum += l.get();
}
outputVal.set(sum);
context.write(key, outputVal);
}
}
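/**
 * Sums the final count for each trie node and writes it as Fluo-formatted Accumulo key/values
 * for the {@link Constants#COUNT_SEEN_COL} column, producing files suitable for bulk import.
 */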
public static class InitReducer extends Reducer<Text,LongWritable,Key,Value> {
private FluoKeyValueGenerator fkvg = new FluoKeyValueGenerator();
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long sum = 0;
for (LongWritable l : values) {
sum += l.get();
}
fkvg.setRow(key).setColumn(Constants.COUNT_SEEN_COL).setValue(sum + "");
FluoKeyValue[] kvs = fkvg.getKeyValues();
for (FluoKeyValue kv : kvs) {
context.write(kv.getKey(), kv.getValue());
}
}
}
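/**
 * Reads the node size and stop level from the Fluo application configuration, dedups the input
 * numbers into {@code <tmp dir>/nums}, then builds and bulk-loads the initial tree.
 */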
@Override
public int run(String[] args) throws Exception {
if (args.length != 3) {
System.err.println("Usage: " + this.getClass().getSimpleName() + " <fluoProps> <input dir> <tmp dir>");
System.exit(-1);
}
FluoConfiguration props = new FluoConfiguration(new File(args[0]));
Path input = new Path(args[1]);
Path tmp = new Path(args[2]);
int stopLevel;
int nodeSize;
try (FluoClient client = FluoFactory.newClient(props)) {
nodeSize = client.getAppConfiguration().getInt(Constants.NODE_SIZE_PROP);
stopLevel = client.getAppConfiguration().getInt(Constants.STOP_LEVEL_PROP);
}
int ret = unique(input, new Path(tmp, "nums"));
if (ret != 0)
return ret;
return buildTree(nodeSize, props, tmp, stopLevel);
}
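/**
 * Runs a job that relies on the default identity mapper plus {@link UniqueReducer} to produce
 * one record per distinct number, written as a sequence file under the given output path.
 */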
private int unique(Path input, Path tmp) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(Init.class);
job.setJobName(Init.class.getName() + "_unique");
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.addInputPath(job, input);
job.setReducerClass(UniqueReducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job, tmp);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
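/**
 * Runs the job that computes a count per trie node, writes Accumulo RFiles partitioned on the
 * table's current split points, and bulk-imports them into the Fluo table.
 */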
private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(Init.class);
job.setJobName(Init.class.getName() + "_load");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));
job.setMapperClass(InitMapper.class);
job.setCombinerClass(InitCombiner.class);
job.setReducerClass(InitReducer.class);
job.setOutputFormatClass(AccumuloFileOutputFormat.class);
job.setPartitionerClass(RangePartitioner.class);
FileSystem fs = FileSystem.get(job.getConfiguration());
Connector conn = AccumuloUtil.getConnector(props);
Path splitsPath = new Path(tmp, "splits.txt");
Collection<Text> splits = writeSplits(props, fs, conn, splitsPath);
RangePartitioner.setSplitFile(job, splitsPath.toString());
job.setNumReduceTasks(splits.size() + 1);
Path outPath = new Path(tmp, "out");
AccumuloFileOutputFormat.setOutputPath(job, outPath);
boolean success = job.waitForCompletion(true);
if (success) {
Path failPath = new Path(tmp, "failures");
fs.mkdirs(failPath);
conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(), failPath.toString(), false);
}
return success ? 0 : 1;
}
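/**
 * Writes the table's current split points, Base64 encoded one per line, to {@code splitsPath}
 * for the {@link RangePartitioner}, and returns them so the caller can size the reducers.
 */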
private Collection<Text> writeSplits(FluoConfiguration props, FileSystem fs, Connector conn, Path splitsPath) throws Exception {
Collection<Text> splits = conn.tableOperations().listSplits(props.getAccumuloTable());
try (OutputStream out = new BufferedOutputStream(fs.create(splitsPath))) {
  for (Text split : splits) {
    out.write(Base64.encodeBase64(split.copyBytes()));
    out.write('\n');
  }
}
return splits;
}
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new Init(), args);
System.exit(ret);
}
}