/*
* Copyright 2014 Stresso authors (see AUTHORS)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package stresso.trie;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collection;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.mapreduce.AccumuloFileOutputFormat;
import org.apache.accumulo.core.client.mapreduce.lib.partition.RangePartitioner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.commons.codec.binary.Base64;
import org.apache.fluo.api.client.FluoClient;
import org.apache.fluo.api.client.FluoFactory;
import org.apache.fluo.api.config.FluoConfiguration;
import org.apache.fluo.core.util.AccumuloUtil;
import org.apache.fluo.mapreduce.FluoKeyValue;
import org.apache.fluo.mapreduce.FluoKeyValueGenerator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
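/**
 * MapReduce tool that bootstraps the stresso trie table. It reads sequence files of
 * {@link LongWritable} numbers, dedups them, computes an initial count for every trie node up to
 * the configured stop level, and bulk-imports the results into the Fluo/Accumulo table.
 *
 * <p>Illustrative invocation (jar name and paths are examples only, not part of this code):
 * <pre>hadoop jar stresso-shaded.jar stresso.trie.Init fluo.properties /stresso/numbers /stresso/init-tmp</pre>
 */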
public class Init extends Configured implements Tool {
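// Hadoop job configuration keys used to pass the trie parameters from the driver to the tasks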
public static final String TRIE_STOP_LEVEL_PROP = FluoConfiguration.FLUO_PREFIX + ".stress.trie.stopLevel";
public static final String TRIE_NODE_SIZE_PROP = FluoConfiguration.FLUO_PREFIX + ".stress.trie.node.size";
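/**
 * Emits each distinct input number exactly once; duplicates collapse because the number itself
 * is the reduce key.
 */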
public static class UniqueReducer extends Reducer<LongWritable,NullWritable,LongWritable,NullWritable> {
@Override
protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
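/**
 * For each unique number, walks from its leaf trie node up to the configured stop level,
 * emitting every node's row id with a count of one.
 */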
public static class InitMapper extends Mapper<LongWritable,NullWritable,Text,LongWritable> {
private int stopLevel;
private int nodeSize;
private static final LongWritable ONE = new LongWritable(1);
private Text outputKey = new Text();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
nodeSize = context.getConfiguration().getInt(TRIE_NODE_SIZE_PROP, 0);
stopLevel = context.getConfiguration().getInt(TRIE_STOP_LEVEL_PROP, 0);
}
@Override
protected void map(LongWritable key, NullWritable val, Context context) throws IOException, InterruptedException {
Node node = new Node(key.get(), 64 / nodeSize, nodeSize);
while (node != null) {
outputKey.set(node.getRowId());
context.write(outputKey, ONE);
if (node.getLevel() <= stopLevel)
node = null;
else
node = node.getParent();
}
}
}
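/** Pre-sums the per-node counts on the map side to cut down the data shuffled to the reducers. */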
public static class InitCombiner extends Reducer<Text,LongWritable,Text,LongWritable> {
private LongWritable outputVal = new LongWritable();
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long sum = 0;
for (LongWritable l : values) {
sum += l.get();
}
outputVal.set(sum);
context.write(key, outputVal);
}
}
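/**
 * Sums the final count for each trie node and writes it as Fluo-formatted Accumulo key/values
 * for the {@link Constants#COUNT_SEEN_COL} column, producing files suitable for bulk import.
 */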
public static class InitReducer extends Reducer<Text,LongWritable,Key,Value> {
private FluoKeyValueGenerator fkvg = new FluoKeyValueGenerator();
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
long sum = 0;
for (LongWritable l : values) {
sum += l.get();
}
fkvg.setRow(key).setColumn(Constants.COUNT_SEEN_COL).setValue(sum + "");
FluoKeyValue[] kvs = fkvg.getKeyValues();
for (FluoKeyValue kv : kvs) {
context.write(kv.getKey(), kv.getValue());
}
}
}
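/**
 * Reads the node size and stop level from the Fluo application configuration, dedups the input
 * numbers into {@code <tmp dir>/nums}, then builds and bulk-loads the initial tree.
 */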
@Override
public int run(String[] args) throws Exception {
if (args.length != 3) {
System.err.println("Usage: " + this.getClass().getSimpleName() + " <fluoProps> <input dir> <tmp dir>");
System.exit(-1);
}
FluoConfiguration props = new FluoConfiguration(new File(args[0]));
Path input = new Path(args[1]);
Path tmp = new Path(args[2]);
int stopLevel;
int nodeSize;
try (FluoClient client = FluoFactory.newClient(props)) {
nodeSize = client.getAppConfiguration().getInt(Constants.NODE_SIZE_PROP);
stopLevel = client.getAppConfiguration().getInt(Constants.STOP_LEVEL_PROP);
}
int ret = unique(input, new Path(tmp, "nums"));
if (ret != 0)
return ret;
return buildTree(nodeSize, props, tmp, stopLevel);
}
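/**
 * Runs a job that relies on the default identity mapper plus {@link UniqueReducer} to produce
 * one record per distinct number, written as a sequence file under the given output path.
 */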
private int unique(Path input, Path tmp) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(Init.class);
job.setJobName(Init.class.getName() + "_unique");
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.addInputPath(job, input);
job.setReducerClass(UniqueReducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job, tmp);
boolean success = job.waitForCompletion(true);
return success ? 0 : 1;
}
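/**
 * Runs the job that computes a count per trie node, writes Accumulo RFiles partitioned on the
 * table's current split points, and bulk-imports them into the Fluo table.
 */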
private int buildTree(int nodeSize, FluoConfiguration props, Path tmp, int stopLevel) throws Exception {
Job job = Job.getInstance(getConf());
job.setJarByClass(Init.class);
job.setJobName(Init.class.getName() + "_load");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(LongWritable.class);
job.getConfiguration().setInt(TRIE_NODE_SIZE_PROP, nodeSize);
job.getConfiguration().setInt(TRIE_STOP_LEVEL_PROP, stopLevel);
job.setInputFormatClass(SequenceFileInputFormat.class);
SequenceFileInputFormat.addInputPath(job, new Path(tmp, "nums"));
job.setMapperClass(InitMapper.class);
job.setCombinerClass(InitCombiner.class);
job.setReducerClass(InitReducer.class);
job.setOutputFormatClass(AccumuloFileOutputFormat.class);
job.setPartitionerClass(RangePartitioner.class);
FileSystem fs = FileSystem.get(job.getConfiguration());
Connector conn = AccumuloUtil.getConnector(props);
Path splitsPath = new Path(tmp, "splits.txt");
Collection<Text> splits = writeSplits(props, fs, conn, splitsPath);
RangePartitioner.setSplitFile(job, splitsPath.toString());
job.setNumReduceTasks(splits.size() + 1);
Path outPath = new Path(tmp, "out");
AccumuloFileOutputFormat.setOutputPath(job, outPath);
boolean success = job.waitForCompletion(true);
if (success) {
Path failPath = new Path(tmp, "failures");
fs.mkdirs(failPath);
conn.tableOperations().importDirectory(props.getAccumuloTable(), outPath.toString(), failPath.toString(), false);
}
return success ? 0 : 1;
}
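/**
 * Writes the table's current split points, Base64 encoded one per line, to {@code splitsPath}
 * for the {@link RangePartitioner}, and returns them so the caller can size the reducers.
 */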
private Collection<Text> writeSplits(FluoConfiguration props, FileSystem fs, Connector conn, Path splitsPath) throws Exception {
Collection<Text> splits = conn.tableOperations().listSplits(props.getAccumuloTable());
try (OutputStream out = new BufferedOutputStream(fs.create(splitsPath))) {
  for (Text split : splits) {
    out.write(Base64.encodeBase64(split.copyBytes()));
    out.write('\n');
  }
}
return splits;
}
public static void main(String[] args) throws Exception {
int ret = ToolRunner.run(new Init(), args);
System.exit(ret);
}
}