| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.trevni.avro.mapreduce; |
| |
| import static org.junit.Assert.assertEquals; |
| |
| import java.io.IOException; |
| import java.util.StringTokenizer; |
| |
| import org.apache.avro.Schema; |
| import org.apache.avro.generic.GenericData; |
| import org.apache.avro.generic.GenericData.Record; |
| import org.apache.avro.mapreduce.AvroJob; |
| import org.apache.avro.mapreduce.AvroKeyInputFormat; |
| import org.apache.avro.mapred.AvroKey; |
| import org.apache.avro.mapred.Pair; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.io.LongWritable; |
| import org.apache.hadoop.io.NullWritable; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.mapreduce.Job; |
| import org.apache.hadoop.mapreduce.Mapper; |
| import org.apache.hadoop.mapreduce.Reducer; |
| import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; |
| import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; |
| import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; |
| import org.apache.trevni.avro.WordCountUtil; |
| import org.apache.trevni.avro.mapreduce.AvroTrevniKeyOutputFormat; |
| import org.junit.Test; |
| |
| public class TestKeyWordCount { |
| |
| private static long total = 0; |
| |
| static final Schema STRING = Schema.create(Schema.Type.STRING); |
| static { GenericData.setStringType(STRING, GenericData.StringType.String); } |
| static final Schema LONG = Schema.create(Schema.Type.LONG); |
| |
| |
| private static class WordCountMapper extends |
| Mapper<AvroKey<String>, NullWritable, Text, LongWritable> { |
| private LongWritable mCount = new LongWritable(); |
| private Text mText = new Text(); |
| |
| @Override |
| protected void setup(Context context) { |
| mCount.set(1); |
| } |
| |
| @Override |
| protected void map(AvroKey<String> key, NullWritable value, Context context) |
| throws IOException, InterruptedException { |
| |
| try { |
| StringTokenizer tokens = new StringTokenizer(key.datum()); |
| while (tokens.hasMoreTokens()) { |
| mText.set(tokens.nextToken()); |
| context.write(mText, mCount); |
| } |
| } catch (Exception e) { |
| throw new RuntimeException(key + " " + key.datum() , e); |
| } |
| |
| } |
| } |
| |
| private static class WordCountReducer extends Reducer< Text, LongWritable, AvroKey<GenericData.Record>, NullWritable> { |
| |
| private AvroKey<GenericData.Record> result ; |
| |
| @Override |
| protected void setup(Context context) { |
| result = new AvroKey<GenericData.Record>(); |
| result.datum(new Record(Pair.getPairSchema(STRING,LONG))); |
| } |
| |
| @Override |
| protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { |
| long count = 0; |
| for (LongWritable value: values) { |
| count += value.get(); |
| } |
| |
| result.datum().put("key", key.toString()); |
| result.datum().put("value", count); |
| |
| context.write(result, NullWritable.get()); |
| } |
| } |
| |
| |
| |
| public static class Counter extends |
| Mapper<AvroKey<GenericData.Record>, NullWritable, NullWritable, NullWritable> { |
| @Override |
| protected void map(AvroKey<GenericData.Record> key, NullWritable value, Context context) |
| throws IOException, InterruptedException { |
| total += (Long)key.datum().get("value"); |
| } |
| } |
| |
| |
| @Test public void testIOFormat() throws Exception { |
| checkOutputFormat(); |
| checkInputFormat(); |
| } |
| |
| public void checkOutputFormat() throws Exception { |
| Job job = new Job(); |
| |
| WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000"); |
| |
| wordCountUtil.writeLinesFile(); |
| |
| AvroJob.setInputKeySchema(job, STRING); |
| AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING,LONG)); |
| |
| job.setMapperClass(WordCountMapper.class); |
| job.setReducerClass(WordCountReducer.class); |
| |
| job.setMapOutputKeyClass(Text.class); |
| job.setMapOutputValueClass(LongWritable.class); |
| |
| FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in")); |
| FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out")); |
| FileOutputFormat.setCompressOutput(job, true); |
| |
| job.setInputFormatClass(AvroKeyInputFormat.class); |
| job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class); |
| |
| job.waitForCompletion(true); |
| |
| wordCountUtil.validateCountsFile(); |
| } |
| |
| public void checkInputFormat() throws Exception { |
| Job job = new Job(); |
| |
| WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest"); |
| |
| job.setMapperClass(Counter.class); |
| |
| Schema subSchema = Schema.parse("{\"type\":\"record\"," + |
| "\"name\":\"PairValue\","+ |
| "\"fields\": [ " + |
| "{\"name\":\"value\", \"type\":\"long\"}" + |
| "]}"); |
| AvroJob.setInputKeySchema(job, subSchema); |
| |
| FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*")); |
| job.setInputFormatClass(AvroTrevniKeyInputFormat.class); |
| |
| job.setNumReduceTasks(0); |
| job.setOutputFormatClass(NullOutputFormat.class); |
| |
| total = 0; |
| job.waitForCompletion(true); |
| assertEquals(WordCountUtil.TOTAL, total); |
| |
| } |
| |
| } |