/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
package org.apache.trevni.avro.mapreduce;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.trevni.avro.WordCountUtil;
import org.junit.Test;
| |
| public class TestKeyValueWordCount { |
| |
| private static long total = 0; |
| |
| static final Schema STRING = Schema.create(Schema.Type.STRING); |
| static { GenericData.setStringType(STRING, GenericData.StringType.String); } |
| static final Schema LONG = Schema.create(Schema.Type.LONG); |
| |
| private static class WordCountMapper extends |
| Mapper<AvroKey<String>, NullWritable, Text, LongWritable> { |
| private LongWritable mCount = new LongWritable(); |
| private Text mText = new Text(); |
| |
| @Override |
| protected void setup(Context context) { |
| mCount.set(1); |
| } |
| |
| @Override |
| protected void map(AvroKey<String> key, NullWritable value, Context context) |
| throws IOException, InterruptedException { |
| |
| try { |
| StringTokenizer tokens = new StringTokenizer(key.datum()); |
| while (tokens.hasMoreTokens()) { |
| mText.set(tokens.nextToken()); |
| context.write(mText, mCount); |
| } |
| } catch (Exception e) { |
| throw new RuntimeException(key + " " + key.datum(), e); |
| } |
| |
| } |
| } |
| |
| private static class WordCountReducer extends Reducer< Text, LongWritable, AvroKey<String>, AvroValue<Long>> { |
| |
| AvroKey<String> resultKey = new AvroKey<String>(); |
| AvroValue<Long> resultValue = new AvroValue<Long>(); |
| |
| @Override |
| protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException { |
| long sum = 0; |
| for (LongWritable value: values) { |
| sum += value.get(); |
| } |
| resultKey.datum(key.toString()); |
| resultValue.datum(sum); |
| |
| context.write(resultKey, resultValue); |
| } |
| } |
| |
| public static class Counter extends |
| Mapper<AvroKey<String>, AvroValue<Long>, NullWritable, NullWritable> { |
| @Override |
| protected void map(AvroKey<String> key, AvroValue<Long> value, Context context) |
| throws IOException, InterruptedException { |
| total += value.datum(); |
| } |
| } |
| |
| @Test public void testIOFormat() throws Exception { |
| checkOutputFormat(); |
| checkInputFormat(); |
| } |
| |
| public void checkOutputFormat() throws Exception { |
| Job job = new Job(); |
| |
| WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyValueTest", "part-r-00000"); |
| |
| wordCountUtil.writeLinesFile(); |
| |
| AvroJob.setInputKeySchema(job, STRING); |
| AvroJob.setOutputKeySchema(job, STRING); |
| AvroJob.setOutputValueSchema(job, LONG); |
| |
| job.setMapperClass(WordCountMapper.class); |
| job.setReducerClass(WordCountReducer.class); |
| |
| job.setMapOutputKeyClass(Text.class); |
| job.setMapOutputValueClass(LongWritable.class); |
| |
| FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in")); |
| FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out")); |
| FileOutputFormat.setCompressOutput(job, true); |
| |
| job.setInputFormatClass(AvroKeyInputFormat.class); |
| job.setOutputFormatClass(AvroTrevniKeyValueOutputFormat.class); |
| |
| job.waitForCompletion(true); |
| |
| wordCountUtil.validateCountsFileGenericRecord(); |
| } |
| |
| public void checkInputFormat() throws Exception { |
| Job job = new Job(); |
| |
| WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyValueTest"); |
| |
| job.setMapperClass(Counter.class); |
| |
| FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*")); |
| job.setInputFormatClass(AvroTrevniKeyValueInputFormat.class); |
| |
| job.setNumReduceTasks(0); |
| job.setOutputFormatClass(NullOutputFormat.class); |
| |
| total = 0; |
| job.waitForCompletion(true); |
| assertEquals(WordCountUtil.TOTAL, total); |
| |
| } |
| } |