/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.trevni.avro.mapreduce;

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.trevni.avro.WordCountUtil;
import org.apache.trevni.avro.mapreduce.AvroTrevniKeyOutputFormat;
import org.junit.Test;
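
/**
 * Word-count test that exercises {@link AvroTrevniKeyOutputFormat} and
 * {@link AvroTrevniKeyInputFormat} end to end.
 */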
public class TestKeyWordCount {

  private static long total = 0;

  static final Schema STRING = Schema.create(Schema.Type.STRING);
  static {
    GenericData.setStringType(STRING, GenericData.StringType.String);
  }
  static final Schema LONG = Schema.create(Schema.Type.LONG);
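
  /** Splits each input line into words and emits a (word, 1) pair per token. */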
  private static class WordCountMapper
      extends Mapper<AvroKey<String>, NullWritable, Text, LongWritable> {
    private LongWritable mCount = new LongWritable();
    private Text mText = new Text();

    @Override
    protected void setup(Context context) {
      mCount.set(1);
    }

    @Override
    protected void map(AvroKey<String> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      try {
        StringTokenizer tokens = new StringTokenizer(key.datum());
        while (tokens.hasMoreTokens()) {
          mText.set(tokens.nextToken());
          context.write(mText, mCount);
        }
      } catch (Exception e) {
        throw new RuntimeException(key + " " + key.datum(), e);
      }
    }
  }
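
  /** Sums the per-word counts and emits each total as an Avro Pair(String, Long) record key. */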
  private static class WordCountReducer
      extends Reducer<Text, LongWritable, AvroKey<GenericData.Record>, NullWritable> {
    private AvroKey<GenericData.Record> result;

    @Override
    protected void setup(Context context) {
      result = new AvroKey<GenericData.Record>();
      result.datum(new Record(Pair.getPairSchema(STRING, LONG)));
    }

    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
      long count = 0;
      for (LongWritable value : values) {
        count += value.get();
      }
      result.datum().put("key", key.toString());
      result.datum().put("value", count);
      context.write(result, NullWritable.get());
    }
  }
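
  /** Map-only pass that adds the "value" field of each record read back into {@code total}. */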
  public static class Counter
      extends Mapper<AvroKey<GenericData.Record>, NullWritable, NullWritable, NullWritable> {
    @Override
    protected void map(AvroKey<GenericData.Record> key, NullWritable value, Context context)
        throws IOException, InterruptedException {
      total += (Long) key.datum().get("value");
    }
  }
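
  /**
   * Writes word counts as a Trevni file, then reads them back, so the output
   * format is validated before the input format consumes its result.
   */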
  @Test
  public void testIOFormat() throws Exception {
    checkOutputFormat();
    checkInputFormat();
  }
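
  /** Runs the word-count job, writing its results with {@link AvroTrevniKeyOutputFormat}. */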
  public void checkOutputFormat() throws Exception {
    Job job = new Job();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000");
    wordCountUtil.writeLinesFile();

    AvroJob.setInputKeySchema(job, STRING);
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING, LONG));

    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(WordCountReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
    FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class);

    job.waitForCompletion(true);

    wordCountUtil.validateCountsFile();
  }
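
  /**
   * Reads the Trevni output back through {@link AvroTrevniKeyInputFormat} with a
   * reader schema that projects only the "value" field, then checks the total.
   */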
  public void checkInputFormat() throws Exception {
    Job job = new Job();

    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest");

    job.setMapperClass(Counter.class);

    Schema subSchema = new Schema.Parser().parse("{\"type\":\"record\","
        + "\"name\":\"PairValue\","
        + "\"fields\": [ "
        + "{\"name\":\"value\", \"type\":\"long\"}"
        + "]}");
    AvroJob.setInputKeySchema(job, subSchema);

    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*"));
    job.setInputFormatClass(AvroTrevniKeyInputFormat.class);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);

    total = 0;
    job.waitForCompletion(true);

    assertEquals(WordCountUtil.TOTAL, total);
  }
}