/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.apache.avro.mapreduce;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapred.FsInput;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumReader;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
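/**
* Tests Avro MapReduce word-count style jobs, covering generic, specific, and reflect records
* for job input, map output, and job output.
*/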
public class TestWordCount {
@Rule
public TemporaryFolder tmpFolder = new TemporaryFolder();
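// Schema for the "stats" record used by the generic-record test: an int count and a string name.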
public static final Schema STATS_SCHEMA = new Schema.Parser().parse("{\"name\":\"stats\",\"type\":\"record\","
+ "\"fields\":[{\"name\":\"count\",\"type\":\"int\"}," + "{\"name\":\"name\",\"type\":\"string\"}]}");
public static class ReflectStats {
String name;
int count;
}
// permit data written as the specific TextStats record to be read as ReflectStats
private static Schema REFLECT_STATS_SCHEMA = ReflectData.get().getSchema(ReflectStats.class);
static {
REFLECT_STATS_SCHEMA.addAlias(TextStats.SCHEMA$.getFullName());
}
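/**
* Emits each input line as a key with a count of one.
*/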
private static class LineCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
private IntWritable mOne;
@Override
protected void setup(Context context) {
mOne = new IntWritable(1);
}
@Override
protected void map(LongWritable fileByteOffset, Text line, Context context)
throws IOException, InterruptedException {
context.write(line, mOne);
}
}
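/**
* Emits the name and count of each input TextStats record.
*/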
private static class StatCountMapper extends Mapper<AvroKey<TextStats>, NullWritable, Text, IntWritable> {
private IntWritable mCount;
private Text mText;
@Override
protected void setup(Context context) {
mCount = new IntWritable(0);
mText = new Text("");
}
@Override
protected void map(AvroKey<TextStats> record, NullWritable ignore, Context context)
throws IOException, InterruptedException {
mCount.set(record.datum().getCount());
mText.set(record.datum().getName().toString());
context.write(mText, mCount);
}
}
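/**
* Emits the name and count of each input ReflectStats record.
*/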
private static class ReflectCountMapper extends Mapper<AvroKey<ReflectStats>, NullWritable, Text, IntWritable> {
private IntWritable mCount;
private Text mText;
@Override
protected void setup(Context context) {
mCount = new IntWritable(0);
mText = new Text("");
}
@Override
protected void map(AvroKey<ReflectStats> record, NullWritable ignore, Context context)
throws IOException, InterruptedException {
mCount.set(record.datum().count);
mText.set(record.datum().name);
context.write(mText, mCount);
}
}
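/**
* Sums the counts for each key and emits the total as an Avro string key and integer value.
*/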
private static class AvroSumReducer extends Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {
@Override
protected void reduce(Text key, Iterable<IntWritable> counts, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable count : counts) {
sum += count.get();
}
context.write(new AvroKey<>(key.toString()), new AvroValue<>(sum));
}
}
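/**
* Sums the counts for each line and emits the total as a generic "stats" record.
*/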
private static class GenericStatsReducer
extends Reducer<Text, IntWritable, AvroKey<GenericData.Record>, NullWritable> {
private AvroKey<GenericData.Record> mStats;
@Override
protected void setup(Context context) {
mStats = new AvroKey<>(null);
}
@Override
protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
throws IOException, InterruptedException {
GenericData.Record record = new GenericData.Record(STATS_SCHEMA);
int sum = 0;
for (IntWritable count : counts) {
sum += count.get();
}
record.put("name", new Utf8(line.toString()));
record.put("count", sum);
mStats.datum(record);
context.write(mStats, NullWritable.get());
}
}
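/**
* Sums the counts for each line and emits the total as a specific TextStats record.
*/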
private static class SpecificStatsReducer extends Reducer<Text, IntWritable, AvroKey<TextStats>, NullWritable> {
private AvroKey<TextStats> mStats;
@Override
protected void setup(Context context) {
mStats = new AvroKey<>(null);
}
@Override
protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
throws IOException, InterruptedException {
TextStats record = new TextStats();
record.setCount(0);
for (IntWritable count : counts) {
record.setCount(record.getCount() + count.get());
}
record.setName(line.toString());
mStats.datum(record);
context.write(mStats, NullWritable.get());
}
}
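/**
* Sums the counts for each line and emits the total as a ReflectStats record.
*/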
private static class ReflectStatsReducer extends Reducer<Text, IntWritable, AvroKey<ReflectStats>, NullWritable> {
private AvroKey<ReflectStats> mStats;
@Override
protected void setup(Context context) {
mStats = new AvroKey<>(null);
}
@Override
protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
throws IOException, InterruptedException {
ReflectStats record = new ReflectStats();
record.count = 0;
for (IntWritable count : counts) {
record.count += count.get();
}
record.name = line.toString();
mStats.datum(record);
context.write(mStats, NullWritable.get());
}
}
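/**
* Identity mapper that passes TextStats records through as Avro map output keys.
*/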
private static class SortMapper extends Mapper<AvroKey<TextStats>, NullWritable, AvroKey<TextStats>, NullWritable> {
@Override
protected void map(AvroKey<TextStats> key, NullWritable value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
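/**
* Emits each distinct TextStats key once, discarding the null values.
*/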
private static class SortReducer extends Reducer<AvroKey<TextStats>, NullWritable, AvroKey<TextStats>, NullWritable> {
@Override
protected void reduce(AvroKey<TextStats> key, Iterable<NullWritable> ignore, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
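/**
* Counts the lines of a text file and verifies the job output written as generic Avro records.
*/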
@Test
public void testAvroGenericOutput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(LineCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(GenericStatsReducer.class);
AvroJob.setOutputKeySchema(job, STATS_SCHEMA);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-generic");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<GenericData.Record> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new GenericDatumReader<>(STATS_SCHEMA));
Map<String, Integer> counts = new HashMap<>();
for (GenericData.Record record : reader) {
counts.put(((Utf8) record.get("name")).toString(), (Integer) record.get("count"));
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
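/**
* Counts the lines of a text file and verifies the job output written as specific TextStats records.
*/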
@Test
public void testAvroSpecificOutput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(LineCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(SpecificStatsReducer.class);
AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<TextStats> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new SpecificDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (TextStats record : reader) {
counts.put(record.getName().toString(), record.getCount());
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
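/**
* Counts the lines of a text file and verifies the job output written as reflect ReflectStats records.
*/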
@Test
public void testAvroReflectOutput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(LineCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(ReflectStatsReducer.class);
AvroJob.setOutputKeySchema(job, REFLECT_STATS_SCHEMA);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-reflect");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<ReflectStats> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new ReflectDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (ReflectStats record : reader) {
counts.put(record.name, record.count);
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
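/**
* Reads TextStats records from an Avro container file as job input and verifies the re-aggregated counts.
*/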
@Test
public void testAvroInput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
job.setInputFormatClass(AvroKeyInputFormat.class);
AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);
job.setMapperClass(StatCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(SpecificStatsReducer.class);
AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<TextStats> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new SpecificDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (TextStats record : reader) {
counts.put(record.getName().toString(), record.getCount());
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
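/**
* Reads the Avro input through the reflect API, relying on the schema alias to map TextStats data onto ReflectStats.
*/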
@Test
public void testReflectInput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
job.setInputFormatClass(AvroKeyInputFormat.class);
AvroJob.setInputKeySchema(job, REFLECT_STATS_SCHEMA);
job.setMapperClass(ReflectCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(ReflectStatsReducer.class);
AvroJob.setOutputKeySchema(job, REFLECT_STATS_SCHEMA);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-reflect-input");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<ReflectStats> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new ReflectDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (ReflectStats record : reader) {
counts.put(record.name, record.count);
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
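/**
* Uses an Avro record schema for the map output key and verifies the TextStats records written back out.
*/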
@Test
public void testAvroMapOutput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.avro").toURI().toString()));
job.setInputFormatClass(AvroKeyInputFormat.class);
AvroJob.setInputKeySchema(job, TextStats.SCHEMA$);
job.setMapperClass(SortMapper.class);
AvroJob.setMapOutputKeySchema(job, TextStats.SCHEMA$);
job.setMapOutputValueClass(NullWritable.class);
job.setReducerClass(SortReducer.class);
AvroJob.setOutputKeySchema(job, TextStats.SCHEMA$);
job.setOutputFormatClass(AvroKeyOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-specific-input");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
DataFileReader<TextStats> reader = new DataFileReader<>(
new FsInput(outputFiles[0].getPath(), job.getConfiguration()), new SpecificDatumReader<>());
Map<String, Integer> counts = new HashMap<>();
for (TextStats record : reader) {
counts.put(record.getName().toString(), record.getCount());
}
reader.close();
Assert.assertEquals(3, counts.get("apple").intValue());
Assert.assertEquals(2, counts.get("banana").intValue());
Assert.assertEquals(1, counts.get("carrot").intValue());
}
/**
* Tests the MR output to text files when using AvroKey and AvroValue records.
*/
@Test
public void testAvroUsingTextFileOutput() throws Exception {
Job job = Job.getInstance();
FileInputFormat.setInputPaths(job,
new Path(getClass().getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt").toURI().toString()));
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(LineCountMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(AvroSumReducer.class);
AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));
job.setOutputFormatClass(TextOutputFormat.class);
Path outputPath = new Path(tmpFolder.getRoot().getPath() + "/out-text");
FileOutputFormat.setOutputPath(job, outputPath);
Assert.assertTrue(job.waitForCompletion(true));
// Check that the results from the MapReduce were as expected.
FileSystem fileSystem = FileSystem.get(job.getConfiguration());
FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/part-*"));
Assert.assertEquals(1, outputFiles.length);
Path filePath = outputFiles[0].getPath();
InputStream inputStream = filePath.getFileSystem(job.getConfiguration()).open(filePath);
Assert.assertNotNull(inputStream);
try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream))) {
Assert.assertTrue(reader.ready());
Assert.assertEquals("apple\t3", reader.readLine());
Assert.assertEquals("banana\t2", reader.readLine());
Assert.assertEquals("carrot\t1", reader.readLine());
Assert.assertFalse(reader.ready());
}
}
}