parquet-avro/src/test/java/parquet/avro/TestInputOutputFormat.java - parquet-java - Git at Google

 /**
  * Copyright 2012 Twitter, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package parquet.avro;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.util.Arrays;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.junit.Test;
 import parquet.Log;

 import static java.lang.Thread.sleep;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;

 public class TestInputOutputFormat {
   private static final Log LOG = Log.getLog(TestInputOutputFormat.class);

   private static Schema avroSchema;
   static {
     avroSchema = Schema.createRecord("record1", null, null, false);
     avroSchema.setFields(
         Arrays.asList(new Schema.Field("a",
             Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.INT), Schema.create(Schema.Type.NULL))),
             null, null)));
   }

   public static GenericRecord nextRecord(Integer i) {
     return new GenericRecordBuilder(avroSchema).set("a", i).build();
   };

   public static class MyMapper extends Mapper<LongWritable, Text, Void, GenericRecord> {

     public void run(Context context) throws IOException ,InterruptedException {
       for (int i = 0; i < 10; i++) {
         GenericRecord a;
         a = TestInputOutputFormat.nextRecord(i == 4 ? null : i);
         context.write(null, a);
       }
     }
   }

   public static class MyMapper2 extends Mapper<Void, GenericRecord, LongWritable, Text> {
     protected void map(Void key, GenericRecord value, Context context) throws IOException ,InterruptedException {
       context.write(null, new Text(value.toString()));
     }

   }

   @Test
   public void testReadWrite() throws Exception {

     final Configuration conf = new Configuration();
     final Path inputPath = new Path("src/test/java/parquet/avro/TestInputOutputFormat.java");
     final Path parquetPath = new Path("target/test/hadoop/TestInputOutputFormat/parquet");
     final Path outputPath = new Path("target/test/hadoop/TestInputOutputFormat/out");
     final FileSystem fileSystem = parquetPath.getFileSystem(conf);
     fileSystem.delete(parquetPath, true);
     fileSystem.delete(outputPath, true);
     {
       final Job job = new Job(conf, "write");

       // input not really used
       TextInputFormat.addInputPath(job, inputPath);
       job.setInputFormatClass(TextInputFormat.class);

       job.setMapperClass(TestInputOutputFormat.MyMapper.class);
       job.setNumReduceTasks(0);

       job.setOutputFormatClass(AvroParquetOutputFormat.class);
       AvroParquetOutputFormat.setOutputPath(job, parquetPath);
       AvroParquetOutputFormat.setSchema(job, avroSchema);

       waitForJob(job);
     }
     {
       final Job job = new Job(conf, "read");
       job.setInputFormatClass(AvroParquetInputFormat.class);
       AvroParquetInputFormat.setInputPaths(job, parquetPath);

       job.setMapperClass(TestInputOutputFormat.MyMapper2.class);
       job.setNumReduceTasks(0);

       job.setOutputFormatClass(TextOutputFormat.class);
       TextOutputFormat.setOutputPath(job, outputPath);

       waitForJob(job);
     }

     final BufferedReader out = new BufferedReader(new FileReader(new File(outputPath.toString(), "part-m-00000")));
     String lineOut = null;
     int lineNumber = 0;
     while ((lineOut = out.readLine()) != null) {
       lineOut = lineOut.substring(lineOut.indexOf("\t") + 1);
       GenericRecord a = nextRecord(lineNumber == 4 ? null : lineNumber);
       assertEquals("line " + lineNumber, a.toString(), lineOut);
       ++ lineNumber;
     }
     assertNull("line " + lineNumber, out.readLine());
     out.close();
   }

   private void waitForJob(Job job) throws Exception {
     job.submit();
     while (!job.isComplete()) {
       LOG.debug("waiting for job " + job.getJobName());
       sleep(100);
     }
     LOG.info("status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
     if (!job.isSuccessful()) {
       throw new RuntimeException("job failed " + job.getJobName());
     }
   }

 }
	/**
	* Copyright 2012 Twitter, Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package parquet.avro;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileReader;
	import java.io.IOException;
	import java.util.Arrays;
	import org.apache.avro.Schema;
	import org.apache.avro.generic.GenericRecord;
	import org.apache.avro.generic.GenericRecordBuilder;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.io.LongWritable;
	import org.apache.hadoop.io.Text;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.Mapper;
	import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
	import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
	import org.junit.Test;
	import parquet.Log;

	import static java.lang.Thread.sleep;
	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNull;

	public class TestInputOutputFormat {
	private static final Log LOG = Log.getLog(TestInputOutputFormat.class);

	private static Schema avroSchema;
	static {
	avroSchema = Schema.createRecord("record1", null, null, false);
	avroSchema.setFields(
	Arrays.asList(new Schema.Field("a",
	Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.INT), Schema.create(Schema.Type.NULL))),
	null, null)));
	}

	public static GenericRecord nextRecord(Integer i) {
	return new GenericRecordBuilder(avroSchema).set("a", i).build();
	};

	public static class MyMapper extends Mapper<LongWritable, Text, Void, GenericRecord> {

	public void run(Context context) throws IOException ,InterruptedException {
	for (int i = 0; i < 10; i++) {
	GenericRecord a;
	a = TestInputOutputFormat.nextRecord(i == 4 ? null : i);
	context.write(null, a);
	}
	}
	}

	public static class MyMapper2 extends Mapper<Void, GenericRecord, LongWritable, Text> {
	protected void map(Void key, GenericRecord value, Context context) throws IOException ,InterruptedException {
	context.write(null, new Text(value.toString()));
	}

	}

	@Test
	public void testReadWrite() throws Exception {

	final Configuration conf = new Configuration();
	final Path inputPath = new Path("src/test/java/parquet/avro/TestInputOutputFormat.java");
	final Path parquetPath = new Path("target/test/hadoop/TestInputOutputFormat/parquet");
	final Path outputPath = new Path("target/test/hadoop/TestInputOutputFormat/out");
	final FileSystem fileSystem = parquetPath.getFileSystem(conf);
	fileSystem.delete(parquetPath, true);
	fileSystem.delete(outputPath, true);
	{
	final Job job = new Job(conf, "write");

	// input not really used
	TextInputFormat.addInputPath(job, inputPath);
	job.setInputFormatClass(TextInputFormat.class);

	job.setMapperClass(TestInputOutputFormat.MyMapper.class);
	job.setNumReduceTasks(0);

	job.setOutputFormatClass(AvroParquetOutputFormat.class);
	AvroParquetOutputFormat.setOutputPath(job, parquetPath);
	AvroParquetOutputFormat.setSchema(job, avroSchema);

	waitForJob(job);
	}
	{
	final Job job = new Job(conf, "read");
	job.setInputFormatClass(AvroParquetInputFormat.class);
	AvroParquetInputFormat.setInputPaths(job, parquetPath);

	job.setMapperClass(TestInputOutputFormat.MyMapper2.class);
	job.setNumReduceTasks(0);

	job.setOutputFormatClass(TextOutputFormat.class);
	TextOutputFormat.setOutputPath(job, outputPath);

	waitForJob(job);
	}

	final BufferedReader out = new BufferedReader(new FileReader(new File(outputPath.toString(), "part-m-00000")));
	String lineOut = null;
	int lineNumber = 0;
	while ((lineOut = out.readLine()) != null) {
	lineOut = lineOut.substring(lineOut.indexOf("\t") + 1);
	GenericRecord a = nextRecord(lineNumber == 4 ? null : lineNumber);
	assertEquals("line " + lineNumber, a.toString(), lineOut);
	++ lineNumber;
	}
	assertNull("line " + lineNumber, out.readLine());
	out.close();
	}

	private void waitForJob(Job job) throws Exception {
	job.submit();
	while (!job.isComplete()) {
	LOG.debug("waiting for job " + job.getJobName());
	sleep(100);
	}
	LOG.info("status for job " + job.getJobName() + ": " + (job.isSuccessful() ? "SUCCESS" : "FAILURE"));
	if (!job.isSuccessful()) {
	throw new RuntimeException("job failed " + job.getJobName());
	}
	}

	}