/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.pig.builtin;

import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map.Entry;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Record;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.pig.LoadPushDown;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.impl.util.avro.AvroRecordWriter;
import org.apache.pig.impl.util.avro.AvroStorageDataConversionUtilities;
import org.apache.trevni.ColumnFileMetaData;
import org.apache.trevni.MetaData;
import org.apache.trevni.avro.AvroColumnReader;
import org.apache.trevni.avro.AvroColumnWriter;
import org.apache.trevni.avro.AvroTrevniOutputFormat;
import org.apache.trevni.avro.HadoopInput;
import com.google.common.collect.Lists;

/**
*
* Pig Store/Load Function for Trevni.
*
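* <p>A minimal usage sketch from Pig Latin (paths and alias names are
* illustrative):</p>
*
* <pre>
* records = LOAD 'input' USING TrevniStorage();
* STORE records INTO 'output' USING TrevniStorage();
* </pre>
*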
*/
public class TrevniStorage extends AvroStorage implements LoadPushDown {
/**
* Creates a new instance of TrevniStorage with no arguments (useful
* for loading files without specifying parameters).
*/
public TrevniStorage() {
super();
}
/**
* Creates a new instance of TrevniStorage.
* @param sn Specifies the input/output schema or record name.
* @param opts Options for AvroStorage (see the example below):
* <ul>
* <li><code>-namespace</code> Namespace for an automatically generated
* output schema.</li>
* <li><code>-schemafile</code> URL of an Avro schema file from which to
* read the input schema (can be a local file, HDFS, URL, etc.).</li>
* <li><code>-examplefile</code> URL of an Avro data file from which to
* copy the input schema (can be a local file, HDFS, URL, etc.).</li>
* <li><code>-allowrecursive</code> Allow recursive schema definitions
* (default is false).</li>
* </ul>
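*
* <p>For example (record name, namespace, and output path are
* illustrative):</p>
*
* <pre>
* STORE records INTO 'output'
* USING TrevniStorage('my_record', '-namespace com.example');
* </pre>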
*/
public TrevniStorage(final String sn, final String opts) {
super(sn, opts);
}
/*
* @see org.apache.pig.LoadFunc#getInputFormat()
*/
@Override
public InputFormat<NullWritable, GenericData.Record> getInputFormat()
throws IOException {
class TrevniStorageInputFormat
extends PigFileInputFormat<NullWritable, GenericData.Record> {
// Each Trevni file is read by a single task; input files are not split.
@Override protected boolean isSplitable(JobContext jc, Path p) {
return false;
}
@Override protected List<FileStatus> listStatus(final JobContext job)
throws IOException {
List<FileStatus> results = Lists.newArrayList();
// List the input directories recursively; only visible (non-hidden)
// files are kept below.
job.getConfiguration().setBoolean(MRConfiguration.INPUT_DIR_RECURSIVE, true);
for (FileStatus file : super.listStatus(job)) {
if (Utils.VISIBLE_FILES.accept(file.getPath())) {
results.add(file);
}
}
return results;
}
@Override
public RecordReader<NullWritable, GenericData.Record>
createRecordReader(final InputSplit is, final TaskAttemptContext tc)
throws IOException, InterruptedException {
RecordReader<NullWritable, GenericData.Record> rr =
new RecordReader<NullWritable, GenericData.Record>() {
private FileSplit fsplit;
private AvroColumnReader.Params params;
private AvroColumnReader<GenericData.Record> reader;
private float rows;
private long row = 0;
private GenericData.Record currentRecord = null;
@Override
public void close() throws IOException {
reader.close();
}
@Override
public NullWritable getCurrentKey()
throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public Record getCurrentValue()
throws IOException, InterruptedException {
return currentRecord;
}
@Override
public float getProgress()
throws IOException, InterruptedException {
// Guard against division by zero for empty input files.
return (rows > 0) ? (row / rows) : 1.0f;
}
@Override
public void initialize(final InputSplit isplit,
final TaskAttemptContext tac)
throws IOException, InterruptedException {
fsplit = (FileSplit) isplit;
params = new AvroColumnReader.Params(
new HadoopInput(fsplit.getPath(), tac.getConfiguration()));
Schema inputSchema = getInputAvroSchema();
params.setSchema(inputSchema);
reader = new AvroColumnReader<GenericData.Record>(params);
rows = reader.getRowCount();
}
@Override
public boolean nextKeyValue()
throws IOException, InterruptedException {
if (reader.hasNext()) {
currentRecord = reader.next();
row++;
return true;
} else {
return false;
}
}
};
// Note: rr.initialize(is, tc) is invoked by the MapReduce framework,
// so it is not called here.
tc.setStatus(is.toString());
return rr;
}
}
return new TrevniStorageInputFormat();
}
/*
* @see org.apache.pig.StoreFuncInterface#getOutputFormat()
*/
@Override
public OutputFormat<NullWritable, Object> getOutputFormat()
throws IOException {
class TrevniStorageOutputFormat
extends FileOutputFormat<NullWritable, Object> {
private Schema schema;
TrevniStorageOutputFormat(final Schema s) {
schema = s;
if (s == null) {
String schemaString = getProperties(
AvroStorage.class, udfContextSignature)
.getProperty(OUTPUT_AVRO_SCHEMA);
if (schemaString != null) {
schema = (new Schema.Parser()).parse(schemaString);
}
}
}
@Override
public RecordWriter<NullWritable, Object>
getRecordWriter(final TaskAttemptContext tc)
throws IOException, InterruptedException {
if (schema == null) {
String schemaString = getProperties(
AvroStorage.class, udfContextSignature)
.getProperty(OUTPUT_AVRO_SCHEMA);
if (schemaString != null) {
schema = (new Schema.Parser()).parse(schemaString);
}
if (schema == null) {
throw new IOException("Null output schema");
}
}
final ColumnFileMetaData meta = new ColumnFileMetaData();
// Copy Trevni metadata entries ("trevni.meta.*") from the job
// configuration into the column file metadata, stripping the prefix.
for (Entry<String, String> e : tc.getConfiguration()) {
if (e.getKey().startsWith(AvroTrevniOutputFormat.META_PREFIX)) {
meta.put(
e.getKey().substring(AvroTrevniOutputFormat.META_PREFIX.length()),
e.getValue().getBytes(MetaData.UTF8));
}
}
final Path dir = getOutputPath(tc);
final FileSystem fs = FileSystem.get(tc.getConfiguration());
final long blockSize = fs.getDefaultBlockSize();
if (!fs.mkdirs(dir)) {
throw new IOException("Failed to create directory: " + dir);
}
meta.setCodec("deflate");
return new AvroRecordWriter(dir, tc.getConfiguration()) {
private int part = 0;
private Schema avroRecordWriterSchema;
private AvroColumnWriter<GenericData.Record> writer;
// Writes the buffered column data to a new part file, named with the
// task id and a per-writer part counter.
private void flush() throws IOException {
int taskId = tc.getTaskAttemptID().getTaskID().getId();
String partName = String.format("%05d_%03d", taskId, part++);
OutputStream out = fs.create(
new Path(dir, "part-" + partName + AvroTrevniOutputFormat.EXT));
try {
writer.writeTo(out);
} finally {
out.flush();
out.close();
}
}
@Override
public void close(final TaskAttemptContext arg0)
throws IOException, InterruptedException {
flush();
}
@Override
public void write(final NullWritable n, final Object o)
throws IOException, InterruptedException {
GenericData.Record r =
AvroStorageDataConversionUtilities
.packIntoAvro((Tuple) o, schema);
writer.write(r);
// Roll over to a new part file once the buffered data reaches the
// file system's default block size.
if (writer.sizeEstimate() >= blockSize) {
flush();
writer = new AvroColumnWriter<GenericData.Record>(
avroRecordWriterSchema, meta);
}
}
@Override
public void prepareToWrite(Schema s) throws IOException {
avroRecordWriterSchema = s;
writer = new AvroColumnWriter<GenericData.Record>(
avroRecordWriterSchema, meta);
}
};
}
}
return new TrevniStorageOutputFormat(schema);
}
@Override
public Schema getAvroSchema(final Path[] p, final Job job) throws IOException {
List<FileStatus> statusList = new ArrayList<FileStatus>();
FileSystem fs = FileSystem.get(p[0].toUri(), job.getConfiguration());
for (Path temp : p) {
for (FileStatus tempf : fs.globStatus(temp, Utils.VISIBLE_FILES)) {
statusList.add(tempf);
}
}
FileStatus[] statusArray =
statusList.toArray(new FileStatus[statusList.size()]);
if (statusArray.length == 0) {
throw new IOException("No path matches pattern " + Arrays.toString(p));
}
Path filePath = Utils.depthFirstSearchForFile(statusArray, fs);
if (filePath == null) {
throw new IOException("No path matches pattern " + p.toString());
}
AvroColumnReader.Params params =
new AvroColumnReader.Params(
new HadoopInput(filePath, job.getConfiguration()));
AvroColumnReader<GenericData.Record> reader =
new AvroColumnReader<GenericData.Record>(params);
Schema s = reader.getFileSchema();
reader.close();
return s;
}
}