/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.pig.piggybank.storage;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe;
import org.apache.hadoop.hive.serde2.columnar.ColumnarStruct;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.FileInputLoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.piggybank.storage.hiverc.HiveRCInputFormat;
import org.apache.pig.piggybank.storage.hiverc.HiveRCRecordReader;
import org.apache.pig.piggybank.storage.hiverc.HiveRCSchemaUtil;
import org.apache.pig.piggybank.storage.partition.PathPartitionHelper;
/**
* Loader for Hive RC Columnar files.<br/>
* Supports the following types:<br/>
*
* <table>
* <tr>
* <th>Hive Type</th>
* <th>Pig Type from DataType</th>
* </tr>
* <tr>
* <td>string</td>
* <td>CHARARRAY</td>
* </tr>
* <tr>
* <td>int</td>
* <td>INTEGER</td>
* </tr>
* <tr>
* <td>bigint or long</td>
* <td>LONG</td>
* </tr>
* <tr>
* <td>float</td>
* <td>FLOAT</td>
* </tr>
* <tr>
* <td>double</td>
* <td>DOUBLE</td>
* </tr>
* <tr>
* <td>boolean</td>
* <td>BOOLEAN</td>
* </tr>
* <tr>
* <td>byte</td>
* <td>BYTE</td>
* </tr>
* <tr>
* <td>array</td>
* <td>TUPLE</td>
* </tr>
* <tr>
* <td>map</td>
* <td>MAP</td>
* </tr>
* </table>
*
* <p/>
* <b>Partitions</b><br/>
* The input paths are scanned by the loader for [partition name]=[value]
* patterns in the subdirectories.<br/>
* If detected these partitions are appended to the table schema.<br/>
* For example if you have the directory structure:<br/>
*
* <pre>
* /user/hive/warehouse/mytable
* /year=2010/month=02/day=01
* </pre>
*
* The mytable schema is (id int,name string).<br/>
* The final schema returned in pig will be (id:int, name:chararray,
* year:chararray, month:chararray, day:chararray).<br/>
* <p/>
* Usage 1:
* <p/>
* To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
* MAP<String, String> <br/>
* <code>
* <pre>
* a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
* -- to reference the fields
* b = FOREACH GENERATE a.uid, a.ts, a.arr, a.m;
* </pre>
* </code>
* <p/>
* Usage 2:
* <p/>
* To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
* MAP<String, String> only processing dates 2009-10-01 to 2009-10-02 in a <br/>
* date partitioned hive table.<br/>
* <b>Old Usage</b><br/>
* <b>Note:</b> The partitions can be filtered by using pig's FILTER operator.<br/>
* <code>
* <pre>
* a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>", "2009-10-01:2009-10-02");
* -- to reference the fields
* b = FOREACH GENERATE a.uid, a.ts, a.arr, a.m;
* </pre>
* </code> <br/>
* <b>New Usage</b><br/>
* <code>
* <pre>
* a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
* f = FILTER a BY daydate>='2009-10-01' AND daydate<='2009-10-02';
* </pre>
* </code>
* <p/>
* Usage 3:
* <p/>
* To load a hive table: uid bigint, ts long, arr ARRAY<string,string>, m
* MAP<String, String> only reading columns uid and ts for dates 2009-10-01 to
* 2009-10-02.<br/> <br/>
* <b>Old Usage</b><br/>
* <b>Note:</b> This behaviour is now supported in pig through LoadPushDown;
* explicitly specifying the columns to load is ignored, and pig will
* automatically send the columns used by the script to the loader.<br/>
* <code>
* <pre>
* a = LOAD 'file' USING HiveColumnarLoader("uid bigint, ts long, arr array<string,string>, m map<string,string>");
* f = FILTER a BY daydate>='2009-10-01' AND daydate<='2009-10-02';
* -- to reference the fields
* b = FOREACH a GENERATE uid, ts, arr, m;
* </pre>
* </code>
* <p/>
* <b>Issues</b>
* <p/>
* <u>Table schema definition</u><br/>
* The schema definition must be a column name followed by a space, then the
* column type, then a comma with no space before the next column name, and so
* on.<br/>
* This means column1 string, column2 string will not work; it must be column1
* string,column2 string
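* <br/>
* For example, following this rule (hypothetical column names):
*
* <pre>
* -- works: no space after the comma in the schema string
* a = LOAD 'file' USING HiveColumnarLoader("col1 string,col2 int");
* -- will not work: space after the comma
* a = LOAD 'file' USING HiveColumnarLoader("col1 string, col2 int");
* </pre>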
* <p/>
* <u>Partitioning</u><br/>
* Partitions must be in the format [partition name]=[partition value]<br/>
* Only strings are supported in the partitioning.<br/>
* Partitions must follow the same naming convention for all sub directories in
* a table<br/>
* For example:<br/>
* The following is not valid:<br/>
*
* <pre>
* mytable/hour=00
* mytable/day=01/hour=00
* </pre>
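*
* whereas a layout where every sub directory follows the same convention is
* valid, for example (illustrative values):
*
* <pre>
* mytable/day=01/hour=00
* mytable/day=02/hour=00
* </pre>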
*
**/
public class HiveColumnarLoader extends FileInputLoadFunc implements
LoadMetadata, LoadPushDown {
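/**
* Name of the UDFContext property holding the comma separated column indexes
* set by projection push down.
*/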
public static final String PROJECTION_ID = HiveColumnarLoader.class
.getName() + ".projection";
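/**
* Name of the UDFContext property holding the optional date range passed to
* the backwards compatible constructors.
*/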
public static final String DATE_RANGE = HiveColumnarLoader.class.getName()
+ ".date-range";
/**
* Regex used to extract the column names from the table schema string
*/
protected static final Pattern pcols = Pattern.compile("[a-zA-Z_0-9]*[ ]");
protected static final Log LOG = LogFactory
.getLog(HiveColumnarLoader.class);
protected TupleFactory tupleFactory = TupleFactory.getInstance();
String signature = "";
// we need to save the dateRange from the constructor if provided to add to
// the UDFContext only when the signature is available.
String dateRange = null;
HiveRCRecordReader reader;
ColumnarSerDe serde = null;
Configuration conf = null;
ResourceSchema pigSchema;
boolean partitionKeysSet = false;
BytesRefArrayWritable buff = null;
private Properties props;
private HiveConf hiveConf;
transient int[] requiredColumns;
transient Set<String> partitionColumns;
/**
* Implements the logic for searching partition keys and applying partition
* filtering
*/
transient PathPartitionHelper pathPartitionerHelper = new PathPartitionHelper();
transient Path currentPath = null;
transient Map<String, String> currentPathPartitionKeyMap;
/**
* Table schema should be a space and comma separated string describing the
* Hive schema.<br/>
* For example uid BIGINT,pid long means one column uid of type BIGINT and
* one column pid of type LONG.<br/>
* The types are not case sensitive.
*
* @param table_schema
* This property cannot be null
*/
public HiveColumnarLoader(String table_schema) {
setup(table_schema);
}
/**
* This constructor is for backward compatibility.
*
* Table schema should be a space and comma separated string describing the
* Hive schema.<br/>
* For example uid BIGINT,pid long means one column uid of type BIGINT and
* one column pid of type LONG.<br/>
* The types are not case sensitive.
*
* @param table_schema
* This property cannot be null
* @param dateRange
* String
* @param columns
* String, no longer used
*/
public HiveColumnarLoader(String table_schema, String dateRange,
String columns) {
setup(table_schema);
this.dateRange = dateRange;
}
/**
* This constructor is for backward compatibility.
*
* Table schema should be a space and comma separated string describing the
* Hive schema.<br/>
* For example uid BIGINT,pid long means one column uid of type BIGINT and
* one column pid of type LONG.<br/>
* The types are not case sensitive.
*
* @param table_schema
* This property cannot be null
* @param dateRange
* String
*/
public HiveColumnarLoader(String table_schema, String dateRange) {
setup(table_schema);
this.dateRange = dateRange;
}
private Properties getUDFContext() {
return UDFContext.getUDFContext().getUDFProperties(this.getClass(),
new String[] { signature });
}
@Override
public InputFormat<LongWritable, BytesRefArrayWritable> getInputFormat()
throws IOException {
LOG.info("Signature: " + signature);
return new HiveRCInputFormat(signature);
}
@Override
public Tuple getNext() throws IOException {
Tuple tuple = null;
try {
if (reader.nextKeyValue()) {
BytesRefArrayWritable buff = reader.getCurrentValue();
ColumnarStruct struct = readColumnarStruct(buff);
tuple = readColumnarTuple(struct, reader.getSplitPath());
}
} catch (InterruptedException e) {
throw new IOException(e.toString(), e);
}
return tuple;
}
@Override
public void prepareToRead(
@SuppressWarnings("rawtypes") RecordReader reader, PigSplit split)
throws IOException {
this.reader = (HiveRCRecordReader) reader;
// check that the required indexes actually exist i.e. the columns that
// should be read.
// assuming this is always defined simplifies the readColumnarTuple
// logic.
int requiredIndexes[] = getRequiredColumns();
if (requiredIndexes == null) {
int fieldLen = pigSchema.getFields().length;
// the partition keys, if any, should already exist here
String[] partitionKeys = getPartitionKeys(null, null);
if (partitionKeys != null) {
fieldLen += partitionKeys.length;
}
requiredIndexes = new int[fieldLen];
for (int i = 0; i < fieldLen; i++) {
requiredIndexes[i] = i;
}
this.requiredColumns = requiredIndexes;
}
try {
serde = new ColumnarSerDe();
serde.initialize(hiveConf, props);
} catch (SerDeException e) {
LOG.error(e.toString(), e);
throw new IOException(e);
}
}
@Override
public void setLocation(String location, Job job) throws IOException {
FileInputFormat.setInputPaths(job, location);
}
/**
* Does the configuration setup and the schema parsing.
*
* @param table_schema
* String
*/
private void setup(String table_schema) {
if (table_schema == null)
throw new RuntimeException(
"The table schema must be defined as colname type, colname type. All types are hive types");
// create basic configuration for hdfs and hive
conf = new Configuration();
hiveConf = new HiveConf(conf, SessionState.class);
// parse the table_schema string
List<String> types = HiveRCSchemaUtil.parseSchemaTypes(table_schema);
List<String> cols = HiveRCSchemaUtil.parseSchema(pcols, table_schema);
List<FieldSchema> fieldSchemaList = new ArrayList<FieldSchema>(
cols.size());
for (int i = 0; i < cols.size(); i++) {
fieldSchemaList.add(new FieldSchema(cols.get(i), HiveRCSchemaUtil
.findPigDataType(types.get(i))));
}
pigSchema = new ResourceSchema(new Schema(fieldSchemaList));
props = new Properties();
// setting table schema properties for ColumnarSerDe
// these properties are never changed by the columns to read filter,
// because the columnar serde needs to know the
// complete format of each record.
props.setProperty(Constants.LIST_COLUMNS,
HiveRCSchemaUtil.listToString(cols));
props.setProperty(Constants.LIST_COLUMN_TYPES,
HiveRCSchemaUtil.listToString(types));
}
/**
* Uses the ColumnarSerde to deserialize the buff:BytesRefArrayWritable into
* a ColumnarStruct instance.
*
* @param buff
* BytesRefArrayWritable
* @return ColumnarStruct
*/
private ColumnarStruct readColumnarStruct(BytesRefArrayWritable buff) {
// use ColumnarSerDe to deserialize row
ColumnarStruct struct = null;
try {
struct = (ColumnarStruct) serde.deserialize(buff);
} catch (SerDeException e) {
LOG.error(e.toString(), e);
throw new RuntimeException(e.toString(), e);
}
return struct;
}
/**
* Only read the columns that were requested through projection push down.<br/>
*
* @param struct
* ColumnarStruct
* @param path
* Path
* @return Tuple
* @throws IOException
*/
private Tuple readColumnarTuple(ColumnarStruct struct, Path path)
throws IOException {
int[] columnIndexes = getRequiredColumns();
// the partition keys if any will already be in the UDFContext here.
String[] partitionKeys = getPartitionKeys(null, null);
// only if the path has changed should the partition key values be recalculated
if (currentPath == null || !currentPath.equals(path)) {
currentPathPartitionKeyMap = (partitionKeys == null) ? null
: pathPartitionerHelper.getPathPartitionKeyValues(path
.toString());
currentPath = path;
}
// if the partitionColumns is null this value will stop the for loop
// below from trying to add any partition columns
// that do not exist
int partitionColumnStartIndex = Integer.MAX_VALUE;
if (!(partitionColumns == null || partitionColumns.size() == 0)) {
// partition columns are always appended to the schema fields.
partitionColumnStartIndex = pigSchema.getFields().length;
}
// create tuple with the previously determined size
Tuple t = tupleFactory.newTuple(columnIndexes.length);
// read in all columns
for (int i = 0; i < columnIndexes.length; i++) {
int columnIndex = columnIndexes[i];
if (columnIndex < partitionColumnStartIndex) {
Object obj = struct.getField(columnIndex);
Object pigType = HiveRCSchemaUtil
.extractPigTypeFromHiveType(obj);
t.set(i, pigType);
} else {
// read the partition columns
// will only be executed if partitionColumns is not null
String key = partitionKeys[columnIndex
- partitionColumnStartIndex];
Object value = currentPathPartitionKeyMap.get(key);
t.set(i, value);
}
}
return t;
}
/**
* Will parse the required columns from the UDFContext properties if the
* requiredColumns[] variable is null, or else just return the
* requiredColumns.
*
* @return int[]
*/
private int[] getRequiredColumns() {
if (requiredColumns == null) {
Properties properties = getUDFContext();
String projectionStr = properties.getProperty(PROJECTION_ID);
if (projectionStr != null) {
String[] split = projectionStr.split(",");
int columnIndexes[] = new int[split.length];
int index = 0;
for (String splitItem : split) {
columnIndexes[index++] = Integer.parseInt(splitItem);
}
requiredColumns = columnIndexes;
}
}
return requiredColumns;
}
/**
* Reads the partition columns, either from the UDFContext properties or by
* scanning the input location using the PathPartitionHelper.
*
* @param location
* @param job
* @return Set of String containing the partition column names
*/
private Set<String> getPartitionColumns(String location, Job job) {
if (partitionColumns == null) {
// read the partition columns from the UDF Context first.
// if not in the UDF context then read it using the PathPartitioner.
Properties properties = getUDFContext();
if (properties == null)
properties = new Properties();
String partitionColumnStr = properties
.getProperty(PathPartitionHelper.PARTITION_COLUMNS);
if (partitionColumnStr == null
&& !(location == null || job == null)) {
// if it hasn't been written yet.
Set<String> partitionColumnSet;
try {
partitionColumnSet = pathPartitionerHelper
.getPartitionKeys(location, job.getConfiguration());
} catch (IOException e) {
RuntimeException rte = new RuntimeException(e);
rte.setStackTrace(e.getStackTrace());
throw rte;
}
if (partitionColumnSet != null) {
StringBuilder buff = new StringBuilder();
int i = 0;
for (String column : partitionColumnSet) {
if (i++ != 0) {
buff.append(',');
}
buff.append(column);
}
String buffStr = buff.toString().trim();
if (buffStr.length() > 0) {
properties.setProperty(
PathPartitionHelper.PARTITION_COLUMNS,
buff.toString());
}
partitionColumns = partitionColumnSet;
}
} else {
// the partition columns have been set already in the UDF Context
if (partitionColumnStr != null) {
String split[] = partitionColumnStr.split(",");
partitionColumns = new LinkedHashSet<String>();
if (split.length > 0) {
for (String splitItem : split) {
partitionColumns.add(splitItem);
}
}
}
}
}
return partitionColumns;
}
@Override
public String[] getPartitionKeys(String location, Job job)
throws IOException {
Set<String> partitionKeys = getPartitionColumns(location, job);
return partitionKeys == null ? null : partitionKeys
.toArray(new String[] {});
}
@Override
public ResourceSchema getSchema(String location, Job job)
throws IOException {
if (!partitionKeysSet) {
Set<String> keys = getPartitionColumns(location, job);
if (!(keys == null || keys.size() == 0)) {
// re-edit the pigSchema to contain the new partition keys.
ResourceFieldSchema[] fields = pigSchema.getFields();
LOG.debug("Schema: " + Arrays.toString(fields));
ResourceFieldSchema[] newFields = Arrays.copyOf(fields,
fields.length + keys.size());
int index = fields.length;
for (String key : keys) {
newFields[index++] = new ResourceFieldSchema(
new FieldSchema(key, DataType.CHARARRAY));
}
pigSchema.setFields(newFields);
LOG.debug("Added partition fields: " + keys
+ " to loader schema");
LOG.debug("Schema is: " + Arrays.toString(newFields));
}
partitionKeysSet = true;
}
return pigSchema;
}
@Override
public ResourceStatistics getStatistics(String location, Job job)
throws IOException {
return null;
}
@Override
public void setPartitionFilter(Expression partitionFilter)
throws IOException {
getUDFContext().setProperty(
PathPartitionHelper.PARITITION_FILTER_EXPRESSION,
partitionFilter.toString());
}
@Override
public List<OperatorSet> getFeatures() {
return Arrays.asList(LoadPushDown.OperatorSet.PROJECTION);
}
@Override
public RequiredFieldResponse pushProjection(
RequiredFieldList requiredFieldList) throws FrontendException {
// save the required field list to the UDFContext properties.
StringBuilder buff = new StringBuilder();
int i = 0;
for (RequiredField f : requiredFieldList.getFields()) {
if (i++ != 0)
buff.append(',');
buff.append(f.getIndex());
}
Properties properties = getUDFContext();
properties.setProperty(PROJECTION_ID, buff.toString());
return new RequiredFieldResponse(true);
}
@Override
public void setUDFContextSignature(String signature) {
super.setUDFContextSignature(signature);
LOG.debug("Signature: " + signature);
this.signature = signature;
// this provides backwards compatibility
// the HiveRCInputFormat will read this and if set will perform the
// needed partitionFiltering
if (dateRange != null) {
getUDFContext().setProperty(DATE_RANGE, dateRange);
}
}
}