src/java/org/apache/hcatalog/mapreduce/HCatOutputFormat.java - hcatalog - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.hcatalog.mapreduce;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.Index;
 import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.OutputCommitter;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
 import org.apache.hcatalog.common.ErrorType;
 import org.apache.hcatalog.common.HCatConstants;
 import org.apache.hcatalog.common.HCatException;
 import org.apache.hcatalog.common.HCatUtil;
 import org.apache.hcatalog.data.HCatRecord;
 import org.apache.hcatalog.data.schema.HCatSchema;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * The OutputFormat to use to write data to HCatalog. The key value is ignored and
  * should be given as null. The value is the HCatRecord to write.
  */
 public class HCatOutputFormat extends HCatBaseOutputFormat {

     private static final Logger LOG = LoggerFactory.getLogger(HCatOutputFormat.class);

     /**
      * Set the information about the output to write for the job. This queries the
      * metastore for the target table, rejects table layouts that cannot be written
      * (automatic indexes, compressed, bucketed or sorted storage), resolves the static
      * and dynamic partition keys, fills in {@code outputJobInfo}, configures the
      * storage handler, and sets the umask in the job configuration from the table
      * directory's permissions.
      *
      * <p>NOTE(review): earlier documentation said this throws an error if the partition
      * is already published. No such check is made directly in this method; presumably
      * it happens while the storage handler or security is being configured - confirm.
      *
      * @param job the job object
      * @param outputJobInfo the table output information for the job
      * @throws IOException an {@link HCatException}; failures that are not already an
      *         HCatException are wrapped with {@code ErrorType.ERROR_SET_OUTPUT}
      */
     @SuppressWarnings("unchecked")
     public static void setOutput(Job job, OutputJobInfo outputJobInfo) throws IOException {
         HiveMetaStoreClient client = null;

         try {
             Configuration conf = job.getConfiguration();
             HiveConf hiveConf = HCatUtil.getHiveConf(conf);
             client = HCatUtil.getHiveClient(hiveConf);

             String databaseName = outputJobInfo.getDatabaseName();
             String tableName = outputJobInfo.getTableName();
             Table table = HCatUtil.getTable(client, databaseName, tableName);

             // Every index on the table must be a deferred-rebuild index; any other
             // index is reported as unsupported.
             List<String> indexList = client.listIndexNames(databaseName, tableName, Short.MAX_VALUE);
             for (String indexName : indexList) {
                 Index index = client.getIndex(databaseName, tableName, indexName);
                 if (!index.isDeferredRebuild()) {
                     throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a table with an automatic index from Pig/Mapreduce is not supported");
                 }
             }

             StorageDescriptor sd = table.getTTable().getSd();
             checkStorageSupported(sd);
             resolvePartitionValues(table, outputJobInfo, conf);

             HCatSchema tableSchema = HCatUtil.extractSchema(table);
             StorerInfo storerInfo = InternalUtil.extractStorerInfo(sd, table.getParameters());
             HCatStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, storerInfo);

             // Populate the output info before handing it to the storage handler
             // configuration step below.
             outputJobInfo.setTableInfo(HCatTableInfo.valueOf(table.getTTable()));
             outputJobInfo.setOutputSchema(tableSchema);
             // Kept in locals rather than static fields so that concurrent setOutput
             // calls for different jobs cannot observe each other's values.
             boolean harRequested = getHarRequested(hiveConf);
             outputJobInfo.setHarRequested(harRequested);
             outputJobInfo.setMaximumDynamicPartitions(getMaxDynamicPartitions(hiveConf));

             HCatUtil.configureOutputStorageHandler(storageHandler, job, outputJobInfo);

             Path tblPath = new Path(sd.getLocation());

             /*  Set the umask in conf such that files/dirs get created with table-dir
              * permissions. Following three assumptions are made:
              * 1. Actual files/dirs creation is done by RecordWriter of underlying
              * output format. It is assumed that they use default permissions while creation.
              * 2. Default Permissions = FsPermission.getDefault() = 777.
              * 3. UMask is honored by underlying filesystem.
              */
             FsPermission.setUMask(conf, FsPermission.getDefault().applyUMask(
                 tblPath.getFileSystem(conf).getFileStatus(tblPath).getPermission()));

             if (Security.getInstance().isSecurityEnabled()) {
                 Security.getInstance().handleSecurity(job, outputJobInfo, client, conf, harRequested);
             }
         } catch (HCatException e) {
             // Already carries a specific error type; pass it through untouched.
             throw e;
         } catch (Exception e) {
             throw new HCatException(ErrorType.ERROR_SET_OUTPUT, e);
         } finally {
             HCatUtil.closeHiveClientQuietly(client);
         }
     }

     /**
      * Rejects storage layouts that cannot be written from Pig/Mapreduce: compressed,
      * bucketed, or sorted storage descriptors.
      *
      * @param sd the storage descriptor of the target table
      * @throws HCatException with {@code ErrorType.ERROR_NOT_SUPPORTED} if the layout is unsupported
      */
     private static void checkStorageSupported(StorageDescriptor sd) throws HCatException {
         if (sd.isCompressed()) {
             throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a compressed partition from Pig/Mapreduce is not supported");
         }

         if (sd.getBucketCols() != null && !sd.getBucketCols().isEmpty()) {
             throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with bucket definition from Pig/Mapreduce is not supported");
         }

         if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) {
             throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with sorted column definition from Pig/Mapreduce is not supported");
         }
     }

     /**
      * Normalizes the user-supplied partition values and stores them on
      * {@code outputJobInfo}. When fewer values than partition keys were supplied, the
      * missing keys are recorded as dynamic partitioning keys and a dynamic-partition
      * job id is ensured in {@code conf}.
      *
      * @param table the target table
      * @param outputJobInfo the output information to update
      * @param conf the job configuration that receives the dynamic-partition job id
      * @throws HCatException with {@code ErrorType.ERROR_INVALID_PARTITION_VALUES} if values
      *         are given for a non-partitioned table or name keys the table does not have
      */
     private static void resolvePartitionValues(Table table, OutputJobInfo outputJobInfo,
                                                Configuration conf) throws HCatException {
         int partitionKeyCount = table.getTTable().getPartitionKeysSize();
         Map<String, String> requestedValues = outputJobInfo.getPartitionValues();

         if (partitionKeyCount == 0) {
             if (requestedValues != null && !requestedValues.isEmpty()) {
                 // attempt made to save partition values in non-partitioned table - throw error.
                 throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
                     "Partition values specified for non-partitioned table");
             }
             // non-partitioned table
             outputJobInfo.setPartitionValues(new HashMap<String, String>());
             return;
         }

         // partitioned table, we expect partition values
         // convert user specified map to have lower case key names
         Map<String, String> valueMap = new HashMap<String, String>();
         if (requestedValues != null) {
             for (Map.Entry<String, String> entry : requestedValues.entrySet()) {
                 valueMap.put(entry.getKey().toLowerCase(), entry.getValue());
             }
         }

         if (requestedValues == null || requestedValues.size() < partitionKeyCount) {
             // dynamic partition usecase - partition values were null, or not all were specified
             // need to figure out which keys are not specified.
             List<String> dynamicPartitioningKeys = new ArrayList<String>();
             for (FieldSchema fs : table.getPartitionKeys()) {
                 String keyName = fs.getName().toLowerCase();
                 if (!valueMap.containsKey(keyName)) {
                     dynamicPartitioningKeys.add(keyName);
                 }
             }

             if (valueMap.size() + dynamicPartitioningKeys.size() != partitionKeyCount) {
                 // If this isn't equal, then bogus key values have been inserted, error out.
                 throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Invalid partition keys specified");
             }

             outputJobInfo.setDynamicPartitioningKeys(dynamicPartitioningKeys);

             // Reuse an id already present in the configuration; otherwise generate one.
             String dynHash = conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID);
             if (dynHash == null) {
                 dynHash = String.valueOf(Math.random());
                 LOG.debug("New dynHash : [{}]", dynHash);
             }
             conf.set(HCatConstants.HCAT_DYNAMIC_PTN_JOBID, dynHash);
         }

         outputJobInfo.setPartitionValues(valueMap);
     }

     /**
      * Set the schema for the data being written out to the partition. If this is not
      * called, the output schema remains the table schema that
      * {@link #setOutput(Job, OutputJobInfo)} stored on the job info.
      *
      * @param job the job object
      * @param schema the schema for the data
      * @throws IOException if the job info cannot be read, updated or serialized
      */
     public static void setSchema(final Job job, final HCatSchema schema) throws IOException {
         OutputJobInfo jobInfo = getJobInfo(job);
         Map<String, String> partMap = jobInfo.getPartitionValues();
         setPartDetails(jobInfo, schema, partMap);
         // Write the updated job info back into the job configuration.
         job.getConfiguration().set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(jobInfo));
     }

     /**
      * Get the record writer for the job. Delegates to the output format returned by
      * {@code getOutputFormat(context)} (presumably the StorageHandler's default
      * OutputFormat - confirm against the base class).
      *
      * @param context the information about the current task
      * @return a RecordWriter to write the output for the job
      * @throws IOException
      * @throws InterruptedException
      */
     @Override
     public RecordWriter<WritableComparable<?>, HCatRecord>
     getRecordWriter(TaskAttemptContext context)
         throws IOException, InterruptedException {
         return getOutputFormat(context).getRecordWriter(context);
     }

     /**
      * Get the output committer for this output format. Delegates to the output format
      * returned by {@code getOutputFormat(context)}.
      *
      * @param context the task context
      * @return an output committer
      * @throws IOException
      * @throws InterruptedException
      */
     @Override
     public OutputCommitter getOutputCommitter(TaskAttemptContext context
     ) throws IOException, InterruptedException {
         return getOutputFormat(context).getOutputCommitter(context);
     }

     /**
      * Returns the maximum number of dynamic partitions to enforce, or -1 when the
      * bounds check is disabled via
      * {@code HCatConstants.HCAT_IS_DYNAMIC_MAX_PTN_CHECK_ENABLED}.
      *
      * @param hConf the Hive configuration to read the limit from
      * @return the configured limit, or -1 if the check is disabled
      */
     private static int getMaxDynamicPartitions(HiveConf hConf) {
         // by default the bounds checking for maximum number of
         // dynamic partitions is disabled (-1)
         return HCatConstants.HCAT_IS_DYNAMIC_MAX_PTN_CHECK_ENABLED
             ? hConf.getIntVar(HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS)
             : -1;
     }

     /**
      * Reads the {@code HiveConf.ConfVars.HIVEARCHIVEENABLED} flag.
      *
      * @param hConf the Hive configuration to read from
      * @return the value of the flag
      */
     private static boolean getHarRequested(HiveConf hConf) {
         return hConf.getBoolVar(HiveConf.ConfVars.HIVEARCHIVEENABLED);
     }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.hcatalog.mapreduce;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.fs.permission.FsPermission;
	import org.apache.hadoop.hive.conf.HiveConf;
	import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
	import org.apache.hadoop.hive.metastore.api.FieldSchema;
	import org.apache.hadoop.hive.metastore.api.Index;
	import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
	import org.apache.hadoop.hive.ql.metadata.Table;
	import org.apache.hadoop.io.WritableComparable;
	import org.apache.hadoop.mapreduce.Job;
	import org.apache.hadoop.mapreduce.OutputCommitter;
	import org.apache.hadoop.mapreduce.RecordWriter;
	import org.apache.hadoop.mapreduce.TaskAttemptContext;
	import org.apache.hcatalog.common.ErrorType;
	import org.apache.hcatalog.common.HCatConstants;
	import org.apache.hcatalog.common.HCatException;
	import org.apache.hcatalog.common.HCatUtil;
	import org.apache.hcatalog.data.HCatRecord;
	import org.apache.hcatalog.data.schema.HCatSchema;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/** The OutputFormat to use to write data to HCatalog. The key value is ignored and
	* should be given as null. The value is the HCatRecord to write.*/
	public class HCatOutputFormat extends HCatBaseOutputFormat {

	static final private Logger LOG = LoggerFactory.getLogger(HCatOutputFormat.class);

	private static int maxDynamicPartitions;
	private static boolean harRequested;

	/**
	* Set the information about the output to write for the job. This queries the metadata server
	* to find the StorageHandler to use for the table. It throws an error if the
	* partition is already published.
	* @param job the job object
	* @param outputJobInfo the table output information for the job
	* @throws IOException the exception in communicating with the metadata server
	*/
	@SuppressWarnings("unchecked")
	public static void setOutput(Job job, OutputJobInfo outputJobInfo) throws IOException {
	HiveMetaStoreClient client = null;

	try {

	Configuration conf = job.getConfiguration();
	HiveConf hiveConf = HCatUtil.getHiveConf(conf);
	client = HCatUtil.getHiveClient(hiveConf);
	Table table = HCatUtil.getTable(client, outputJobInfo.getDatabaseName(),
	outputJobInfo.getTableName());

	List<String> indexList = client.listIndexNames(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), Short.MAX_VALUE);

	for (String indexName : indexList) {
	Index index = client.getIndex(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), indexName);
	if (!index.isDeferredRebuild()) {
	throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a table with an automatic index from Pig/Mapreduce is not supported");
	}
	}
	StorageDescriptor sd = table.getTTable().getSd();

	if (sd.isCompressed()) {
	throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a compressed partition from Pig/Mapreduce is not supported");
	}

	if (sd.getBucketCols() != null && !sd.getBucketCols().isEmpty()) {
	throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with bucket definition from Pig/Mapreduce is not supported");
	}

	if (sd.getSortCols() != null && !sd.getSortCols().isEmpty()) {
	throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Store into a partition with sorted column definition from Pig/Mapreduce is not supported");
	}

	if (table.getTTable().getPartitionKeysSize() == 0) {
	if ((outputJobInfo.getPartitionValues() != null) && (!outputJobInfo.getPartitionValues().isEmpty())) {
	// attempt made to save partition values in non-partitioned table - throw error.
	throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,
	"Partition values specified for non-partitioned table");
	}
	// non-partitioned table
	outputJobInfo.setPartitionValues(new HashMap<String, String>());

	} else {
	// partitioned table, we expect partition values
	// convert user specified map to have lower case key names
	Map<String, String> valueMap = new HashMap<String, String>();
	if (outputJobInfo.getPartitionValues() != null) {
	for (Map.Entry<String, String> entry : outputJobInfo.getPartitionValues().entrySet()) {
	valueMap.put(entry.getKey().toLowerCase(), entry.getValue());
	}
	}

	if ((outputJobInfo.getPartitionValues() == null)
	\|\| (outputJobInfo.getPartitionValues().size() < table.getTTable().getPartitionKeysSize())) {
	// dynamic partition usecase - partition values were null, or not all were specified
	// need to figure out which keys are not specified.
	List<String> dynamicPartitioningKeys = new ArrayList<String>();
	boolean firstItem = true;
	for (FieldSchema fs : table.getPartitionKeys()) {
	if (!valueMap.containsKey(fs.getName().toLowerCase())) {
	dynamicPartitioningKeys.add(fs.getName().toLowerCase());
	}
	}

	if (valueMap.size() + dynamicPartitioningKeys.size() != table.getTTable().getPartitionKeysSize()) {
	// If this isn't equal, then bogus key values have been inserted, error out.
	throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Invalid partition keys specified");
	}

	outputJobInfo.setDynamicPartitioningKeys(dynamicPartitioningKeys);
	String dynHash;
	if ((dynHash = conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID)) == null) {
	dynHash = String.valueOf(Math.random());
	// LOG.info("New dynHash : ["+dynHash+"]");
	// }else{
	// LOG.info("Old dynHash : ["+dynHash+"]");
	}
	conf.set(HCatConstants.HCAT_DYNAMIC_PTN_JOBID, dynHash);

	}

	outputJobInfo.setPartitionValues(valueMap);
	}

	HCatSchema tableSchema = HCatUtil.extractSchema(table);
	StorerInfo storerInfo =
	InternalUtil.extractStorerInfo(table.getTTable().getSd(), table.getParameters());

	List<String> partitionCols = new ArrayList<String>();
	for (FieldSchema schema : table.getPartitionKeys()) {
	partitionCols.add(schema.getName());
	}

	HCatStorageHandler storageHandler = HCatUtil.getStorageHandler(job.getConfiguration(), storerInfo);

	//Serialize the output info into the configuration
	outputJobInfo.setTableInfo(HCatTableInfo.valueOf(table.getTTable()));
	outputJobInfo.setOutputSchema(tableSchema);
	harRequested = getHarRequested(hiveConf);
	outputJobInfo.setHarRequested(harRequested);
	maxDynamicPartitions = getMaxDynamicPartitions(hiveConf);
	outputJobInfo.setMaximumDynamicPartitions(maxDynamicPartitions);

	HCatUtil.configureOutputStorageHandler(storageHandler, job, outputJobInfo);

	Path tblPath = new Path(table.getTTable().getSd().getLocation());

	/* Set the umask in conf such that files/dirs get created with table-dir
	* permissions. Following three assumptions are made:
	* 1. Actual files/dirs creation is done by RecordWriter of underlying
	* output format. It is assumed that they use default permissions while creation.
	* 2. Default Permissions = FsPermission.getDefault() = 777.
	* 3. UMask is honored by underlying filesystem.
	*/

	FsPermission.setUMask(conf, FsPermission.getDefault().applyUMask(
	tblPath.getFileSystem(conf).getFileStatus(tblPath).getPermission()));

	if (Security.getInstance().isSecurityEnabled()) {
	Security.getInstance().handleSecurity(job, outputJobInfo, client, conf, harRequested);
	}
	} catch (Exception e) {
	if (e instanceof HCatException) {
	throw (HCatException) e;
	} else {
	throw new HCatException(ErrorType.ERROR_SET_OUTPUT, e);
	}
	} finally {
	HCatUtil.closeHiveClientQuietly(client);
	}
	}

	/**
	* Set the schema for the data being written out to the partition. The
	* table schema is used by default for the partition if this is not called.
	* @param job the job object
	* @param schema the schema for the data
	* @throws IOException
	*/
	public static void setSchema(final Job job, final HCatSchema schema) throws IOException {

	OutputJobInfo jobInfo = getJobInfo(job);
	Map<String, String> partMap = jobInfo.getPartitionValues();
	setPartDetails(jobInfo, schema, partMap);
	job.getConfiguration().set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(jobInfo));
	}

	/**
	* Get the record writer for the job. This uses the StorageHandler's default
	* OutputFormat to get the record writer.
	* @param context the information about the current task
	* @return a RecordWriter to write the output for the job
	* @throws IOException
	* @throws InterruptedException
	*/
	@Override
	public RecordWriter<WritableComparable<?>, HCatRecord>
	getRecordWriter(TaskAttemptContext context)
	throws IOException, InterruptedException {
	return getOutputFormat(context).getRecordWriter(context);
	}


	/**
	* Get the output committer for this output format. This is responsible
	* for ensuring the output is committed correctly.
	* @param context the task context
	* @return an output committer
	* @throws IOException
	* @throws InterruptedException
	*/
	@Override
	public OutputCommitter getOutputCommitter(TaskAttemptContext context
	) throws IOException, InterruptedException {
	return getOutputFormat(context).getOutputCommitter(context);
	}

	private static int getMaxDynamicPartitions(HiveConf hConf) {
	// by default the bounds checking for maximum number of
	// dynamic partitions is disabled (-1)
	int maxDynamicPartitions = -1;

	if (HCatConstants.HCAT_IS_DYNAMIC_MAX_PTN_CHECK_ENABLED) {
	maxDynamicPartitions = hConf.getIntVar(
	HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS);
	}

	return maxDynamicPartitions;
	}

	private static boolean getHarRequested(HiveConf hConf) {
	return hConf.getBoolVar(HiveConf.ConfVars.HIVEARCHIVEENABLED);
	}

	}