| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.hcatalog.mapreduce; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.fs.permission.FsPermission; |
| import org.apache.hadoop.hive.conf.HiveConf; |
| import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; |
| import org.apache.hadoop.hive.metastore.api.FieldSchema; |
| import org.apache.hadoop.hive.metastore.api.Index; |
| import org.apache.hadoop.hive.metastore.api.StorageDescriptor; |
| import org.apache.hadoop.hive.ql.metadata.Table; |
| import org.apache.hadoop.io.WritableComparable; |
| import org.apache.hadoop.mapreduce.Job; |
| import org.apache.hadoop.mapreduce.OutputCommitter; |
| import org.apache.hadoop.mapreduce.RecordWriter; |
| import org.apache.hadoop.mapreduce.TaskAttemptContext; |
| import org.apache.hcatalog.common.ErrorType; |
| import org.apache.hcatalog.common.HCatConstants; |
| import org.apache.hcatalog.common.HCatException; |
| import org.apache.hcatalog.common.HCatUtil; |
| import org.apache.hcatalog.data.HCatRecord; |
| import org.apache.hcatalog.data.schema.HCatSchema; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
/**
 * The OutputFormat to use to write data to HCatalog. The key is ignored and
 * may be passed as null; the value is the {@link HCatRecord} to write.
 */
| public class HCatOutputFormat extends HCatBaseOutputFormat { |
| |
private static final Logger LOG = LoggerFactory.getLogger(HCatOutputFormat.class);
| |
| private static int maxDynamicPartitions; |
| private static boolean harRequested; |
| |
| /** |
| * Set the information about the output to write for the job. This queries the metadata server |
| * to find the StorageHandler to use for the table. It throws an error if the |
| * partition is already published. |
| * @param job the job object |
| * @param outputJobInfo the table output information for the job |
| * @throws IOException the exception in communicating with the metadata server |
| */ |
| @SuppressWarnings("unchecked") |
| public static void setOutput(Job job, OutputJobInfo outputJobInfo) throws IOException { |
| HiveMetaStoreClient client = null; |
| |
| try { |
| |
| Configuration conf = job.getConfiguration(); |
| HiveConf hiveConf = HCatUtil.getHiveConf(conf); |
| client = HCatUtil.getHiveClient(hiveConf); |
| Table table = HCatUtil.getTable(client, outputJobInfo.getDatabaseName(), |
| outputJobInfo.getTableName()); |
| |
| List<String> indexList = client.listIndexNames(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), Short.MAX_VALUE); |
| |
| for (String indexName : indexList) { |
| Index index = client.getIndex(outputJobInfo.getDatabaseName(), outputJobInfo.getTableName(), indexName); |
| if (!index.isDeferredRebuild()) { |
throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Storing into a table with an automatic index from Pig/MapReduce is not supported");
| } |
| } |
| StorageDescriptor sd = table.getTTable().getSd(); |
| |
| if (sd.isCompressed()) { |
throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Storing into a compressed partition from Pig/MapReduce is not supported");
| } |
| |
| if (sd.getBucketCols()!=null && !sd.getBucketCols().isEmpty()) { |
throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Storing into a partition with a bucket definition from Pig/MapReduce is not supported");
| } |
| |
| if (sd.getSortCols()!=null && !sd.getSortCols().isEmpty()) { |
throw new HCatException(ErrorType.ERROR_NOT_SUPPORTED, "Storing into a partition with a sorted column definition from Pig/MapReduce is not supported");
| } |
| |
| if (table.getTTable().getPartitionKeysSize() == 0 ){ |
| if ((outputJobInfo.getPartitionValues() != null) && (!outputJobInfo.getPartitionValues().isEmpty())){ |
| // attempt made to save partition values in non-partitioned table - throw error. |
| throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, |
| "Partition values specified for non-partitioned table"); |
| } |
| // non-partitioned table |
| outputJobInfo.setPartitionValues(new HashMap<String, String>()); |
| |
| } else { |
| // partitioned table, we expect partition values |
| // convert user specified map to have lower case key names |
| Map<String, String> valueMap = new HashMap<String, String>(); |
| if (outputJobInfo.getPartitionValues() != null){ |
| for(Map.Entry<String, String> entry : outputJobInfo.getPartitionValues().entrySet()) { |
| valueMap.put(entry.getKey().toLowerCase(), entry.getValue()); |
| } |
| } |
| |
| if ((outputJobInfo.getPartitionValues() == null) |
| || (outputJobInfo.getPartitionValues().size() < table.getTTable().getPartitionKeysSize())){ |
// dynamic partitioning use case - partition values were null, or not all were specified,
// so figure out which keys are not specified.
| List<String> dynamicPartitioningKeys = new ArrayList<String>(); |
| for (FieldSchema fs : table.getPartitionKeys()){ |
| if (!valueMap.containsKey(fs.getName().toLowerCase())){ |
| dynamicPartitioningKeys.add(fs.getName().toLowerCase()); |
| } |
| } |
| |
| if (valueMap.size() + dynamicPartitioningKeys.size() != table.getTTable().getPartitionKeysSize()){ |
| // If this isn't equal, then bogus key values have been inserted, error out. |
| throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES,"Invalid partition keys specified"); |
| } |
| |
| outputJobInfo.setDynamicPartitioningKeys(dynamicPartitioningKeys); |
// Tag the job with an identifier under which dynamic-partition output is grouped,
// generating a new one if none has been set yet.
String dynHash;
if ((dynHash = conf.get(HCatConstants.HCAT_DYNAMIC_PTN_JOBID)) == null) {
dynHash = String.valueOf(Math.random());
}
conf.set(HCatConstants.HCAT_DYNAMIC_PTN_JOBID, dynHash);
| |
| } |
| |
| outputJobInfo.setPartitionValues(valueMap); |
| } |
| |
| HCatSchema tableSchema = HCatUtil.extractSchema(table); |
| StorerInfo storerInfo = |
| InternalUtil.extractStorerInfo(table.getTTable().getSd(), table.getParameters()); |
| |
| List<String> partitionCols = new ArrayList<String>(); |
| for(FieldSchema schema : table.getPartitionKeys()) { |
| partitionCols.add(schema.getName()); |
| } |
| |
| HCatStorageHandler storageHandler = HCatUtil.getStorageHandler(job.getConfiguration(), storerInfo); |
| |
| //Serialize the output info into the configuration |
| outputJobInfo.setTableInfo(HCatTableInfo.valueOf(table.getTTable())); |
| outputJobInfo.setOutputSchema(tableSchema); |
| harRequested = getHarRequested(hiveConf); |
| outputJobInfo.setHarRequested(harRequested); |
| maxDynamicPartitions = getMaxDynamicPartitions(hiveConf); |
| outputJobInfo.setMaximumDynamicPartitions(maxDynamicPartitions); |
| |
| HCatUtil.configureOutputStorageHandler(storageHandler,job,outputJobInfo); |
| |
| Path tblPath = new Path(table.getTTable().getSd().getLocation()); |
| |
/* Set the umask in conf so that files/dirs get created with the table-directory
 * permissions. The following three assumptions are made:
 * 1. Actual file/dir creation is done by the RecordWriter of the underlying
 *    output format, which is assumed to use the default permissions at creation time.
 * 2. Default permissions = FsPermission.getDefault() = 777.
 * 3. The umask is honored by the underlying filesystem.
 */
| |
| FsPermission.setUMask(conf, FsPermission.getDefault().applyUMask( |
| tblPath.getFileSystem(conf).getFileStatus(tblPath).getPermission())); |
| |
| if(Security.getInstance().isSecurityEnabled()) { |
| Security.getInstance().handleSecurity(job, outputJobInfo, client, conf, harRequested); |
| } |
| } catch(Exception e) { |
| if( e instanceof HCatException ) { |
| throw (HCatException) e; |
| } else { |
| throw new HCatException(ErrorType.ERROR_SET_OUTPUT, e); |
| } |
| } finally { |
| HCatUtil.closeHiveClientQuietly(client); |
| } |
| } |
| |
| /** |
| * Set the schema for the data being written out to the partition. The |
| * table schema is used by default for the partition if this is not called. |
| * @param job the job object |
| * @param schema the schema for the data |
| * @throws IOException |
| */ |
| public static void setSchema(final Job job, final HCatSchema schema) throws IOException { |
| |
| OutputJobInfo jobInfo = getJobInfo(job); |
| Map<String,String> partMap = jobInfo.getPartitionValues(); |
| setPartDetails(jobInfo, schema, partMap); |
| job.getConfiguration().set(HCatConstants.HCAT_KEY_OUTPUT_INFO, HCatUtil.serialize(jobInfo)); |
| } |
| |
| /** |
| * Get the record writer for the job. This uses the StorageHandler's default |
| * OutputFormat to get the record writer. |
| * @param context the information about the current task |
| * @return a RecordWriter to write the output for the job |
| * @throws IOException |
| * @throws InterruptedException |
| */ |
| @Override |
| public RecordWriter<WritableComparable<?>, HCatRecord> |
| getRecordWriter(TaskAttemptContext context) |
| throws IOException, InterruptedException { |
| return getOutputFormat(context).getRecordWriter(context); |
| } |
| |
| |
| /** |
| * Get the output committer for this output format. This is responsible |
| * for ensuring the output is committed correctly. |
| * @param context the task context |
| * @return an output committer |
| * @throws IOException |
| * @throws InterruptedException |
| */ |
| @Override |
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
| return getOutputFormat(context).getOutputCommitter(context); |
| } |
| |
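/**
 * Returns the maximum number of dynamic partitions the job may create, read from
 * HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS, or -1 (no bound) when the bounds
 * check is disabled.
 */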
| private static int getMaxDynamicPartitions(HiveConf hConf) { |
| // by default the bounds checking for maximum number of |
| // dynamic partitions is disabled (-1) |
| int maxDynamicPartitions = -1; |
| |
| if (HCatConstants.HCAT_IS_DYNAMIC_MAX_PTN_CHECK_ENABLED){ |
| maxDynamicPartitions = hConf.getIntVar( |
| HiveConf.ConfVars.DYNAMICPARTITIONMAXPARTS); |
| } |
| |
| return maxDynamicPartitions; |
| } |
| |
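/**
 * Returns whether Hadoop archiving (HAR) of partition directories has been
 * requested, as given by HiveConf.ConfVars.HIVEARCHIVEENABLED.
 */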
| private static boolean getHarRequested(HiveConf hConf) { |
| return hConf.getBoolVar(HiveConf.ConfVars.HIVEARCHIVEENABLED); |
| } |
| |
| } |