/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.metadata;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.common.type.SnapshotContext;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Table;
import org.apache.hadoop.hive.ql.Context.Operation;
import org.apache.hadoop.hive.ql.ddl.table.AbstractAlterTableDesc;
import org.apache.hadoop.hive.ql.ddl.table.AlterTableType;
import org.apache.hadoop.hive.ql.ddl.table.create.like.CreateTableLikeDesc;
import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.io.StorageFormatDescriptor;
import org.apache.hadoop.hive.ql.parse.AlterTableBranchSpec;
import org.apache.hadoop.hive.ql.parse.AlterTableExecuteSpec;
import org.apache.hadoop.hive.ql.parse.TransformSpec;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider;
import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils;
import org.apache.hadoop.hive.ql.stats.Partish;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
* HiveStorageHandler defines a pluggable interface for adding
* new storage handlers to Hive. A storage handler consists of
* a bundle of the following:
*
 * <ul>
 * <li>input format
 * <li>output format
 * <li>serde
 * <li>metadata hooks for keeping an external catalog in sync
 * with Hive's metastore
 * <li>rules for setting up the configuration properties on
 * map/reduce jobs which access tables stored by this handler
 * </ul>
*
* Storage handler classes are plugged in using the STORED BY 'classname'
* clause in CREATE TABLE.
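 *
 * <p>For example, a table backed by a custom handler might be created as follows
 * (a sketch; {@code com.example.MyStorageHandler} is a hypothetical implementation
 * of this interface):
 * <pre>
 * CREATE TABLE custom_table (key STRING, value STRING)
 * STORED BY 'com.example.MyStorageHandler';
 * </pre>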
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface HiveStorageHandler extends Configurable {
List<AlterTableType> DEFAULT_ALLOWED_ALTER_OPS = ImmutableList.of(
AlterTableType.ADDPROPS, AlterTableType.DROPPROPS, AlterTableType.ADDCOLS);
/**
* @return Class providing an implementation of {@link InputFormat}
*/
public Class<? extends InputFormat> getInputFormatClass();
/**
* @return Class providing an implementation of {@link OutputFormat}
*/
public Class<? extends OutputFormat> getOutputFormatClass();
/**
* @return Class providing an implementation of {@link AbstractSerDe}
*/
public Class<? extends AbstractSerDe> getSerDeClass();
/**
* @return metadata hook implementation, or null if this
* storage handler does not need any metadata notifications
*/
public HiveMetaHook getMetaHook();
/**
 * Returns the implementation-specific authorization provider.
*
* @return authorization provider
* @throws HiveException
*/
public HiveAuthorizationProvider getAuthorizationProvider()
throws HiveException;
/**
 * This method is called to give the StorageHandler a chance
 * to populate the JobContext.getConfiguration() with properties that
 * may be needed by the handler's bundled artifacts (i.e. InputFormat, SerDe, etc.).
 * Key/value pairs passed into jobProperties are guaranteed to be set in the job's
 * configuration object. Users can retrieve "context" information from tableDesc.
 * Users should avoid mutating tableDesc and only make changes in jobProperties.
 * This method is expected to be idempotent: a call with the
 * same tableDesc values should produce the same key-value pairs in jobProperties.
 * Any external state set by this method should remain the same if this method is
 * called again. It is up to the user to determine how best to guarantee this invariant.
 *
 * This method in particular creates the configuration for input.
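 *
 * <p>A minimal sketch of an override (the property names are hypothetical, not
 * part of any real handler):
 * <pre>{@code
 * public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
 *   // Copy a handler-specific table property into the job configuration so that
 *   // the bundled InputFormat can read it at runtime.
 *   String location = tableDesc.getProperties().getProperty("example.location");
 *   if (location != null) {
 *     jobProperties.put("example.input.location", location);
 *   }
 * }
 * }</pre>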
* @param tableDesc descriptor for the table being accessed
* @param jobProperties receives properties copied or transformed
* from the table properties
*/
public abstract void configureInputJobProperties(TableDesc tableDesc,
Map<String, String> jobProperties);
/**
 * This method is called to give the StorageHandler a chance to
 * populate secret keys into the job's credentials.
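 * @param tableDesc descriptor for the table being accessed
 * @param secrets receives the secret key/value pairs to be placed into the job's credentials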
*/
public abstract void configureInputJobCredentials(TableDesc tableDesc, Map<String, String> secrets);
/**
 * This method is called to give the StorageHandler a chance
 * to populate the JobContext.getConfiguration() with properties that
 * may be needed by the handler's bundled artifacts (i.e. InputFormat, SerDe, etc.).
 * Key/value pairs passed into jobProperties are guaranteed to be set in the job's
 * configuration object. Users can retrieve "context" information from tableDesc.
 * Users should avoid mutating tableDesc and only make changes in jobProperties.
 * This method is expected to be idempotent: a call with the
 * same tableDesc values should produce the same key-value pairs in jobProperties.
 * Any external state set by this method should remain the same if this method is
 * called again. It is up to the user to determine how best to guarantee this invariant.
 *
 * This method in particular creates the configuration for output.
* @param tableDesc descriptor for the table being accessed
* @param jobProperties receives properties copied or transformed
* from the table properties
*/
public abstract void configureOutputJobProperties(TableDesc tableDesc,
Map<String, String> jobProperties);
/**
 * Deprecated; use the configureInputJobProperties/configureOutputJobProperties
 * methods instead.
*
* Configures properties for a job based on the definition of the
* source or target table it accesses.
*
* @param tableDesc descriptor for the table being accessed
*
* @param jobProperties receives properties copied or transformed
* from the table properties
*/
@Deprecated
public void configureTableJobProperties(
TableDesc tableDesc,
Map<String, String> jobProperties);
/**
 * Called just before submitting a MapReduce job.
*
* @param tableDesc descriptor for the table being accessed
* @param jobConf jobConf for MapReduce job
*/
public void configureJobConf(TableDesc tableDesc, JobConf jobConf);
/**
 * Used to fetch runtime information about the storage handler during a DESCRIBE EXTENDED statement.
 *
 * @param table table definition
 * @return StorageHandlerInfo containing runtime information about the storage handler,
 * or {@code null} if the storage handler chooses not to provide any runtime information
 */
public default StorageHandlerInfo getStorageHandlerInfo(Table table) throws MetaException {
  return null;
}
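/**
 * Returns the lock type to be acquired for the given write entity. Defaults to
 * {@link LockType#EXCLUSIVE}; handlers that can isolate concurrent writes themselves
 * may return a less restrictive lock type.
 * @param writeEntity the entity being written
 * @return the lock type to acquire
 */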
default LockType getLockType(WriteEntity writeEntity) {
return LockType.EXCLUSIVE;
}
/**
 * Tests whether the storage handler allows push-down of a join filter predicate to further prune the splits.
*
* @param table The table to filter.
* @param syntheticFilterPredicate Join filter predicate.
 * @return true if the handler supports dynamic split pruning for the given predicate.
*/
default boolean addDynamicSplitPruningEdge(org.apache.hadoop.hive.ql.metadata.Table table,
ExprNodeDesc syntheticFilterPredicate) {
return false;
}
/**
 * Used to add additional operator-specific information from the storage handler during a DESCRIBE EXTENDED statement.
*
* @param operatorDesc operatorDesc
* @param initialProps Map containing initial operator properties
* @return Map&lt;String, String&gt; containing additional operator specific information from storage handler
 * or {@code initialProps} if the storage handler chooses not to provide any such information.
*/
default Map<String, String> getOperatorDescProperties(OperatorDesc operatorDesc, Map<String, String> initialProps) {
return initialProps;
}
/**
* Return some basic statistics (numRows, numFiles, totalSize) calculated by the underlying storage handler
* implementation.
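 *
 * <p>A sketch of what an implementation might return (the metadata lookup is a
 * hypothetical helper; the keys are the basic-stats keys named above):
 * <pre>{@code
 * public Map<String, String> getBasicStatistics(Partish partish) {
 *   TableMetadata meta = readTableMetadata(partish); // hypothetical helper
 *   Map<String, String> stats = new HashMap<>();
 *   stats.put("numRows", String.valueOf(meta.rowCount()));
 *   stats.put("numFiles", String.valueOf(meta.fileCount()));
 *   stats.put("totalSize", String.valueOf(meta.totalBytes()));
 *   return stats;
 * }
 * }</pre>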
* @param partish a partish wrapper class
* @return map of basic statistics, can be null
*/
default Map<String, String> getBasicStatistics(Partish partish) {
return null;
}
/**
* Check if the storage handler can provide basic statistics.
* @return true if the storage handler can supply the basic statistics
*/
default boolean canProvideBasicStatistics() {
return false;
}
/**
 * Return some column statistics (lower bounds, upper bounds, null value counts, NaN counts,
 * total counts) calculated by the underlying storage handler implementation.
 * @param table the table to collect column statistics for
 * @return a list of column statistics objects, can be null
 */
default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
return null;
}
/**
 * Set column statistics for non-native tables.
 * @param table the table to set column statistics on
 * @param colStats the column statistics to set
 * @return true if the column statistics were set successfully
*/
default boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table table,
List<ColumnStatistics> colStats) {
return false;
}
/**
 * Check if the storage handler can provide column statistics.
 * @param tbl the table to check
 * @return true if the storage handler can supply the column statistics
*/
default boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
return false;
}
/**
 * Check if the storage handler can set column statistics.
 * @param tbl the table to check
 * @return true if the storage handler can set the column statistics
*/
default boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
return false;
}
/**
 * Check if the storage handler can answer a few queries, like count(1), purely using statistics.
 * @param tbl the table to check
 * @return true if the storage handler can answer the query using statistics
*/
default boolean canComputeQueryUsingStats(org.apache.hadoop.hive.ql.metadata.Table tbl) {
return false;
}
/**
 * Gets the storage format descriptor to be used for the temp table created for LOAD DATA.
 * @param table table object
 * @return the StorageFormatDescriptor if the storage handler supports LOAD DATA, null otherwise
*/
default StorageFormatDescriptor getStorageFormatDescriptor(Table table) throws SemanticException {
return null;
}
/**
* Check if CTAS and CMV operations should behave in a direct-insert manner (i.e. no move task).
* <p>
* Please note that the atomicity of the operation will suffer in this case, i.e. the created table might become
 * exposed, depending on the implementation, before the CTAS or CMV operation finishes.
* Rollback (e.g. dropping the table) is also the responsibility of the storage handler in case of failures.
*
* @return whether direct insert CTAS or CMV is required
*/
default boolean directInsert() {
return false;
}
/**
* Check if partition columns should be removed and added to the list of regular columns in HMS.
* This can be useful for non-native tables where the table format/layout differs from the standard Hive table layout,
* e.g. Iceberg tables. For these table formats, the partition column values are stored in the data files along with
* regular column values, therefore the object inspectors should include the partition columns as well.
 * Any partitioning scheme provided via the standard HiveQL syntax will be honored but stored somewhere
* other than HMS, depending on the storage handler implementation.
*
* @return whether table should always be unpartitioned from the perspective of HMS
*/
default boolean alwaysUnpartitioned() {
return false;
}
/**
 * Retains storage handler-specific properties during CTLT (CREATE TABLE LIKE).
* @param tbl the table
* @param desc the table descriptor
* @param origParams the original table properties
*/
default void setTableParametersForCTLT(org.apache.hadoop.hive.ql.metadata.Table tbl, CreateTableLikeDesc desc,
Map<String, String> origParams) {
}
/**
 * Extract the native properties of the table which aren't stored in the HMS.
* @param table the table
* @return map with native table level properties
*/
default Map<String, String> getNativeProperties(org.apache.hadoop.hive.ql.metadata.Table table) {
return new HashMap<>();
}
/**
* Returns whether the data should be overwritten for the specific operation.
* @param mTable the table.
 * @param operationName the name of the operation.
 * @return true if the data should be overwritten for the specified operation.
*/
default boolean shouldOverwrite(org.apache.hadoop.hive.ql.metadata.Table mTable, String operationName) {
return false;
}
enum AcidSupportType {
NONE,
WITH_TRANSACTIONS,
WITHOUT_TRANSACTIONS
}
/**
* Specifies whether the table supports ACID operations or not (DELETE, UPDATE and MERGE statements).
*
* Possible return values:
* <ul>
* <li>AcidSupportType.NONE - ACID operations are not supported</li>
* <li>AcidSupportType.WITH_TRANSACTIONS - ACID operations are supported, and must use a valid HiveTxnManager to wrap
* the operation in a transaction, like in the case of standard Hive ACID tables</li>
* <li>AcidSupportType.WITHOUT_TRANSACTIONS - ACID operations are supported, and there is no need for a HiveTxnManager
* to open/close transactions for the operation, i.e. org.apache.hadoop.hive.ql.lockmgr.DummyTxnManager
* can be used</li>
* </ul>
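 *
 * <p>A sketch: a handler that guarantees atomicity itself (e.g. via snapshot-based
 * commits) and therefore needs no Hive transaction might return:
 * <pre>{@code
 * public AcidSupportType supportsAcidOperations(
 *     org.apache.hadoop.hive.ql.metadata.Table table, boolean isWriteOperation) {
 *   return AcidSupportType.WITHOUT_TRANSACTIONS;
 * }
 * }</pre>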
*
* @return the table's ACID support type
*/
default AcidSupportType supportsAcidOperations(org.apache.hadoop.hive.ql.metadata.Table table,
boolean isWriteOperation) {
return AcidSupportType.NONE;
}
/**
* Specifies which additional virtual columns should be added to the virtual column registry during compilation
* for tables that support ACID operations.
*
 * Should only return a non-empty list if
 * {@link HiveStorageHandler#supportsAcidOperations(org.apache.hadoop.hive.ql.metadata.Table, boolean)} returns something
 * other than NONE.
*
* @return the list of ACID virtual columns
*/
default List<VirtualColumn> acidVirtualColumns() {
return Collections.emptyList();
}
/**
* {@link org.apache.hadoop.hive.ql.parse.UpdateDeleteSemanticAnalyzer} rewrites DELETE/UPDATE queries into INSERT
* queries.
* - DELETE FROM T WHERE A = 32 is rewritten into
* INSERT INTO T SELECT &lt;selectCols&gt; FROM T WHERE A = 32 SORT BY &lt;sortCols&gt;.
* - UPDATE T SET B=12 WHERE A = 32 is rewritten into
* INSERT INTO T SELECT &lt;selectCols&gt;, &lt;newValues&gt; FROM T WHERE A = 32 SORT BY &lt;sortCols&gt;.
*
* This method specifies which columns should be injected into the &lt;selectCols&gt; part of the rewritten query.
*
* Should only return a non-empty list if
* {@link HiveStorageHandler#supportsAcidOperations(org.apache.hadoop.hive.ql.metadata.Table, boolean)} returns something
 * other than NONE.
*
* @param table the table which is being deleted/updated/merged into
* @param operation the operation type we are executing
* @return the list of columns that should be projected in the rewritten ACID query
*/
default List<FieldSchema> acidSelectColumns(org.apache.hadoop.hive.ql.metadata.Table table, Operation operation) {
return Collections.emptyList();
}
/**
* {@link org.apache.hadoop.hive.ql.parse.UpdateDeleteSemanticAnalyzer} rewrites DELETE/UPDATE queries into INSERT
* queries. E.g. DELETE FROM T WHERE A = 32 is rewritten into
* INSERT INTO T SELECT &lt;selectCols&gt; FROM T WHERE A = 32 SORT BY &lt;sortCols&gt;.
*
* This method specifies which columns should be injected into the &lt;sortCols&gt; part of the rewritten query.
*
* Should only return a non-empty list if
* {@link HiveStorageHandler#supportsAcidOperations(org.apache.hadoop.hive.ql.metadata.Table, boolean)} returns something
 * other than NONE.
*
* @param table the table which is being deleted/updated/merged into
* @param operation the operation type we are executing
* @return the list of columns that should be used as sort columns in the rewritten ACID query
*/
default List<FieldSchema> acidSortColumns(org.apache.hadoop.hive.ql.metadata.Table table, Operation operation) {
return Collections.emptyList();
}
/**
* Check if the underlying storage handler implementation supports sort columns.
* @return true if the storage handler can support it
*/
default boolean supportsSortColumns() {
return false;
}
/**
 * Collect the columns that are used to sort the content of the data files.
* @param table the table which is being sorted
* @return the list of columns that are used during data sorting
*/
default List<FieldSchema> sortColumns(org.apache.hadoop.hive.ql.metadata.Table table) {
Preconditions.checkState(supportsSortColumns(), "Should only be called for table formats where data sorting " +
"is supported");
return Collections.emptyList();
}
/**
 * Check if the underlying storage handler implementation supports partition transformations.
* @return true if the storage handler can support it
*/
default boolean supportsPartitionTransform() {
return false;
}
/**
 * Return a list of partition transform specifications. This method should be overridden if
* {@link HiveStorageHandler#supportsPartitionTransform()} returns true.
* @param table the HMS table, must be non-null
* @return partition transform specification, can be null.
*/
default List<TransformSpec> getPartitionTransformSpec(org.apache.hadoop.hive.ql.metadata.Table table) {
return null;
}
/**
* Creates a DynamicPartitionCtx instance that will be set up by the storage handler itself. Useful for non-native
* tables where partitions are not handled by Hive, and sorting is required in a custom way before writing the table.
* @param conf job conf
* @param table the HMS table
* @return the created DP context object, null if DP context / sorting is not required
 * @throws SemanticException if the DP context cannot be created
*/
default DynamicPartitionCtx createDPContext(
HiveConf conf, org.apache.hadoop.hive.ql.metadata.Table table, Operation writeOperation)
throws SemanticException {
Preconditions.checkState(alwaysUnpartitioned(), "Should only be called for table formats where partitioning " +
"is not handled by Hive but the table format itself. See alwaysUnpartitioned() method.");
return null;
}
/**
* Get file format property key, if the file format is configured through a table property.
* @return table property key, can be null
*/
default String getFileFormatPropertyKey() {
return null;
}
/**
* Checks if we should keep the {@link org.apache.hadoop.hive.ql.exec.MoveTask} and use the
* {@link #storageHandlerCommit(Properties, boolean)} method for committing inserts instead of
* {@link org.apache.hadoop.hive.metastore.DefaultHiveMetaHook#commitInsertTable(Table, boolean)}.
* @return Returns true if we should use the {@link #storageHandlerCommit(Properties, boolean)} method
*/
default boolean commitInMoveTask() {
return false;
}
/**
* Commits the inserts for the non-native tables. Used in the {@link org.apache.hadoop.hive.ql.exec.MoveTask}.
* @param commitProperties Commit properties which are needed for the handler based commit
 * @param overwrite true if this is an INSERT OVERWRITE
* @throws HiveException If there is an error during commit
*/
default void storageHandlerCommit(Properties commitProperties, boolean overwrite) throws HiveException {
throw new UnsupportedOperationException();
}
/**
* Checks whether a certain ALTER TABLE operation is supported by the storage handler implementation.
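 *
 * <p>A sketch of an override that narrows the default set, admitting property
 * changes only (illustrative, not a recommended policy):
 * <pre>{@code
 * public boolean isAllowedAlterOperation(AlterTableType opType) {
 *   return opType == AlterTableType.ADDPROPS || opType == AlterTableType.DROPPROPS;
 * }
 * }</pre>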
*
* @param opType The alter operation type (e.g. RENAME_COLUMNS)
* @return whether the operation is supported by the storage handler
*/
default boolean isAllowedAlterOperation(AlterTableType opType) {
return DEFAULT_ALLOWED_ALTER_OPS.contains(opType);
}
/**
 * Check if the underlying storage handler implementation supports the truncate operation
 * for non-native tables.
 * @return true if the storage handler can support it
*/
default boolean supportsTruncateOnNonNativeTables() {
return false;
}
/**
* Should return true if the StorageHandler is able to handle time travel.
* @return True if time travel is allowed
*/
default boolean isTimeTravelAllowed() {
return false;
}
/**
 * Introduced by HIVE-25457 for Iceberg to query metadata tables.
* @return true if the storage handler can support it
* @deprecated Use {@link #isTableMetaRefSupported()}
*/
@Deprecated
default boolean isMetadataTableSupported() {
return isTableMetaRefSupported();
}
/**
 * Check whether the table supports metadata references, which mainly include branches, tags and metadata tables.
* @return true if the storage handler can support it
*/
default boolean isTableMetaRefSupported() {
return false;
}
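/**
 * Checks whether the given name refers to a valid metadata table of this handler.
 * Only meaningful if {@link #isTableMetaRefSupported()} returns true.
 * @param metaTableName the metadata table name to validate
 * @return true if the name identifies a supported metadata table
 */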
default boolean isValidMetadataTable(String metaTableName) {
return false;
}
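/**
 * Resolves the given metadata reference (branch, tag or metadata table) against the
 * table and returns a table object with the reference applied.
 * Only meaningful if {@link #isTableMetaRefSupported()} returns true.
 * @param hmsTable the table the reference is resolved against
 * @param tableMetaRef the metadata reference name
 * @return the table with the metadata reference set
 * @throws SemanticException if the reference is not valid for the table
 */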
default org.apache.hadoop.hive.ql.metadata.Table checkAndSetTableMetaRef(
org.apache.hadoop.hive.ql.metadata.Table hmsTable, String tableMetaRef) throws SemanticException {
return null;
}
/**
 * Constructs a URI for authorization purposes using the HMS table object.
* @param table The HMS table object
* @return the URI for authorization
*/
default URI getURIForAuth(Table table) throws URISyntaxException {
Map<String, String> tableProperties = HiveCustomStorageHandlerUtils.getTableProperties(table);
return new URI(this.getClass().getSimpleName().toLowerCase() + "://" +
HiveCustomStorageHandlerUtils.getTablePropsForCustomStorageHandler(tableProperties));
}
/**
* Validates whether the sink operation is permitted for the specific storage handler, based
* on information contained in the sinkDesc.
* @param sinkDesc The sink descriptor
* @throws SemanticException if the sink operation is not allowed
*/
default void validateSinkDesc(FileSinkDesc sinkDesc) throws SemanticException {
}
/**
 * Execute an operation at the storage handler level.
 * @param table the table the operation is executed on
 * @param executeSpec operation specification
*/
default void executeOperation(org.apache.hadoop.hive.ql.metadata.Table table, AlterTableExecuteSpec executeSpec) {
}
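/**
 * Executes a branch-related ALTER TABLE operation (e.g. creating or dropping a branch)
 * described by the given specification.
 * @param table the table the branch operation applies to
 * @param alterBranchSpec the branch operation specification
 */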
default void alterTableBranchOperation(org.apache.hadoop.hive.ql.metadata.Table table,
AlterTableBranchSpec alterBranchSpec) {
}
/**
* Gets whether this storage handler supports snapshots.
 * @return true if snapshots are supported, false otherwise
*/
default boolean areSnapshotsSupported() {
return false;
}
/**
 * Query the context of the most recent unique snapshot of the given table.
 * @param table the {@link org.apache.hadoop.hive.ql.metadata.Table} whose snapshot context should be returned.
 * @return {@link SnapshotContext} wrapping the snapshotId, or null if no snapshot is present.
*/
default SnapshotContext getCurrentSnapshotContext(org.apache.hadoop.hive.ql.metadata.Table table) {
return null;
}
/**
* Alter table operations can rely on this to customize the EnvironmentContext to be used during the alter table
 * invocation (both on the client and server side of HMS).
* @param alterTableDesc the alter table desc (e.g.: AlterTableSetPropertiesDesc) containing the work to do
* @param environmentContext an existing EnvironmentContext created prior, now to be filled/amended
*/
default void prepareAlterTableEnvironmentContext(AbstractAlterTableDesc alterTableDesc,
EnvironmentContext environmentContext) {
}
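/**
 * Checks whether the table has had only append operations since the given snapshot.
 * @param hmsTable the table to inspect
 * @param since the snapshot to inspect changes from
 * @return true if only appends occurred, false if other operations occurred,
 * null if this cannot be determined
 */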
default Boolean hasAppendsOnly(org.apache.hadoop.hive.ql.metadata.Table hmsTable, SnapshotContext since) {
return null;
}
}