| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| namespace cpp impala |
| namespace java org.apache.impala.thrift |
| |
| include "Exprs.thrift" |
| include "Status.thrift" |
| include "Types.thrift" |
| include "hive_metastore.thrift" |
| |
| // Types used to represent catalog objects. |
| |
// Type of Catalog object. Tags a TCatalogObject (see bottom of file) so that
// consumers know which optional payload field is set.
enum TCatalogObjectType {
  // UNKNOWN is used to indicate an error condition when converting
  // strings to their matching TCatalogObjectType.
  UNKNOWN = 0
  CATALOG = 1
  DATABASE = 2
  TABLE = 3
  VIEW = 4
  FUNCTION = 5
  DATA_SOURCE = 6
  PRINCIPAL = 7
  PRIVILEGE = 8
  HDFS_CACHE_POOL = 9
  // A catalog object type as a marker for authorization cache invalidation.
  AUTHZ_CACHE_INVALIDATION = 10
}
| |
// The type of a table. Indicates which storage-engine-specific field of
// TTable (hdfs_table, hbase_table, data_source_table, kudu_table) is set.
enum TTableType {
  HDFS_TABLE = 0
  HBASE_TABLE = 1
  VIEW = 2
  DATA_SOURCE_TABLE = 3
  KUDU_TABLE = 4
}
| |
// On-disk file format of a table or partition (see THdfsPartition.fileFormat).
// TODO: Separate the storage engines (e.g. Kudu) from the file formats.
// TODO: Make the names consistent with the file format keywords specified in
// the parser.
enum THdfsFileFormat {
  TEXT = 0
  RC_FILE = 1
  SEQUENCE_FILE = 2
  AVRO = 3
  PARQUET = 4
  KUDU = 5
  ORC = 6
}
| |
// Compression codecs known to Impala, used both for file data and for
// Kudu column compression (see TColumn.compression).
// TODO: Since compression is also enabled for Kudu columns, we should
// rename this enum to not be Hdfs specific.
enum THdfsCompression {
  NONE = 0
  DEFAULT = 1
  GZIP = 2
  DEFLATE = 3
  BZIP2 = 4
  SNAPPY = 5
  SNAPPY_BLOCKED = 6
  LZO = 7
  LZ4 = 8
  ZLIB = 9
  ZSTD = 10
  BROTLI = 11
  LZ4_BLOCKED = 12
}
| |
// Column storage encodings; used for Kudu columns (see TColumn.encoding).
enum TColumnEncoding {
  AUTO = 0
  PLAIN = 1
  PREFIX = 2
  GROUP_VARINT = 3
  RLE = 4
  DICTIONARY = 5
  BIT_SHUFFLE = 6
}
| |
// Compression granularity for SequenceFiles: compress each record
// individually, or compress whole blocks of records.
enum THdfsSeqCompressionMode {
  RECORD = 0
  BLOCK = 1
}
| |
// The table property type: a property of the table itself, or a property
// of the table's SerDe info.
enum TTablePropertyType {
  TBL_PROPERTY = 0
  SERDE_PROPERTY = 1
}
| |
// The access level that is available to Impala on the Catalog object.
enum TAccessLevel {
  NONE = 0
  READ_WRITE = 1
  READ_ONLY = 2
  WRITE_ONLY = 3
}
| |
// A compression codec together with an optional codec-specific level.
struct TCompressionCodec {
  // Compression codec
  1: required THdfsCompression codec
  // Compression level; only meaningful for codecs that support levels.
  2: optional i32 compression_level
}
| |
// Mapping from names defined by Avro to values in the THdfsCompression enum.
// Note that Avro's "deflate" maps to DEFAULT here, not to
// THdfsCompression.DEFLATE.
const map<string, THdfsCompression> COMPRESSION_MAP = {
  "": THdfsCompression.NONE,
  "none": THdfsCompression.NONE,
  "deflate": THdfsCompression.DEFAULT,
  "gzip": THdfsCompression.GZIP,
  "bzip2": THdfsCompression.BZIP2,
  "snappy": THdfsCompression.SNAPPY
}
| |
// Represents a single item in a partition spec (column name + value).
struct TPartitionKeyValue {
  // Partition column name
  1: required string name,

  // Partition value, as a string.
  2: required string value
}
| |
// Represents a fully qualified table name (database + table).
struct TTableName {
  // Name of the table's parent database.
  1: required string db_name

  // Unqualified name of the table.
  2: required string table_name
}
| |
// Table-level statistics, also used per-partition (see THdfsPartition.stats).
struct TTableStats {
  // Estimated number of rows in the table or -1 if unknown
  1: required i64 num_rows

  // Sum of file sizes in the table. Only set for tables of type HDFS_TABLE.
  2: optional i64 total_file_bytes
}
| |
// Column stats data that Impala uses.
struct TColumnStats {
  // Average size and max size, in bytes. Excludes serialization overhead.
  // For fixed-length types (those which don't need additional storage besides the slot
  // they occupy), sets avg_size and max_size to their slot size.
  1: required double avg_size
  2: required i64 max_size

  // Estimated number of distinct values.
  3: required i64 num_distinct_values

  // Estimated number of null values.
  4: required i64 num_nulls
}
| |
// Intermediate state for the computation of per-column stats. Impala can aggregate these
// structures together to produce final stats for a column (see TPartitionStats).
struct TIntermediateColumnStats {
  // One byte for each bucket of the NDV HLL computation
  1: optional binary intermediate_ndv

  // If true, intermediate_ndv is RLE-compressed
  2: optional bool is_ndv_encoded

  // Number of nulls seen so far (or -1 if nulls are not counted)
  3: optional i64 num_nulls

  // The maximum width, in bytes, of the column
  4: optional i32 max_width

  // The average width (in bytes) of the column
  5: optional double avg_width

  // The number of rows counted, needed to compute NDVs from intermediate_ndv
  6: optional i64 num_rows
}
| |
// Per-partition statistics.
struct TPartitionStats {
  // Number of rows gathered per-partition by non-incremental stats.
  // TODO: This can probably be removed in favour of the intermediate_col_stats, but doing
  // so would interfere with the non-incremental stats path
  1: required TTableStats stats

  // Intermediate state for incremental statistics, one entry per column name.
  2: optional map<string, TIntermediateColumnStats> intermediate_col_stats
}
| |
// Metadata for a single column of a table, including storage-engine-specific
// (HBase, Kudu) properties.
struct TColumn {
  // The column name, in lower case.
  1: required string columnName
  2: required Types.TColumnType columnType
  3: optional string comment
  // Stats for this column, if any are available.
  4: optional TColumnStats col_stats
  // Ordinal position in the source table
  5: optional i32 position

  // Indicates whether this is an HBase column. If true, implies
  // all following HBase-specific fields are set.
  6: optional bool is_hbase_column
  7: optional string column_family
  8: optional string column_qualifier
  9: optional bool is_binary

  // All the following are Kudu-specific column properties
  10: optional bool is_kudu_column
  11: optional bool is_key
  12: optional bool is_nullable
  13: optional TColumnEncoding encoding
  14: optional THdfsCompression compression
  15: optional Exprs.TExpr default_value
  16: optional i32 block_size
  // The column name, in the case that it appears in Kudu.
  17: optional string kudu_column_name
}
| |
// Represents an HDFS file in a partition.
struct THdfsFileDesc {
  // File descriptor metadata serialized into a FlatBuffer
  // (defined in common/fbs/CatalogObjects.fbs).
  // TODO: Put this in a KRPC sidecar to avoid serialization cost.
  1: required binary file_desc_data
}
| |
// Represents an HDFS partition's location in a compressed format. 'prefix_index'
// represents the portion of the partition's location that comes before the last N
// directories, where N is the number of partitioning columns. 'prefix_index' is an index
// into THdfsTable.partition_prefixes, or -1 if this location has not been compressed.
// 'suffix' is the rest of the partition location.
struct THdfsPartitionLocation {
  // Index into THdfsTable.partition_prefixes, or -1 (uncompressed).
  1: required i32 prefix_index = -1
  // The remainder of the partition location after the prefix.
  2: required string suffix
}
| |
// Represents an HDFS partition.
// NOTE: field ids are not contiguous within each section below because fields
// are grouped by use rather than declared in id order.
// TODO(vercegovac): rename to TFsPartition
struct THdfsPartition {

  // ============================================================
  // Fields included in the "Descriptor" format sent to the backend
  // as part of query plans and fragments.
  // ============================================================

  // Delimiter and escape characters used when parsing delimited text data.
  1: required byte lineDelim
  2: required byte fieldDelim
  3: required byte collectionDelim
  4: required byte mapKeyDelim
  5: required byte escapeChar
  // File format of the data files in this partition.
  6: required THdfsFileFormat fileFormat

  // These are Literal expressions
  7: list<Exprs.TExpr> partitionKeyExprs
  8: required i32 blockSize

  10: optional THdfsPartitionLocation location

  // Unique (in this table) id of this partition. May be set to
  // PROTOTYPE_PARTITION_ID when this object is used to describe
  // a partition which will be created as part of a query.
  14: optional i64 id


  // ============================================================
  // Fields only included when the catalogd serializes a table to be
  // sent to the impalad as part of a catalog update.
  // ============================================================

  // Descriptors of the data files in this partition.
  9: optional list<THdfsFileDesc> file_desc

  // The access level Impala has on this partition (READ_WRITE, READ_ONLY, etc).
  11: optional TAccessLevel access_level

  // Statistics on this partition, e.g., number of rows in this partition.
  12: optional TTableStats stats

  // True if this partition has been marked as cached (does not necessarily mean the
  // underlying data is cached).
  13: optional bool is_marked_cached

  // (key,value) pairs stored in the Hive Metastore.
  15: optional map<string, string> hms_parameters

  // The following fields store stats about this partition
  // which are collected when toThrift() is called.
  // Total number of blocks in this partition.
  16: optional i64 num_blocks

  // Total file size in bytes of this partition.
  17: optional i64 total_file_size_bytes

  // byte[] representation of TPartitionStats for this partition that is compressed using
  // 'deflate-compression'.
  18: optional binary partition_stats

  // Set to true if partition_stats contain intermediate column stats computed via
  // incremental statistics, false otherwise.
  19: optional bool has_incremental_stats

  // For acid table, store last committed write id.
  20: optional i64 write_id
}
| |
// Constant partition ID used for THdfsTable.prototype_partition below.
// Must be < 0 to avoid collisions with real partition ids.
const i64 PROTOTYPE_PARTITION_ID = -1;
| |
| |
// Represents an HDFS table.
// NOTE: fields are grouped by use, so field ids are not in declaration order.
struct THdfsTable {
  // ============================================================
  // Fields included in the "Descriptor" format sent to the backend
  // as part of query plans and fragments.
  // ============================================================

  // Base HDFS directory of the table.
  1: required string hdfsBaseDir

  // Deprecated. Use TTableDescriptor.colNames.
  2: required list<string> colNames;

  // The string used to represent NULL partition keys.
  3: required string nullPartitionKeyValue

  // String to indicate a NULL column value in text files
  5: required string nullColumnValue

  // Set to the table's Avro schema if this is an Avro table
  6: optional string avroSchema

  // Map from partition id to partition metadata.
  // Does not include the special prototype partition with id=PROTOTYPE_PARTITION_ID --
  // that partition is separately included below.
  4: required map<i64, THdfsPartition> partitions

  // Prototype partition, used when creating new partitions during insert.
  10: required THdfsPartition prototype_partition

  // REMOVED: 8: optional bool multiple_filesystems

  // The prefixes of locations of partitions in this table. See THdfsPartitionLocation for
  // the description of how a prefix is computed.
  9: optional list<string> partition_prefixes

  // ============================================================
  // Fields only included when the catalogd serializes a table to be
  // sent to the impalad as part of a catalog update.
  // ============================================================

  // Each TNetworkAddress is a datanode which contains blocks of a file in the table.
  // Used so that each THdfsFileBlock can just reference an index in this list rather
  // than duplicate the list of network address, which helps reduce memory usage.
  7: optional list<Types.TNetworkAddress> network_addresses,

  // Primary Keys information for HDFS Tables
  11: optional list<hive_metastore.SQLPrimaryKey> primary_keys,

  // Foreign Keys information for HDFS Tables
  12: optional list<hive_metastore.SQLForeignKey> foreign_keys
}
| |
// Represents an HBase table.
struct THBaseTable {
  // Name of the table in HBase.
  1: required string tableName
  // Per-column HBase column families, parallel to 'qualifiers'.
  2: required list<string> families
  // Per-column HBase column qualifiers, parallel to 'families'.
  3: required list<string> qualifiers

  // Column i is binary encoded if binary_encoded[i] is true. Otherwise, column i is
  // text encoded.
  4: optional list<bool> binary_encoded
}
| |
// Represents an external data source.
struct TDataSource {
  // Name of the data source
  1: required string name

  // HDFS URI of the library
  2: required string hdfs_location

  // Class name of the data source implementing the ExternalDataSource interface.
  3: required string class_name

  // Version of the ExternalDataSource interface. Currently only 'V1' exists.
  4: required string api_version
}
| |
// Represents a table scanned by an external data source.
struct TDataSourceTable {
  // The data source that will scan this table.
  1: required TDataSource data_source

  // Init string for the table passed to the data source. May be an empty string.
  2: required string init_string
}
| |
// Parameters needed for hash partitioning.
struct TKuduPartitionByHashParam {
  // Columns whose values are hashed to pick a partition.
  1: required list<string> columns
  // Number of hash partitions (buckets).
  2: required i32 num_partitions
}
| |
// A single range partition, defined by optional lower/upper bound values and
// whether each bound is inclusive.
struct TRangePartition {
  1: optional list<Exprs.TExpr> lower_bound_values
  2: optional bool is_lower_bound_inclusive
  3: optional list<Exprs.TExpr> upper_bound_values
  4: optional bool is_upper_bound_inclusive
}
| |
// A range partitioning is identified by a list of columns and a list of range partitions.
struct TKuduPartitionByRangeParam {
  // Columns the range partitioning applies to.
  1: required list<string> columns
  // The individual range partitions; see TRangePartition.
  2: optional list<TRangePartition> range_partitions
}
| |
// Parameters for the PARTITION BY clause. Exactly one of the two fields is
// expected to be set, acting as a union of hash and range partitioning.
struct TKuduPartitionParam {
  1: optional TKuduPartitionByHashParam by_hash_param;
  2: optional TKuduPartitionByRangeParam by_range_param;
}
| |
// Represents a Kudu table.
struct TKuduTable {
  // Name of the table as known to Kudu.
  1: required string table_name

  // Network address of a master host in the form of 0.0.0.0:port
  2: required list<string> master_addresses

  // Name of the key columns
  3: required list<string> key_columns

  // Partitioning clauses; see TKuduPartitionParam.
  4: required list<TKuduPartitionParam> partition_by
}
| |
// Represents a table or view.
struct TTable {
  // Name of the parent database. Case insensitive, expected to be stored as lowercase.
  1: required string db_name

  // Unqualified table name. Case insensitive, expected to be stored as lowercase.
  2: required string tbl_name

  // Set if there were any errors loading the Table metadata. The remaining fields in
  // the struct may not be set if there were problems loading the table metadata.
  // By convention, the final error message in the Status should contain the call stack
  // string pointing to where the metadata loading error occurred.
  3: optional Status.TStatus load_status

  // The access level Impala has on this table (READ_WRITE, READ_ONLY, etc).
  4: optional TAccessLevel access_level

  // List of columns (excludes clustering columns)
  5: optional list<TColumn> columns

  // List of clustering columns (empty list if table has no clustering columns)
  6: optional list<TColumn> clustering_columns

  // Table stats data for the table.
  7: optional TTableStats table_stats

  // Determines the table type; see TTableType (HDFS, HBASE, VIEW, DATA_SOURCE
  // or KUDU). Indicates which of the storage-specific fields below is set.
  8: optional TTableType table_type

  // Set iff this is an HDFS table
  9: optional THdfsTable hdfs_table

  // Set iff this is an Hbase table
  10: optional THBaseTable hbase_table

  // The Hive Metastore representation of this table. May not be set if there were
  // errors loading the table metadata
  11: optional hive_metastore.Table metastore_table

  // Set iff this is a table from an external data source
  12: optional TDataSourceTable data_source_table

  // Set iff this a kudu table
  13: optional TKuduTable kudu_table

  // Set iff this is an acid table. The valid write ids list.
  // The string is assumed to be created by ValidWriteIdList.writeToString
  // For example ValidReaderWriteIdList object's format is:
  // <table_name>:<highwatermark>:<minOpenWriteId>:<open_writeids>:<abort_writeids>
  14: optional string valid_write_ids

  // Set if this table needs storage access during metadata load.
  // Time used for storage loading in nanoseconds.
  15: optional i64 storage_metadata_load_time_ns
}
| |
// Represents a database.
struct TDatabase {
  // Name of the database. Case insensitive, expected to be stored as lowercase.
  1: required string db_name

  // The Hive Metastore representation of this database. May not be set if there were
  // errors loading the database metadata
  2: optional hive_metastore.Database metastore_db
}
| |
// Represents a type of principal in an authorization policy.
enum TPrincipalType {
  ROLE = 0
  USER = 1
  GROUP = 2
}
| |
// Represents a principal (role, user or group) in an authorization policy.
struct TPrincipal {
  // Case-insensitive principal name
  1: required string principal_name

  // Unique ID of this principal, generated by the Catalog Server.
  2: required i32 principal_id

  // Type of this principal.
  3: required TPrincipalType principal_type

  // List of groups this principal has been granted to (group names are case sensitive).
  // TODO: Keep a list of grant groups globally (in TCatalog?) and reference by ID since
  // the same groups will likely be shared across multiple principals.
  4: required list<string> grant_groups
}
| |
// The scope a TPrivilege applies to.
enum TPrivilegeScope {
  SERVER = 0
  URI = 1
  DATABASE = 2
  TABLE = 3
  COLUMN = 4
}
| |
// The privilege level allowed.
enum TPrivilegeLevel {
  ALL = 0
  INSERT = 1
  SELECT = 2
  REFRESH = 3
  CREATE = 4
  ALTER = 5
  DROP = 6
  OWNER = 7
}
| |
// Represents a privilege in an authorization policy. Privileges contain the level
// of access, the scope and principal the privilege applies to, and details on what
// catalog object the privilege is securing. Objects are hierarchical, so a privilege
// corresponding to a table must also specify all the parent objects (database name
// and server name).
struct TPrivilege {
  // NOTE: This field is no longer needed. Keeping it here to keep the field numbers.
  // A human readable name for this privilege. The combination of principal_id +
  // privilege_name is guaranteed to be unique. Stored in a form that can be passed
  // to Sentry: [ServerName]->[DbName]->[TableName]->[ColumnName]->[Action Granted].
  // 1: required string privilege_name

  // The level of access this privilege provides.
  2: required TPrivilegeLevel privilege_level

  // The scope of the privilege: SERVER, DATABASE, URI, TABLE or COLUMN
  3: required TPrivilegeScope scope

  // If true, GRANT OPTION was specified. For a GRANT privilege statement, everyone
  // granted this principal should be able to issue GRANT/REVOKE privilege statements even
  // if they are not an admin. For REVOKE privilege statements, the privilege should be
  // retained and the existing GRANT OPTION (if it was set) on the privilege should be
  // removed.
  4: required bool has_grant_opt

  // The ID of the principal this privilege belongs to.
  5: optional i32 principal_id

  // The type of the principal this privilege belongs to.
  6: optional TPrincipalType principal_type

  // Set if scope is SERVER, URI, DATABASE, or TABLE
  7: optional string server_name

  // Set if scope is DATABASE or TABLE
  8: optional string db_name

  // Unqualified table name. Set if scope is TABLE.
  9: optional string table_name

  // Set if scope is URI
  10: optional string uri

  // Time this privilege was created (in milliseconds since epoch).
  11: optional i64 create_time_ms

  // Set if scope is COLUMN
  12: optional string column_name
}
| |
// Thrift representation of an HdfsCachePool.
struct THdfsCachePool {
  // Name of the cache pool
  1: required string pool_name

  // In the future we may want to include additional info on the pool such as
  // the pool limits, pool owner, etc.
}
| |
// Thrift representation of a TAuthzCacheInvalidation. This catalog object does not
// contain any authorization data and it's used as a marker to perform an authorization
// cache invalidation.
struct TAuthzCacheInvalidation {
  // Name of the authorization cache marker.
  1: required string marker_name
}
| |
// Represents state associated with the overall catalog.
struct TCatalog {
  // The CatalogService service ID.
  1: required Types.TUniqueId catalog_service_id

  // The catalog version last time when we reset the entire catalog
  2: required i64 last_reset_catalog_version
}
| |
// Union of all Thrift Catalog objects. Exactly one of the optional fields is
// expected to be set, as indicated by 'type'.
struct TCatalogObject {
  // The object type; determines which of the optional fields below is set.
  1: required TCatalogObjectType type

  // The Catalog version this object is from
  2: required i64 catalog_version

  // Set iff object type is CATALOG
  3: optional TCatalog catalog

  // Set iff object type is DATABASE
  4: optional TDatabase db

  // Set iff object type is TABLE or VIEW
  5: optional TTable table

  // Set iff object type is FUNCTION
  6: optional Types.TFunction fn

  // Set iff object type is DATA SOURCE
  7: optional TDataSource data_source

  // Set iff object type is PRINCIPAL
  8: optional TPrincipal principal

  // Set iff object type is PRIVILEGE
  9: optional TPrivilege privilege

  // Set iff object type is HDFS_CACHE_POOL
  10: optional THdfsCachePool cache_pool

  // Set iff object type is AUTHZ_CACHE_INVALIDATION
  11: optional TAuthzCacheInvalidation authz_cache_invalidation
}