be/src/exec/orc/orc-metadata-utils.h - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <orc/OrcFile.hh>

 #include "runtime/descriptors.h"

 namespace impala {

 class FileMetadataUtils;

 // Key of Hive ACID version in ORC metadata.
 const string HIVE_ACID_VERSION_KEY = "hive.acid.version";

 // Table level indexes of ACID columns.
 constexpr int ACID_FIELD_OPERATION_INDEX = 0;
 constexpr int ACID_FIELD_ORIGINAL_TRANSACTION_INDEX = 1;
 constexpr int ACID_FIELD_BUCKET_INDEX = 2;
 constexpr int ACID_FIELD_ROWID_INDEX = 3;
 constexpr int ACID_FIELD_CURRENT_TRANSACTION_INDEX = 4;
 constexpr int ACID_FIELD_ROW = 5;

 // ORC type id of column "currentTransaction" in full ACID ORC files.
 constexpr int CURRENT_TRANSCACTION_TYPE_ID = 5;

 /// Util class to resolve SchemaPaths of TupleDescriptors/SlotDescriptors into orc::Type.
 class OrcSchemaResolver {
  public:
   OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc,
       const FileMetadataUtils& file_metadata_utils,
       const orc::Type* root,
       const char* filename, bool is_table_acid,
       TSchemaResolutionStrategy::type schema_resolution);

   Status Init();

   /// Resolve SchemaPath into orc::Type (ORC column representation)
   /// 'pos_field' is set to true if 'col_path' reference the index field of an array
   /// column. '*node' will be the array node if 'pos_field' is set to true.
   /// 'missing_field' is set to true if the column is missing in the ORC file.
   Status ResolveColumn(const SchemaPath& col_path, const orc::Type** node,
       bool* pos_field, bool* missing_field) const;

   /// Returns true if file schema corresponds to full ACIDv2 format.
   bool HasFullAcidV2Schema() const { return is_file_full_acid_; }

   /// Can be only invoked for original files of full transactional tables.
   /// Returns true if 'col_path' refers to an ACID column.
   bool IsAcidColumn(const SchemaPath& col_path) const;

   /// Translates 'col_path' to non-canonical table and file paths. These non-canonical
   /// paths have the same lengths. To achieve that they might contain -1 values that must
   /// be ignored. These paths are useful for tables that have different table and file
   /// schema (ACID tables, partitioned tables).
   /// E.g. ACID table schema is
   /// {
   ///   "row__id" : {...ACID columns...},
   ///   ...TABLE columns...
   /// }
   /// While ACID file schema is
   /// {
   ///   ...ACID columns...,
   ///   "row" : {...TABLE columns...}
   /// }
   /// Let's assume we have a non-partitioned ACID table and the first user column is
   /// called 'id'.
   /// In that case 'col_path' for 'id' looks like [5, 0]. This function converts it to
   /// non-canonical 'table_col_path' [-1, 1] and non-canonical 'file_col_path'
   /// [5, 0] (which is the same as the canonical in this case).
   /// Another example for ACID column 'rowid':
   /// 'col_path' is [3], 'table_col_path' is [0, 3], 'file_col_path' is [-1, 3].
   /// Different conversions are needed for original files and non-transactional tables
   /// (for the latter it only adjusts first column offsets if the table is partitioned).
   /// These non-canonical paths are easier to be processed by ResolveColumn().
   void TranslateColPaths(const SchemaPath& col_path,
       SchemaPath* table_col_path, SchemaPath* file_col_path) const;

  private:
   TSchemaResolutionStrategy::type schema_resolution_strategy_;

   /// Resolve column based on position. This only works when the fields in the HMS
   /// table schema match the file schema (apart from Hive ACID schema differences which
   /// are being handled).
   Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
       bool* pos_field, bool* missing_field) const;

   /// Resolve column based on name.
   Status ResolveColumnByName(const SchemaPath& col_path, const orc::Type** node,
       bool* pos_field, bool* missing_field) const;

   /// Resolve column based on the Iceberg field ids. This way we will retrieve the
   /// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
   /// field in the ORC file.
   Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
       bool* pos_field, bool* missing_field) const;

   /// Finds child of 'node' whose column name matches to provided 'name'.
   const orc::Type* FindChildWithName(
       const orc::Type* node, const std::string& name) const;

   /// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
   const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;

   /// Generates field ids for the columns in the same order as Iceberg. The traversal is
   /// preorder, but the assigned field IDs are not in that order. When a node is
   /// processed, its child nodes are assigned an ID, hence the difference.
   Status GenerateFieldIDs();

   inline int GetGeneratedFieldID(const orc::Type* type) const;

   SchemaPath GetCanonicalSchemaPath(const SchemaPath& col_path, int last_idx) const;

   /// Sets 'is_file_full_acid_' based on the file schema.
   void DetermineFullAcidSchema();

   const HdfsTableDescriptor& tbl_desc_;
   const FileMetadataUtils& file_metadata_utils_;
   const orc::Type* const root_;
   const char* const filename_ = nullptr;
   const bool is_table_full_acid_;
   bool is_file_full_acid_;
   std::unordered_map<const orc::Type*, int> orc_type_to_field_id_;

   /// Validate whether the ColumnType is compatible with the orc type
   Status ValidateType(const ColumnType& type, const orc::Type& orc_type,
     const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
   Status ValidateStruct(const ColumnType& type, const orc::Type& orc_type,
       const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
   Status ValidateArray(const ColumnType& type, const orc::Type& orc_type,
       const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
   Status ValidateMap(const ColumnType& type, const orc::Type& orc_type,
       const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
   Status ValidatePrimitiveType(const ColumnType& type, const orc::Type& orc_type) const
       WARN_UNUSED_RESULT;
 };
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <orc/OrcFile.hh>

	#include "runtime/descriptors.h"

	namespace impala {

	class FileMetadataUtils;

	// Key of Hive ACID version in ORC metadata.
	const string HIVE_ACID_VERSION_KEY = "hive.acid.version";

	// Table level indexes of ACID columns.
	constexpr int ACID_FIELD_OPERATION_INDEX = 0;
	constexpr int ACID_FIELD_ORIGINAL_TRANSACTION_INDEX = 1;
	constexpr int ACID_FIELD_BUCKET_INDEX = 2;
	constexpr int ACID_FIELD_ROWID_INDEX = 3;
	constexpr int ACID_FIELD_CURRENT_TRANSACTION_INDEX = 4;
	constexpr int ACID_FIELD_ROW = 5;

	// ORC type id of column "currentTransaction" in full ACID ORC files.
	constexpr int CURRENT_TRANSCACTION_TYPE_ID = 5;

	/// Util class to resolve SchemaPaths of TupleDescriptors/SlotDescriptors into orc::Type.
	class OrcSchemaResolver {
	public:
	OrcSchemaResolver(const HdfsTableDescriptor& tbl_desc,
	const FileMetadataUtils& file_metadata_utils,
	const orc::Type* root,
	const char* filename, bool is_table_acid,
	TSchemaResolutionStrategy::type schema_resolution);

	Status Init();

	/// Resolve SchemaPath into orc::Type (ORC column representation)
	/// 'pos_field' is set to true if 'col_path' reference the index field of an array
	/// column. '*node' will be the array node if 'pos_field' is set to true.
	/// 'missing_field' is set to true if the column is missing in the ORC file.
	Status ResolveColumn(const SchemaPath& col_path, const orc::Type** node,
	bool* pos_field, bool* missing_field) const;

	/// Returns true if file schema corresponds to full ACIDv2 format.
	bool HasFullAcidV2Schema() const { return is_file_full_acid_; }

	/// Can be only invoked for original files of full transactional tables.
	/// Returns true if 'col_path' refers to an ACID column.
	bool IsAcidColumn(const SchemaPath& col_path) const;

	/// Translates 'col_path' to non-canonical table and file paths. These non-canonical
	/// paths have the same lengths. To achieve that they might contain -1 values that must
	/// be ignored. These paths are useful for tables that have different table and file
	/// schema (ACID tables, partitioned tables).
	/// E.g. ACID table schema is
	/// {
	/// "row__id" : {...ACID columns...},
	/// ...TABLE columns...
	/// }
	/// While ACID file schema is
	/// {
	/// ...ACID columns...,
	/// "row" : {...TABLE columns...}
	/// }
	/// Let's assume we have a non-partitioned ACID table and the first user column is
	/// called 'id'.
	/// In that case 'col_path' for 'id' looks like [5, 0]. This function converts it to
	/// non-canonical 'table_col_path' [-1, 1] and non-canonical 'file_col_path'
	/// [5, 0] (which is the same as the canonical in this case).
	/// Another example for ACID column 'rowid':
	/// 'col_path' is [3], 'table_col_path' is [0, 3], 'file_col_path' is [-1, 3].
	/// Different conversions are needed for original files and non-transactional tables
	/// (for the latter it only adjusts first column offsets if the table is partitioned).
	/// These non-canonical paths are easier to be processed by ResolveColumn().
	void TranslateColPaths(const SchemaPath& col_path,
	SchemaPath* table_col_path, SchemaPath* file_col_path) const;

	private:
	TSchemaResolutionStrategy::type schema_resolution_strategy_;

	/// Resolve column based on position. This only works when the fields in the HMS
	/// table schema match the file schema (apart from Hive ACID schema differences which
	/// are being handled).
	Status ResolveColumnByPosition(const SchemaPath& col_path, const orc::Type** node,
	bool* pos_field, bool* missing_field) const;

	/// Resolve column based on name.
	Status ResolveColumnByName(const SchemaPath& col_path, const orc::Type** node,
	bool* pos_field, bool* missing_field) const;

	/// Resolve column based on the Iceberg field ids. This way we will retrieve the
	/// Iceberg field ids from the HMS table via 'col_path', then find the corresponding
	/// field in the ORC file.
	Status ResolveColumnByIcebergFieldId(const SchemaPath& col_path, const orc::Type** node,
	bool* pos_field, bool* missing_field) const;

	/// Finds child of 'node' whose column name matches to provided 'name'.
	const orc::Type* FindChildWithName(
	const orc::Type* node, const std::string& name) const;

	/// Finds child of 'node' that has Iceberg field id equals to 'field_id'.
	const orc::Type* FindChildWithFieldId(const orc::Type* node, const int field_id) const;

	/// Generates field ids for the columns in the same order as Iceberg. The traversal is
	/// preorder, but the assigned field IDs are not in that order. When a node is
	/// processed, its child nodes are assigned an ID, hence the difference.
	Status GenerateFieldIDs();

	inline int GetGeneratedFieldID(const orc::Type* type) const;

	SchemaPath GetCanonicalSchemaPath(const SchemaPath& col_path, int last_idx) const;

	/// Sets 'is_file_full_acid_' based on the file schema.
	void DetermineFullAcidSchema();

	const HdfsTableDescriptor& tbl_desc_;
	const FileMetadataUtils& file_metadata_utils_;
	const orc::Type* const root_;
	const char* const filename_ = nullptr;
	const bool is_table_full_acid_;
	bool is_file_full_acid_;
	std::unordered_map<const orc::Type*, int> orc_type_to_field_id_;

	/// Validate whether the ColumnType is compatible with the orc type
	Status ValidateType(const ColumnType& type, const orc::Type& orc_type,
	const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
	Status ValidateStruct(const ColumnType& type, const orc::Type& orc_type,
	const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
	Status ValidateArray(const ColumnType& type, const orc::Type& orc_type,
	const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
	Status ValidateMap(const ColumnType& type, const orc::Type& orc_type,
	const SchemaPath& col_path, int last_idx) const WARN_UNUSED_RESULT;
	Status ValidatePrimitiveType(const ColumnType& type, const orc::Type& orc_type) const
	WARN_UNUSED_RESULT;
	};
	}