src/iceberg/schema.h - iceberg-cpp - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 #pragma once

 /// \file iceberg/schema.h
 /// Schemas for Iceberg tables.  This header contains the definition of Schema
 /// and any utility functions.  See iceberg/type.h and iceberg/schema_field.h as well.

 #include <cstddef>
 #include <cstdint>
 #include <functional>
 #include <memory>
 #include <optional>
 #include <span>
 #include <string>
 #include <string_view>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>

 #include "iceberg/iceberg_export.h"
 #include "iceberg/result.h"
 #include "iceberg/schema_field.h"
 #include "iceberg/type.h"
 #include "iceberg/util/lazy.h"
 #include "iceberg/util/string_util.h"

 namespace iceberg {

 class SchemaCache;

 /// \brief A schema for a Table.
 ///
 /// A schema is a list of typed columns, along with a unique integer ID.  A
 /// Table may have different schemas over its lifetime due to schema
 /// evolution.
 class ICEBERG_EXPORT Schema : public StructType {
  public:
   /// \brief Schema ID used by the constructor when no explicit ID is passed.
   static constexpr int32_t kInitialSchemaId = 0;
   /// \brief Initial column ID value (0).
   ///
   /// NOTE(review): presumably the starting point for assigning column IDs; its users
   /// are not visible in this header -- confirm.
   static constexpr int32_t kInitialColumnId = 0;
   /// \brief Column ID value (-1) that, going by its name, denotes an invalid column.
   static constexpr int32_t kInvalidColumnId = -1;

   /// \brief Special value to select all columns from manifest files.
   static constexpr std::string_view kAllColumns = "*";

   /// \brief Construct a schema from its fields.
   ///
   /// \param fields The fields that make up the schema.
   /// \param schema_id The unique identifier for this schema (default:
   /// kInitialSchemaId).
   explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId);

   /// \brief Create a schema whose identifier fields are given by field ID.
   ///
   /// \param fields The fields that make up the schema.
   /// \param schema_id The unique identifier for this schema. This overload declares no
   /// default value, so it must be passed explicitly (e.g. kInitialSchemaId).
   /// \param identifier_field_ids Field IDs that uniquely identify rows in the table.
   /// \return A new Schema instance, or an error if creation failed.
   static Result<std::unique_ptr<Schema>> Make(std::vector<SchemaField> fields,
                                               int32_t schema_id,
                                               std::vector<int32_t> identifier_field_ids);

   /// \brief Create a schema whose identifier fields are given by name.
   ///
   /// \param fields The fields that make up the schema.
   /// \param schema_id The unique identifier for this schema. This overload declares no
   /// default value, so it must be passed explicitly (e.g. kInitialSchemaId).
   /// \param identifier_field_names Canonical names of fields that uniquely identify rows
   /// in the table.
   /// \return A new Schema instance, or an error if creation failed.
   static Result<std::unique_ptr<Schema>> Make(
       std::vector<SchemaField> fields, int32_t schema_id,
       const std::vector<std::string>& identifier_field_names);

   /// \brief Validate that the identifier field with the given ID is valid for the schema
   ///
   /// This method checks that the specified field ID represents a valid identifier field
   /// according to Iceberg's identifier field requirements. It verifies that the field:
   /// - exists in the schema
   /// - is a primitive type
   /// - is not optional (required field)
   /// - is not a float or double type
   /// - is not nested within optional or non-struct parent fields
   ///
   /// Despite the plural name, each call validates the single field named by field_id.
   ///
   /// \param field_id The ID of the field to validate as an identifier field.
   /// \param schema The schema containing the field to validate.
   /// \param id_to_parent A mapping from field IDs to their parent field IDs for nested
   /// field validation.
   /// \return Status indicating success or failure of the validation.
   static Status ValidateIdentifierFields(
       int32_t field_id, const Schema& schema,
       const std::unordered_map<int32_t, int32_t>& id_to_parent);

   /// \brief Get an empty schema.
   ///
   /// An empty schema has no fields and a schema ID of 0.
   ///
   /// \return A reference to the shared pointer holding the empty schema.
   static const std::shared_ptr<Schema>& EmptySchema();

   /// \brief Get the schema ID.
   ///
   /// A schema is identified by a unique ID for the purposes of schema
   /// evolution.
   int32_t schema_id() const;

   /// \brief Return a string representation of this schema (overrides the base class).
   std::string ToString() const override;

   /// \brief Recursively find the SchemaField by field name.
   ///
   /// Short names for maps and lists are included for any name that does not conflict with
   /// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
   /// short name 'l.x' in addition to canonical name 'l.element.x'. A map 'm', if its
   /// value includes a struct with field 'x' will produce short name 'm.x' in addition to
   /// canonical name 'm.value.x'.
   /// FIXME: Currently only handles ASCII lowercase conversion; extend to support
   /// non-ASCII characters (e.g., using std::towlower or ICU)
   ///
   /// \param name The (possibly dot-separated) field name to look up.
   /// \param case_sensitive Whether name matching is case-sensitive (default: true).
   /// \return The matching field. NOTE(review): presumably std::nullopt when no field
   /// matches, mirroring FindFieldById -- the implementation is not visible here.
   Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldByName(
       std::string_view name, bool case_sensitive = true) const;

   /// \brief Recursively find the SchemaField by field id.
   ///
   /// \param field_id The id of the field to find.
   /// \return The field with the given id, or std::nullopt if not found.
   Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
       int32_t field_id) const;

   /// \brief Returns the canonical field name for the given id.
   ///
   /// \param field_id The id of the field to get the canonical name for.
   /// \return The canonical column name of the field with the given id, or std::nullopt if
   /// not found.
   Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;

   /// \brief Get the accessor to access the field by field id.
   ///
   /// \param field_id The id of the field to get the accessor for.
   /// \return The accessor to access the field, or NotFound if the field is not found.
   Result<std::unique_ptr<StructLikeAccessor>> GetAccessorById(int32_t field_id) const;

   /// \brief Creates a projected schema from selected field names.
   ///
   /// \param names Selected field names and nested names are dot-concatenated.
   /// \param case_sensitive Whether name matching is case-sensitive (default: true).
   /// \return Projected schema containing only selected fields.
   /// \note If the field name of a nested type has been selected, all of its
   /// sub-fields will be selected.
   Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
                                          bool case_sensitive = true) const;

   /// \brief Creates a projected schema from selected field IDs.
   ///
   /// \param field_ids Set of field IDs to select
   /// \return Projected schema containing only the specified fields.
   /// \note Field ID of a nested field may not be projected unless at least
   /// one of its sub-fields has been projected.
   Result<std::unique_ptr<Schema>> Project(
       const std::unordered_set<int32_t>& field_ids) const;

   /// \brief Return the field IDs of the identifier fields.
   const std::vector<int32_t>& IdentifierFieldIds() const;

   /// \brief Return the canonical field names of the identifier fields.
   Result<std::vector<std::string>> IdentifierFieldNames() const;

   /// \brief Get the highest field ID in the schema.
   /// \return The highest field ID.
   Result<int32_t> HighestFieldId() const;

   /// \brief Checks whether this schema is equivalent to another schema while ignoring the
   /// schema id.
   bool SameSchema(const Schema& other) const;

   /// \brief Validate the schema for a given format version.
   ///
   /// This validates that the schema does not contain types that were released in later
   /// format versions.
   ///
   /// \param format_version The format version to validate against.
   /// \return Error status if the schema is invalid.
   Status Validate(int32_t format_version) const;

   /// \brief Equality operator; forwards to Equals().
   friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }

  private:
   /// \brief Compare two schemas for equality (the implementation behind operator==).
   bool Equals(const Schema& other) const;

   // Unique identifier of this schema; const, so it is fixed at construction.
   const int32_t schema_id_;
   // Field IDs that uniquely identify rows in the table.
   std::vector<int32_t> identifier_field_ids_;
   // Cache for schema mappings to facilitate fast lookups.
   std::unique_ptr<SchemaCache> cache_;
 };

 // Cache for schema mappings to facilitate fast lookups.
 class ICEBERG_EXPORT SchemaCache {
  public:
   /// \brief Create a cache for the given schema.
   ///
   /// The constructor stores the pointer in schema_ and has an empty body.
   /// NOTE(review): only a raw pointer is kept, so the schema presumably has to outlive
   /// the cache -- confirm against the owner of this object.
   explicit SchemaCache(const Schema* schema) : schema_(schema) {}

   /// \brief Mapping from field ID to a reference to the corresponding SchemaField.
   using IdToFieldMap =
       std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
   /// \brief Const reference to an IdToFieldMap, as returned by GetIdToFieldMap().
   using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;

   /// \brief Pair of lookup tables between field names and field IDs.
   struct NameIdMap {
     /// \brief Mapping from canonical field name to ID
     ///
     /// \note Short names for maps and lists are included for any name that does not
     /// conflict with a canonical name. For example, a list, 'l', of structs with field
     /// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
     std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;

     /// \brief Mapping from field ID to canonical name
     ///
     /// \note Canonical names, but not short names are set, for example
     /// 'list.element.field' instead of 'list.field'.
     std::unordered_map<int32_t, std::string> id_to_name;
   };
   /// \brief Const reference to a NameIdMap, as returned by GetNameIdMap().
   using NameIdMapRef = std::reference_wrapper<const NameIdMap>;

   /// \brief Mapping from lowercased field name to field ID.
   using LowercaseNameToIdMap =
       std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
   /// \brief Const reference to a LowercaseNameToIdMap, as returned by
   /// GetLowercaseNameToIdMap().
   using LowercaseNameToIdMapRef = std::reference_wrapper<const LowercaseNameToIdMap>;

   /// \brief Mapping from field ID to the (nested) position path used to access the field.
   using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
   /// \brief Const reference to an IdToPositionPathMap, as returned by
   /// GetIdToPositionPathMap().
   using IdToPositionPathMapRef = std::reference_wrapper<const IdToPositionPathMap>;

   // Accessors for the cached values. Each returns a Result, so a failure is reported to
   // the caller instead of a value.
   // NOTE(review): the values live in the Lazy<Init*> members below, so they are
   // presumably computed on first access; Lazy is declared in iceberg/util/lazy.h and is
   // not visible here -- confirm.

   /// \brief Get the mapping from field ID to field.
   Result<IdToFieldMapRef> GetIdToFieldMap() const;
   /// \brief Get the name <-> ID lookup tables.
   Result<NameIdMapRef> GetNameIdMap() const;
   /// \brief Get the mapping from lowercased field name to field ID.
   Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
   /// \brief Get the mapping from field ID to position path.
   Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
   /// \brief Get the highest field ID in the schema.
   Result<int32_t> GetHighestFieldId() const;

  private:
   // Builders for the cached values; each one is the template argument of the matching
   // Lazy member below and receives the schema to inspect.
   static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
   static Result<NameIdMap> InitNameIdMap(const Schema* schema);
   static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema* schema);
   static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema* schema);
   static Result<int32_t> InitHighestFieldId(const Schema* schema);

   // Schema this cache describes (raw pointer passed to the constructor).
   const Schema* schema_;
   // Mapping from field id to field.
   Lazy<InitIdToFieldMap> id_to_field_;
   // Mapping from field name to field id.
   Lazy<InitNameIdMap> name_id_map_;
   // Mapping from lowercased field name to field id.
   Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
   // Mapping from field id to (nested) position path to access the field.
   Lazy<InitIdToPositionPath> id_to_position_path_;
   // Highest field ID in the schema.
   Lazy<InitHighestFieldId> highest_field_id_;
 };

 }  // namespace iceberg
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	#pragma once

	/// \file iceberg/schema.h
	/// Schemas for Iceberg tables. This header contains the definition of Schema
	/// and any utility functions. See iceberg/type.h and iceberg/schema_field.h as well.

	#include <cstdint>
	#include <optional>
	#include <string>
	#include <unordered_map>
	#include <unordered_set>
	#include <vector>

	#include "iceberg/iceberg_export.h"
	#include "iceberg/result.h"
	#include "iceberg/schema_field.h"
	#include "iceberg/type.h"
	#include "iceberg/util/lazy.h"
	#include "iceberg/util/string_util.h"

	namespace iceberg {

	class SchemaCache;

	/// \brief A schema for a Table.
	///
	/// A schema is a list of typed columns, along with a unique integer ID. A
	/// Table may have different schemas over its lifetime due to schema
	/// evolution.
	class ICEBERG_EXPORT Schema : public StructType {
	public:
	/// \brief Schema ID used by the constructor when no explicit ID is passed.
	static constexpr int32_t kInitialSchemaId = 0;
	/// \brief Initial column ID value (0).
	///
	/// NOTE(review): presumably the starting point for assigning column IDs; its users
	/// are not visible in this header -- confirm.
	static constexpr int32_t kInitialColumnId = 0;
	/// \brief Column ID value (-1) that, going by its name, denotes an invalid column.
	static constexpr int32_t kInvalidColumnId = -1;

	/// \brief Special value to select all columns from manifest files.
	static constexpr std::string_view kAllColumns = "*";

	/// \brief Construct a schema from its fields.
	///
	/// \param fields The fields that make up the schema.
	/// \param schema_id The unique identifier for this schema (default:
	/// kInitialSchemaId).
	explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId);

	/// \brief Create a schema whose identifier fields are given by field ID.
	///
	/// \param fields The fields that make up the schema.
	/// \param schema_id The unique identifier for this schema. This overload declares no
	/// default value, so it must be passed explicitly (e.g. kInitialSchemaId).
	/// \param identifier_field_ids Field IDs that uniquely identify rows in the table.
	/// \return A new Schema instance, or an error if creation failed.
	static Result<std::unique_ptr<Schema>> Make(std::vector<SchemaField> fields,
	int32_t schema_id,
	std::vector<int32_t> identifier_field_ids);

	/// \brief Create a schema whose identifier fields are given by name.
	///
	/// \param fields The fields that make up the schema.
	/// \param schema_id The unique identifier for this schema. This overload declares no
	/// default value, so it must be passed explicitly (e.g. kInitialSchemaId).
	/// \param identifier_field_names Canonical names of fields that uniquely identify rows
	/// in the table.
	/// \return A new Schema instance, or an error if creation failed.
	static Result<std::unique_ptr<Schema>> Make(
	std::vector<SchemaField> fields, int32_t schema_id,
	const std::vector<std::string>& identifier_field_names);

	/// \brief Validate that the identifier field with the given ID is valid for the schema
	///
	/// This method checks that the specified field ID represents a valid identifier field
	/// according to Iceberg's identifier field requirements. It verifies that the field:
	/// - exists in the schema
	/// - is a primitive type
	/// - is not optional (required field)
	/// - is not a float or double type
	/// - is not nested within optional or non-struct parent fields
	///
	/// Despite the plural name, each call validates the single field named by field_id.
	///
	/// \param field_id The ID of the field to validate as an identifier field.
	/// \param schema The schema containing the field to validate.
	/// \param id_to_parent A mapping from field IDs to their parent field IDs for nested
	/// field validation.
	/// \return Status indicating success or failure of the validation.
	static Status ValidateIdentifierFields(
	int32_t field_id, const Schema& schema,
	const std::unordered_map<int32_t, int32_t>& id_to_parent);

	/// \brief Get an empty schema.
	///
	/// An empty schema has no fields and a schema ID of 0.
	///
	/// \return A reference to the shared pointer holding the empty schema.
	static const std::shared_ptr<Schema>& EmptySchema();

	/// \brief Get the schema ID.
	///
	/// A schema is identified by a unique ID for the purposes of schema
	/// evolution.
	int32_t schema_id() const;

	/// \brief Return a string representation of this schema (overrides the base class).
	std::string ToString() const override;

	/// \brief Recursively find the SchemaField by field name.
	///
	/// Short names for maps and lists are included for any name that does not conflict with
	/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
	/// short name 'l.x' in addition to canonical name 'l.element.x'. A map 'm', if its
	/// value includes a struct with field 'x' will produce short name 'm.x' in addition to
	/// canonical name 'm.value.x'.
	/// FIXME: Currently only handles ASCII lowercase conversion; extend to support
	/// non-ASCII characters (e.g., using std::towlower or ICU)
	///
	/// \param name The (possibly dot-separated) field name to look up.
	/// \param case_sensitive Whether name matching is case-sensitive (default: true).
	/// \return The matching field. NOTE(review): presumably std::nullopt when no field
	/// matches, mirroring FindFieldById -- the implementation is not visible here.
	Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldByName(
	std::string_view name, bool case_sensitive = true) const;

	/// \brief Recursively find the SchemaField by field id.
	///
	/// \param field_id The id of the field to find.
	/// \return The field with the given id, or std::nullopt if not found.
	Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
	int32_t field_id) const;

	/// \brief Returns the canonical field name for the given id.
	///
	/// \param field_id The id of the field to get the canonical name for.
	/// \return The canonical column name of the field with the given id, or std::nullopt if
	/// not found.
	Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;

	/// \brief Get the accessor to access the field by field id.
	///
	/// \param field_id The id of the field to get the accessor for.
	/// \return The accessor to access the field, or NotFound if the field is not found.
	Result<std::unique_ptr<StructLikeAccessor>> GetAccessorById(int32_t field_id) const;

	/// \brief Creates a projected schema from selected field names.
	///
	/// \param names Selected field names and nested names are dot-concatenated.
	/// \param case_sensitive Whether name matching is case-sensitive (default: true).
	/// \return Projected schema containing only selected fields.
	/// \note If the field name of a nested type has been selected, all of its
	/// sub-fields will be selected.
	Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
	bool case_sensitive = true) const;

	/// \brief Creates a projected schema from selected field IDs.
	///
	/// \param field_ids Set of field IDs to select
	/// \return Projected schema containing only the specified fields.
	/// \note Field ID of a nested field may not be projected unless at least
	/// one of its sub-fields has been projected.
	Result<std::unique_ptr<Schema>> Project(
	const std::unordered_set<int32_t>& field_ids) const;

	/// \brief Return the field IDs of the identifier fields.
	const std::vector<int32_t>& IdentifierFieldIds() const;

	/// \brief Return the canonical field names of the identifier fields.
	Result<std::vector<std::string>> IdentifierFieldNames() const;

	/// \brief Get the highest field ID in the schema.
	/// \return The highest field ID.
	Result<int32_t> HighestFieldId() const;

	/// \brief Checks whether this schema is equivalent to another schema while ignoring the
	/// schema id.
	bool SameSchema(const Schema& other) const;

	/// \brief Validate the schema for a given format version.
	///
	/// This validates that the schema does not contain types that were released in later
	/// format versions.
	///
	/// \param format_version The format version to validate against.
	/// \return Error status if the schema is invalid.
	Status Validate(int32_t format_version) const;

	/// \brief Equality operator; forwards to Equals().
	friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }

	private:
	/// \brief Compare two schemas for equality (the implementation behind operator==).
	bool Equals(const Schema& other) const;

	// Unique identifier of this schema; const, so it is fixed at construction.
	const int32_t schema_id_;
	// Field IDs that uniquely identify rows in the table.
	std::vector<int32_t> identifier_field_ids_;
	// Cache for schema mappings to facilitate fast lookups.
	std::unique_ptr<SchemaCache> cache_;
	};

	// Cache for schema mappings to facilitate fast lookups.
	class ICEBERG_EXPORT SchemaCache {
	public:
	/// \brief Create a cache for the given schema.
	///
	/// The constructor stores the pointer in schema_ and has an empty body.
	/// NOTE(review): only a raw pointer is kept, so the schema presumably has to outlive
	/// the cache -- confirm against the owner of this object.
	explicit SchemaCache(const Schema* schema) : schema_(schema) {}

	/// \brief Mapping from field ID to a reference to the corresponding SchemaField.
	using IdToFieldMap =
	std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
	/// \brief Const reference to an IdToFieldMap, as returned by GetIdToFieldMap().
	using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;

	/// \brief Pair of lookup tables between field names and field IDs.
	struct NameIdMap {
	/// \brief Mapping from canonical field name to ID
	///
	/// \note Short names for maps and lists are included for any name that does not
	/// conflict with a canonical name. For example, a list, 'l', of structs with field
	/// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
	std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;

	/// \brief Mapping from field ID to canonical name
	///
	/// \note Canonical names, but not short names are set, for example
	/// 'list.element.field' instead of 'list.field'.
	std::unordered_map<int32_t, std::string> id_to_name;
	};
	/// \brief Const reference to a NameIdMap, as returned by GetNameIdMap().
	using NameIdMapRef = std::reference_wrapper<const NameIdMap>;

	/// \brief Mapping from lowercased field name to field ID.
	using LowercaseNameToIdMap =
	std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
	/// \brief Const reference to a LowercaseNameToIdMap, as returned by
	/// GetLowercaseNameToIdMap().
	using LowercaseNameToIdMapRef = std::reference_wrapper<const LowercaseNameToIdMap>;

	/// \brief Mapping from field ID to the (nested) position path used to access the field.
	using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
	/// \brief Const reference to an IdToPositionPathMap, as returned by
	/// GetIdToPositionPathMap().
	using IdToPositionPathMapRef = std::reference_wrapper<const IdToPositionPathMap>;

	// Accessors for the cached values. Each returns a Result, so a failure is reported to
	// the caller instead of a value.
	// NOTE(review): the values live in the Lazy<Init*> members below, so they are
	// presumably computed on first access; Lazy is declared in iceberg/util/lazy.h and is
	// not visible here -- confirm.

	/// \brief Get the mapping from field ID to field.
	Result<IdToFieldMapRef> GetIdToFieldMap() const;
	/// \brief Get the name <-> ID lookup tables.
	Result<NameIdMapRef> GetNameIdMap() const;
	/// \brief Get the mapping from lowercased field name to field ID.
	Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
	/// \brief Get the mapping from field ID to position path.
	Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
	/// \brief Get the highest field ID in the schema.
	Result<int32_t> GetHighestFieldId() const;

	private:
	// Builders for the cached values; each one is the template argument of the matching
	// Lazy member below and receives the schema to inspect.
	static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
	static Result<NameIdMap> InitNameIdMap(const Schema* schema);
	static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema* schema);
	static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema* schema);
	static Result<int32_t> InitHighestFieldId(const Schema* schema);

	// Schema this cache describes (raw pointer passed to the constructor).
	const Schema* schema_;
	// Mapping from field id to field.
	Lazy<InitIdToFieldMap> id_to_field_;
	// Mapping from field name to field id.
	Lazy<InitNameIdMap> name_id_map_;
	// Mapping from lowercased field name to field id.
	Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
	// Mapping from field id to (nested) position path to access the field.
	Lazy<InitIdToPositionPath> id_to_position_path_;
	// Highest field ID in the schema.
	Lazy<InitHighestFieldId> highest_field_id_;
	};

	} // namespace iceberg