// blob: bcaccaa151994642dfac518262b899c5e5c23c27 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#pragma once
/// \file iceberg/schema.h
/// Schemas for Iceberg tables. This header contains the definition of Schema
/// and any utility functions. See iceberg/type.h and iceberg/schema_field.h as well.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <memory>
#include <optional>
#include <span>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "iceberg/iceberg_export.h"
#include "iceberg/result.h"
#include "iceberg/schema_field.h"
#include "iceberg/type.h"
#include "iceberg/util/lazy.h"
#include "iceberg/util/string_util.h"
namespace iceberg {
// Forward declaration: Schema holds a std::unique_ptr<SchemaCache> (see cache_); the
// full SchemaCache definition follows later in this header.
class SchemaCache;
/// \brief A schema for a Table.
///
/// A schema is a list of typed columns, along with a unique integer ID. A
/// Table may have different schemas over its lifetime due to schema
/// evolution.
///
/// Schema extends StructType with a schema ID, a list of identifier field IDs,
/// and a SchemaCache of lookup mappings.
class ICEBERG_EXPORT Schema : public StructType {
public:
/// \brief Schema ID used by the constructor when none is supplied.
static constexpr int32_t kInitialSchemaId = 0;
/// \brief Initial column ID.
/// NOTE(review): presumably the starting point for assigning column IDs; confirm
/// against usages.
static constexpr int32_t kInitialColumnId = 0;
/// \brief Column ID value that is not valid.
/// NOTE(review): presumably used as a "no column" sentinel; confirm against usages.
static constexpr int32_t kInvalidColumnId = -1;
/// \brief Special value to select all columns from manifest files.
static constexpr std::string_view kAllColumns = "*";
/// \brief Construct a schema from its fields and an optional schema ID.
///
/// \param fields The fields that make up the schema.
/// \param schema_id The unique identifier for this schema (default: kInitialSchemaId).
/// \note This constructor takes no identifier fields; the Make() overloads accept
/// them by ID or by name.
explicit Schema(std::vector<SchemaField> fields, int32_t schema_id = kInitialSchemaId);
/// \brief Create a schema with identifier fields given by field ID.
///
/// \param fields The fields that make up the schema.
/// \param schema_id The unique identifier for this schema. Unlike the constructor,
/// this parameter has no default and must be passed explicitly.
/// \param identifier_field_ids Field IDs that uniquely identify rows in the table.
/// \return A new Schema instance, or an error Result if creation failed.
static Result<std::unique_ptr<Schema>> Make(std::vector<SchemaField> fields,
int32_t schema_id,
std::vector<int32_t> identifier_field_ids);
/// \brief Create a schema with identifier fields given by canonical name.
///
/// \param fields The fields that make up the schema.
/// \param schema_id The unique identifier for this schema. Unlike the constructor,
/// this parameter has no default and must be passed explicitly.
/// \param identifier_field_names Canonical names of fields that uniquely identify rows
/// in the table.
/// \return A new Schema instance, or an error Result if creation failed.
static Result<std::unique_ptr<Schema>> Make(
std::vector<SchemaField> fields, int32_t schema_id,
const std::vector<std::string>& identifier_field_names);
/// \brief Validate that the identifier field with the given ID is valid for the schema
///
/// This method checks that the specified field ID represents a valid identifier field
/// according to Iceberg's identifier field requirements. It verifies that the field:
/// - exists in the schema
/// - is a primitive type
/// - is not optional (required field)
/// - is not a float or double type
/// - is not nested within optional or non-struct parent fields
///
/// \note Despite the plural name, each call validates a single field ID.
///
/// \param field_id The ID of the field to validate as an identifier field.
/// \param schema The schema containing the field to validate.
/// \param id_to_parent A mapping from field IDs to their parent field IDs for nested
/// field validation.
/// \return Status indicating success or failure of the validation.
static Status ValidateIdentifierFields(
int32_t field_id, const Schema& schema,
const std::unordered_map<int32_t, int32_t>& id_to_parent);
/// \brief Get an empty schema.
///
/// An empty schema has no fields and a schema ID of 0.
///
/// \return A const reference to a shared pointer holding the empty schema.
static const std::shared_ptr<Schema>& EmptySchema();
/// \brief Get the schema ID.
///
/// A schema is identified by a unique ID for the purposes of schema
/// evolution.
int32_t schema_id() const;
/// \brief Get a string representation of this schema (overrides the base-class
/// method).
std::string ToString() const override;
/// \brief Recursively find the SchemaField by field name.
///
/// Short names for maps and lists are included for any name that does not conflict with
/// a canonical name. For example, a list, 'l', of structs with field 'x' will produce
/// short name 'l.x' in addition to canonical name 'l.element.x'. A map 'm', if its
/// value includes a struct with field 'x' will produce short name 'm.x' in addition to
/// canonical name 'm.value.x'.
///
/// \param name The field name to look up.
/// \param case_sensitive Whether name matching is case-sensitive (default: true).
/// \return The matching field wrapped in an optional; presumably std::nullopt when no
/// field matches, mirroring FindFieldById — confirm in the implementation.
///
/// FIXME: Currently only handles ASCII lowercase conversion; extend to support
/// non-ASCII characters (e.g., using std::towlower or ICU)
Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldByName(
std::string_view name, bool case_sensitive = true) const;
/// \brief Recursively find the SchemaField by field id.
///
/// \param field_id The id of the field to find.
/// \return The field with the given id, or std::nullopt if not found.
Result<std::optional<std::reference_wrapper<const SchemaField>>> FindFieldById(
int32_t field_id) const;
/// \brief Returns the canonical field name for the given id.
///
/// \param field_id The id of the field to get the canonical name for.
/// \return The canonical column name of the field with the given id, or std::nullopt if
/// not found.
Result<std::optional<std::string_view>> FindColumnNameById(int32_t field_id) const;
/// \brief Get the accessor to access the field by field id.
///
/// \param field_id The id of the field to get the accessor for.
/// \return The accessor to access the field, or NotFound if the field is not found.
Result<std::unique_ptr<StructLikeAccessor>> GetAccessorById(int32_t field_id) const;
/// \brief Creates a projected schema from selected field names.
///
/// \param names Selected field names and nested names are dot-concatenated.
/// \param case_sensitive Whether name matching is case-sensitive (default: true).
/// \return Projected schema containing only selected fields.
/// \note If the field name of a nested type has been selected, all of its
/// sub-fields will be selected.
Result<std::unique_ptr<Schema>> Select(std::span<const std::string> names,
bool case_sensitive = true) const;
/// \brief Creates a projected schema from selected field IDs.
///
/// \param field_ids Set of field IDs to select
/// \return Projected schema containing only the specified fields.
/// \note Field ID of a nested field may not be projected unless at least
/// one of its sub-fields has been projected.
Result<std::unique_ptr<Schema>> Project(
const std::unordered_set<int32_t>& field_ids) const;
/// \brief Return the field IDs of the identifier fields.
const std::vector<int32_t>& IdentifierFieldIds() const;
/// \brief Return the canonical field names of the identifier fields.
Result<std::vector<std::string>> IdentifierFieldNames() const;
/// \brief Get the highest field ID in the schema.
/// \return The highest field ID.
Result<int32_t> HighestFieldId() const;
/// \brief Checks whether this schema is equivalent to another schema while ignoring the
/// schema id.
bool SameSchema(const Schema& other) const;
/// \brief Validate the schema for a given format version.
///
/// This validates that the schema does not contain types that were released in later
/// format versions.
///
/// \param format_version The format version to validate against.
/// \return Error status if the schema is invalid.
Status Validate(int32_t format_version) const;
/// \brief Equality operator; delegates to the private Equals().
friend bool operator==(const Schema& lhs, const Schema& rhs) { return lhs.Equals(rhs); }
private:
/// \brief Compare two schemas for equality.
/// NOTE(review): SameSchema() is documented as ignoring the schema id, which suggests
/// this comparison does take it into account — confirm in the implementation.
bool Equals(const Schema& other) const;
// Unique identifier of this schema; const, so it cannot be reassigned after
// construction.
const int32_t schema_id_;
// Field IDs that uniquely identify rows in the table.
std::vector<int32_t> identifier_field_ids_;
// Cache for schema mappings to facilitate fast lookups.
std::unique_ptr<SchemaCache> cache_;
};
/// \brief Cache for schema mappings to facilitate fast lookups.
///
/// Holds a pointer to a Schema plus one Lazy<> member per mapping; each Lazy<> is
/// parameterized on the matching static Init* function declared below.
/// NOTE(review): the Schema pointer is stored as-is and no ownership is visible in this
/// header, so the Schema presumably must outlive the cache — confirm.
class ICEBERG_EXPORT SchemaCache {
public:
/// \brief Construct a cache bound to the given schema.
///
/// \param schema Pointer to the schema whose mappings are cached; it is only stored
/// here, nothing is computed in the constructor body.
explicit SchemaCache(const Schema* schema) : schema_(schema) {}
/// \brief Mapping from field ID to a reference to the corresponding SchemaField.
using IdToFieldMap =
std::unordered_map<int32_t, std::reference_wrapper<const SchemaField>>;
/// \brief Reference wrapper around a const IdToFieldMap (return type of
/// GetIdToFieldMap).
using IdToFieldMapRef = std::reference_wrapper<const IdToFieldMap>;
/// \brief Pair of mappings between field names and field IDs.
struct NameIdMap {
/// \brief Mapping from canonical field name to ID
///
/// \note Short names for maps and lists are included for any name that does not
/// conflict with a canonical name. For example, a list, 'l', of structs with field
/// 'x' will produce short name 'l.x' in addition to canonical name 'l.element.x'.
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>> name_to_id;
/// \brief Mapping from field ID to canonical name
///
/// \note Canonical names, but not short names are set, for example
/// 'list.element.field' instead of 'list.field'.
std::unordered_map<int32_t, std::string> id_to_name;
};
/// \brief Reference wrapper around a const NameIdMap (return type of GetNameIdMap).
using NameIdMapRef = std::reference_wrapper<const NameIdMap>;
/// \brief Mapping from lowercased field name to field ID.
using LowercaseNameToIdMap =
std::unordered_map<std::string, int32_t, StringHash, std::equal_to<>>;
/// \brief Reference wrapper around a const LowercaseNameToIdMap.
using LowercaseNameToIdMapRef = std::reference_wrapper<const LowercaseNameToIdMap>;
/// \brief Mapping from field ID to the (nested) position path used to access the field.
using IdToPositionPathMap = std::unordered_map<int32_t, std::vector<size_t>>;
/// \brief Reference wrapper around a const IdToPositionPathMap.
using IdToPositionPathMapRef = std::reference_wrapper<const IdToPositionPathMap>;
// Accessors for the cached mappings. Each returns a Result wrapping a reference to the
// cached value (or the value itself for the highest field ID).
// NOTE(review): presumably each value is computed on first access through the
// corresponding Lazy<> member — confirm against iceberg/util/lazy.h.
/// \brief Get the mapping from field ID to field.
Result<IdToFieldMapRef> GetIdToFieldMap() const;
/// \brief Get the name <-> ID mappings.
Result<NameIdMapRef> GetNameIdMap() const;
/// \brief Get the mapping from lowercased field name to field ID.
Result<LowercaseNameToIdMapRef> GetLowercaseNameToIdMap() const;
/// \brief Get the mapping from field ID to position path.
Result<IdToPositionPathMapRef> GetIdToPositionPathMap() const;
/// \brief Get the highest field ID in the schema.
Result<int32_t> GetHighestFieldId() const;
private:
// Static initializers used as the template arguments of the Lazy<> members below.
// Each takes the schema and returns the value to cache, wrapped in a Result.
static Result<IdToFieldMap> InitIdToFieldMap(const Schema* schema);
static Result<NameIdMap> InitNameIdMap(const Schema* schema);
static Result<LowercaseNameToIdMap> InitLowerCaseNameToIdMap(const Schema* schema);
static Result<IdToPositionPathMap> InitIdToPositionPath(const Schema* schema);
static Result<int32_t> InitHighestFieldId(const Schema* schema);
// Schema this cache was constructed with (set once in the constructor).
const Schema* schema_;
// Mapping from field id to field.
Lazy<InitIdToFieldMap> id_to_field_;
// Mapping from field name to field id.
Lazy<InitNameIdMap> name_id_map_;
// Mapping from lowercased field name to field id.
Lazy<InitLowerCaseNameToIdMap> lowercase_name_to_id_;
// Mapping from field id to (nested) position path to access the field.
Lazy<InitIdToPositionPath> id_to_position_path_;
// Highest field ID in the schema.
Lazy<InitHighestFieldId> highest_field_id_;
};
} // namespace iceberg