// blob: 987046a27bd6ae48078346b8e22b739067d36a08 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iceberg;
import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.iceberg.relocated.com.google.common.base.Joiner;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.BiMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableBiMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.relocated.com.google.common.primitives.Ints;
import org.apache.iceberg.types.Type;
import org.apache.iceberg.types.TypeUtil;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.types.Types.NestedField;
import org.apache.iceberg.types.Types.StructType;
/**
 * The schema of a data table.
 * <p>
 * The schema ID is only populated when the schema is read from or written to table metadata;
 * in every other case it defaults to 0.
 */
public class Schema implements Serializable {
  private static final Joiner NEWLINE = Joiner.on('\n');
  private static final String ALL_COLUMNS = "*";
  private static final int DEFAULT_SCHEMA_ID = 0;

  private final StructType struct;
  private final int schemaId;
  private final int[] identifierFieldIds;
  private final int highestFieldId;

  // NOTE(review): aliasToId is transient and is only assigned in the constructor, so aliases do not appear
  // to be carried through Java serialization -- confirm that this is intended.
  private transient BiMap<String, Integer> aliasToId = null;

  // Indexes derived from struct and identifierFieldIds; each one is built on first use by its lazy* accessor.
  private transient Map<Integer, NestedField> idToField = null;
  private transient Map<String, Integer> nameToId = null;
  private transient Map<String, Integer> lowerCaseNameToId = null;
  private transient Map<Integer, Accessor<StructLike>> idToAccessor = null;
  private transient Map<Integer, String> idToName = null;
  private transient Set<Integer> identifierFieldIdSet = null;

  /**
   * Creates a schema with the default schema ID and no identifier fields.
   *
   * @param columns the top-level columns of the schema
   * @param aliases a map of column aliases to field ids, or null when there are no aliases
   */
  public Schema(List<NestedField> columns, Map<String, Integer> aliases) {
    this(columns, aliases, ImmutableSet.of());
  }

  /**
   * Creates a schema with the default schema ID.
   *
   * @param columns the top-level columns of the schema
   * @param aliases a map of column aliases to field ids, or null when there are no aliases
   * @param identifierFieldIds ids of the identifier fields, or null when there are none
   */
  public Schema(List<NestedField> columns, Map<String, Integer> aliases, Set<Integer> identifierFieldIds) {
    this(DEFAULT_SCHEMA_ID, columns, aliases, identifierFieldIds);
  }

  /**
   * Creates a schema with the default schema ID, no aliases and no identifier fields.
   *
   * @param columns the top-level columns of the schema
   */
  public Schema(List<NestedField> columns) {
    this(columns, ImmutableSet.of());
  }

  /**
   * Creates a schema with the default schema ID and no aliases.
   *
   * @param columns the top-level columns of the schema
   * @param identifierFieldIds ids of the identifier fields, or null when there are none
   */
  public Schema(List<NestedField> columns, Set<Integer> identifierFieldIds) {
    this(DEFAULT_SCHEMA_ID, columns, identifierFieldIds);
  }

  /**
   * Creates a schema with the given schema ID, no aliases and no identifier fields.
   *
   * @param schemaId the ID of the schema
   * @param columns the top-level columns of the schema
   */
  public Schema(int schemaId, List<NestedField> columns) {
    this(schemaId, columns, ImmutableSet.of());
  }

  /**
   * Creates a schema with the given schema ID and no aliases.
   *
   * @param schemaId the ID of the schema
   * @param columns the top-level columns of the schema
   * @param identifierFieldIds ids of the identifier fields, or null when there are none
   */
  public Schema(int schemaId, List<NestedField> columns, Set<Integer> identifierFieldIds) {
    this(schemaId, columns, null, identifierFieldIds);
  }

  /**
   * Creates a schema from all of its parts.
   * <p>
   * Every id in {@code identifierFieldIds} is checked with
   * {@link #validateIdentifierField(int, Map, Map)} before it is stored.
   *
   * @param schemaId the ID of the schema
   * @param columns the top-level columns of the schema
   * @param aliases a map of column aliases to field ids, or null when there are no aliases
   * @param identifierFieldIds ids of the identifier fields, or null when there are none
   */
  public Schema(int schemaId, List<NestedField> columns, Map<String, Integer> aliases,
                Set<Integer> identifierFieldIds) {
    this.schemaId = schemaId;
    this.struct = StructType.of(columns);

    if (aliases == null) {
      this.aliasToId = null;
    } else {
      this.aliasToId = ImmutableBiMap.copyOf(aliases);
    }

    if (identifierFieldIds == null) {
      this.identifierFieldIds = new int[0];
    } else {
      // validate every identifier field before the ids are stored
      Map<Integer, Integer> parentIndex = TypeUtil.indexParents(struct);
      for (Integer id : identifierFieldIds) {
        validateIdentifierField(id, lazyIdToField(), parentIndex);
      }
      this.identifierFieldIds = Ints.toArray(identifierFieldIds);
    }

    // a schema without any field reports 0 as its highest field id
    this.highestFieldId = lazyIdToName().keySet().stream()
        .mapToInt(Integer::intValue)
        .max()
        .orElse(0);
  }

  /**
   * Creates a schema with the default schema ID, no aliases and no identifier fields.
   *
   * @param columns the top-level columns of the schema
   */
  public Schema(NestedField... columns) {
    this(DEFAULT_SCHEMA_ID, Arrays.asList(columns));
  }

  /**
   * Creates a schema with the given schema ID, no aliases and no identifier fields.
   *
   * @param schemaId the ID of the schema
   * @param columns the top-level columns of the schema
   */
  public Schema(int schemaId, NestedField... columns) {
    this(schemaId, Arrays.asList(columns));
  }

  /**
   * Checks that a field can be used as an identifier field.
   * <p>
   * The field must exist, be of a primitive type other than float or double, be required, and every
   * ancestor of the field must be a required struct field.
   *
   * @param fieldId the id of the candidate identifier field
   * @param idToField an index of all fields by id
   * @param idToParent an index from field id to the id of its parent field
   * @throws IllegalArgumentException if any of the conditions is not met
   */
  static void validateIdentifierField(int fieldId, Map<Integer, Types.NestedField> idToField,
                                      Map<Integer, Integer> idToParent) {
    Types.NestedField field = idToField.get(fieldId);
    Preconditions.checkArgument(field != null,
        "Cannot add fieldId %s as an identifier field: field does not exist", fieldId);
    Preconditions.checkArgument(field.type().isPrimitiveType(),
        "Cannot add field %s as an identifier field: not a primitive type field", field.name());
    Preconditions.checkArgument(field.isRequired(),
        "Cannot add field %s as an identifier field: not a required field", field.name());
    Preconditions.checkArgument(
        !(Types.DoubleType.get().equals(field.type()) || Types.FloatType.get().equals(field.type())),
        "Cannot add field %s as an identifier field: must not be float or double field", field.name());

    // Collect the ancestors so that the outermost one ends up at the head of the deque: checking from the
    // root down gives a better error message for fields nested in list and map types.
    Deque<Integer> ancestorIds = Lists.newLinkedList();
    for (Integer ancestorId = idToParent.get(field.fieldId());
         ancestorId != null;
         ancestorId = idToParent.get(ancestorId)) {
      ancestorIds.push(ancestorId);
    }

    for (Integer ancestorId : ancestorIds) {
      Types.NestedField ancestor = idToField.get(ancestorId);
      Preconditions.checkArgument(ancestor.type().isStructType(),
          "Cannot add field %s as an identifier field: must not be nested in %s", field.name(), ancestor);
      Preconditions.checkArgument(ancestor.isRequired(),
          "Cannot add field %s as an identifier field: must not be nested in an optional field %s",
          field.name(), ancestor);
    }
  }

  /** Returns the index of fields by id, building it from the struct on first use. */
  private Map<Integer, NestedField> lazyIdToField() {
    if (idToField != null) {
      return idToField;
    }

    this.idToField = TypeUtil.indexById(struct);
    return idToField;
  }

  /** Returns the index of field ids by name, building it from the struct on first use. */
  private Map<String, Integer> lazyNameToId() {
    if (nameToId != null) {
      return nameToId;
    }

    this.nameToId = ImmutableMap.copyOf(TypeUtil.indexByName(struct));
    return nameToId;
  }

  /** Returns the index of names by field id, building it from the struct on first use. */
  private Map<Integer, String> lazyIdToName() {
    if (idToName != null) {
      return idToName;
    }

    this.idToName = ImmutableMap.copyOf(TypeUtil.indexNameById(struct));
    return idToName;
  }

  /** Returns the index of field ids by lower-case name, building it from the struct on first use. */
  private Map<String, Integer> lazyLowerCaseNameToId() {
    if (lowerCaseNameToId != null) {
      return lowerCaseNameToId;
    }

    this.lowerCaseNameToId = ImmutableMap.copyOf(TypeUtil.indexByLowerCaseName(struct));
    return lowerCaseNameToId;
  }

  /** Returns the index of accessors by field id, building it for this schema on first use. */
  private Map<Integer, Accessor<StructLike>> lazyIdToAccessor() {
    if (idToAccessor != null) {
      return idToAccessor;
    }

    idToAccessor = Accessors.forSchema(this);
    return idToAccessor;
  }

  /** Returns the identifier field ids as an immutable set, building it from the id array on first use. */
  private Set<Integer> lazyIdentifierFieldIdSet() {
    if (identifierFieldIdSet != null) {
      return identifierFieldIdSet;
    }

    identifierFieldIdSet = ImmutableSet.copyOf(Ints.asList(identifierFieldIds));
    return identifierFieldIdSet;
  }

  /**
   * Returns the schema ID for this schema.
   * <p>
   * Note that the schema ID is only populated when the schema is read from or written to table metadata;
   * in every other case it defaults to 0.
   */
  public int schemaId() {
    return this.schemaId;
  }

  /**
   * Returns the highest field ID in this schema, including nested fields.
   */
  public int highestFieldId() {
    return highestFieldId;
  }

  /**
   * Returns an alias map for this schema, if set.
   * <p>
   * Alias maps are created when translating an external schema, like an Avro Schema, to this
   * format. The original column names can be provided in a Map when constructing this Schema.
   *
   * @return a Map of column aliases to field ids, or null if no aliases are set
   */
  public Map<String, Integer> getAliases() {
    return aliasToId;
  }

  /**
   * Returns the underlying {@link StructType struct type} for this schema.
   *
   * @return the StructType version of this schema.
   */
  public StructType asStruct() {
    return struct;
  }

  /**
   * Returns a List of the {@link NestedField columns} in this Schema.
   */
  public List<NestedField> columns() {
    return struct.fields();
  }

  /**
   * The set of identifier field IDs.
   * <p>
   * Identifier is a concept similar to primary key in a relational database system.
   * It consists of a unique set of primitive fields in the schema.
   * An identifier field must be at root, or nested in a chain of structs (no maps or lists).
   * A row should be unique in a table based on the values of the identifier fields.
   * Optional, float and double columns cannot be used as identifier fields.
   * However, Iceberg identifier differs from primary key in the following ways:
   * <ul>
   * <li>Iceberg does not enforce the uniqueness of a row based on this identifier information.
   * It is used for operations like upsert to define the default upsert key.</li>
   * <li>A nested field in a struct can be used as an identifier. For example, if there is a "last_name" field
   * inside a "user" struct in a schema, field "user.last_name" can be set as a part of the identifier field.</li>
   * </ul>
   *
   * @return the set of identifier field IDs in this schema.
   */
  public Set<Integer> identifierFieldIds() {
    return lazyIdentifierFieldIdSet();
  }

  /**
   * Returns the set of identifier field names.
   */
  public Set<String> identifierFieldNames() {
    Set<Integer> ids = identifierFieldIds();
    return ids.stream()
        .map(id -> lazyIdToName().get(id))
        .collect(Collectors.toSet());
  }

  /**
   * Returns the {@link Type} of a sub-field identified by the field name.
   *
   * @param name a field name
   * @return a Type for the sub-field or null if it is not found
   */
  public Type findType(String name) {
    Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)");
    Integer id = lazyNameToId().get(name);
    // a null id means that the name could not be found
    return id == null ? null : findType(id);
  }

  /**
   * Returns the {@link Type} of a sub-field identified by the field id.
   *
   * @param id a field id
   * @return a Type for the sub-field or null if it is not found
   */
  public Type findType(int id) {
    NestedField field = lazyIdToField().get(id);
    return field != null ? field.type() : null;
  }

  /**
   * Returns the sub-field identified by the field id as a {@link NestedField}.
   *
   * @param id a field id
   * @return the sub-field or null if it is not found
   */
  public NestedField findField(int id) {
    return lazyIdToField().get(id);
  }

  /**
   * Returns a sub-field by name as a {@link NestedField}.
   * <p>
   * The result may be a top-level or a nested field.
   *
   * @param name a String name
   * @return the sub-field or null if it is not found
   */
  public NestedField findField(String name) {
    Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)");
    Integer id = lazyNameToId().get(name);
    return id == null ? null : lazyIdToField().get(id);
  }

  /**
   * Returns a sub-field by name as a {@link NestedField}, ignoring the case of the name.
   * <p>
   * The result may be a top-level or a nested field.
   *
   * @param name a String name
   * @return the sub-field or null if it is not found
   */
  public NestedField caseInsensitiveFindField(String name) {
    Preconditions.checkArgument(!name.isEmpty(), "Invalid column name: (empty)");
    String lowerCaseName = name.toLowerCase(Locale.ROOT);
    Integer id = lazyLowerCaseNameToId().get(lowerCaseName);
    return id == null ? null : lazyIdToField().get(id);
  }

  /**
   * Returns the full column name for the given id.
   *
   * @param id a field id
   * @return the full column name in this schema that resolves to the id
   */
  public String findColumnName(int id) {
    return lazyIdToName().get(id);
  }

  /**
   * Returns the column id for the given column alias. Column aliases are set
   * by conversions from Parquet or Avro to this Schema type.
   *
   * @param alias a full column name in the unconverted data schema
   * @return the column id in this schema, or null if the column wasn't found
   */
  public Integer aliasToId(String alias) {
    if (aliasToId == null) {
      return null;
    }

    return aliasToId.get(alias);
  }

  /**
   * Returns the full column name in the unconverted data schema for the given column id.
   * Column aliases are set by conversions from Parquet or Avro to this Schema type.
   *
   * @param fieldId a column id in this schema
   * @return the full column name in the unconverted data schema, or null if one wasn't found
   */
  public String idToAlias(Integer fieldId) {
    if (aliasToId == null) {
      return null;
    }

    return aliasToId.inverse().get(fieldId);
  }

  /**
   * Returns an accessor for retrieving the data from {@link StructLike}.
   * <p>
   * Accessors do not retrieve data contained in lists or maps.
   *
   * @param id a column id in this schema
   * @return an {@link Accessor} to retrieve values from a {@link StructLike} row
   */
  public Accessor<StructLike> accessorForField(int id) {
    return lazyIdToAccessor().get(id);
  }

  /**
   * Creates a projection schema for a subset of columns, selected by name.
   * <p>
   * Names that identify nested fields will select part or all of the field's top-level column.
   *
   * @param names String names for selected columns
   * @return a projection schema from this schema, by name
   */
  public Schema select(String... names) {
    return select(Arrays.asList(names));
  }

  /**
   * Creates a projection schema for a subset of columns, selected by name.
   * <p>
   * Names that identify nested fields will select part or all of the field's top-level column.
   *
   * @param names a Collection of String names for selected columns
   * @return a projection schema from this schema, by name
   */
  public Schema select(Collection<String> names) {
    return internalSelect(names, true);
  }

  /**
   * Creates a projection schema for a subset of columns, selected by case insensitive names.
   * <p>
   * Names that identify nested fields will select part or all of the field's top-level column.
   *
   * @param names String names for selected columns
   * @return a projection schema from this schema, by names
   */
  public Schema caseInsensitiveSelect(String... names) {
    return caseInsensitiveSelect(Arrays.asList(names));
  }

  /**
   * Creates a projection schema for a subset of columns, selected by case insensitive names.
   * <p>
   * Names that identify nested fields will select part or all of the field's top-level column.
   *
   * @param names a Collection of String names for selected columns
   * @return a projection schema from this schema, by names
   */
  public Schema caseInsensitiveSelect(Collection<String> names) {
    return internalSelect(names, false);
  }

  /**
   * Checks whether this schema is equivalent to another schema while ignoring the schema ID.
   * <p>
   * Two schemas are equivalent when their struct types are equal and their identifier field ids are equal.
   *
   * @param anotherSchema another schema
   * @return true if this schema is equivalent to the given schema
   */
  public boolean sameSchema(Schema anotherSchema) {
    if (!asStruct().equals(anotherSchema.asStruct())) {
      return false;
    }

    return identifierFieldIds().equals(anotherSchema.identifierFieldIds());
  }

  /**
   * Resolves the given names to field ids and projects this schema to the selected ids.
   * <p>
   * This schema is returned as is when the names contain {@code "*"}. Names that do not resolve to a
   * field are ignored.
   *
   * @param names names of the columns to select
   * @param caseSensitive whether the names are matched case sensitively
   * @return the projection schema
   */
  private Schema internalSelect(Collection<String> names, boolean caseSensitive) {
    if (names.contains(ALL_COLUMNS)) {
      return this;
    }

    Set<Integer> selectedIds = Sets.newHashSet();
    for (String columnName : names) {
      Integer fieldId = caseSensitive ?
          lazyNameToId().get(columnName) :
          lazyLowerCaseNameToId().get(columnName.toLowerCase(Locale.ROOT));

      if (fieldId != null) {
        selectedIds.add(fieldId);
      }
    }

    return TypeUtil.select(this, selectedIds);
  }

  /** Renders one column for {@link #toString()}, marking identifier fields with an "(id)" suffix. */
  private String identifierFieldToString(Types.NestedField field) {
    String marker = identifierFieldIds().contains(field.fieldId()) ? " (id)" : "";
    return " " + field + marker;
  }

  /**
   * Returns a multi-line description of this schema that lists every top-level column on its own line.
   */
  @Override
  public String toString() {
    List<String> rows = Lists.newArrayList();
    for (NestedField column : struct.fields()) {
      rows.add(identifierFieldToString(column));
    }

    return String.format("table {\n%s\n}", NEWLINE.join(rows));
  }
}