blob: 469c930d31c76cea12fd68417ea794b33d657a9d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::error::ArrowError;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::sync::Arc;
use crate::datatype::DataType;
#[cfg(feature = "canonical_extension_types")]
use crate::extension::CanonicalExtensionType;
use crate::schema::SchemaBuilder;
use crate::{
extension::{ExtensionType, EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
Fields, UnionFields, UnionMode,
};
/// A reference counted [`Field`]
pub type FieldRef = Arc<Field>;
/// Describes a single column in a [`Schema`](super::Schema).
///
/// A [`Schema`](super::Schema) is an ordered collection of
/// [`Field`] objects. Fields contain:
/// * `name`: the name of the field
/// * `data_type`: the type of the field
/// * `nullable`: if the field is nullable
/// * `metadata`: a map of key-value pairs containing additional custom metadata
///
/// Arrow Extension types, are encoded in `Field`s metadata. See
/// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Field {
name: String,
data_type: DataType,
nullable: bool,
#[deprecated(
since = "54.0.0",
note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
)]
dict_id: i64,
dict_is_ordered: bool,
/// A map of key-value pairs containing additional custom meta data.
metadata: HashMap<String, String>,
}
// Auto-derive `PartialEq` traits will pull `dict_id` and `dict_is_ordered`
// into comparison. However, these properties are only used in IPC context
// for matching dictionary encoded data. They are not necessary to be same
// to consider schema equality. For example, in C++ `Field` implementation,
// it doesn't contain these dictionary properties too.
impl PartialEq for Field {
fn eq(&self, other: &Self) -> bool {
self.name == other.name
&& self.data_type == other.data_type
&& self.nullable == other.nullable
&& self.metadata == other.metadata
}
}
impl Eq for Field {}
impl PartialOrd for Field {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Field {
fn cmp(&self, other: &Self) -> Ordering {
self.name
.cmp(other.name())
.then_with(|| self.data_type.cmp(other.data_type()))
.then_with(|| self.nullable.cmp(&other.nullable))
.then_with(|| {
// ensure deterministic key order
let mut keys: Vec<&String> =
self.metadata.keys().chain(other.metadata.keys()).collect();
keys.sort();
for k in keys {
match (self.metadata.get(k), other.metadata.get(k)) {
(None, None) => {}
(Some(_), None) => {
return Ordering::Less;
}
(None, Some(_)) => {
return Ordering::Greater;
}
(Some(v1), Some(v2)) => match v1.cmp(v2) {
Ordering::Equal => {}
other => {
return other;
}
},
}
}
Ordering::Equal
})
}
}
impl Hash for Field {
fn hash<H: Hasher>(&self, state: &mut H) {
self.name.hash(state);
self.data_type.hash(state);
self.nullable.hash(state);
// ensure deterministic key order
let mut keys: Vec<&String> = self.metadata.keys().collect();
keys.sort();
for k in keys {
k.hash(state);
self.metadata.get(k).expect("key valid").hash(state);
}
}
}
impl Field {
/// Default list member field name
pub const LIST_FIELD_DEFAULT_NAME: &'static str = "item";
/// Creates a new field with the given name, data type, and nullability
///
/// # Example
/// ```
/// # use arrow_schema::{Field, DataType};
/// Field::new("field_name", DataType::Int32, true);
/// ```
pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self {
#[allow(deprecated)]
Field {
name: name.into(),
data_type,
nullable,
dict_id: 0,
dict_is_ordered: false,
metadata: HashMap::default(),
}
}
/// Creates a new `Field` suitable for [`DataType::List`] and
/// [`DataType::LargeList`]
///
/// While not required, this method follows the convention of naming the
/// `Field` `"item"`.
///
/// # Example
/// ```
/// # use arrow_schema::{Field, DataType};
/// assert_eq!(
/// Field::new("item", DataType::Int32, true),
/// Field::new_list_field(DataType::Int32, true)
/// );
/// ```
pub fn new_list_field(data_type: DataType, nullable: bool) -> Self {
Self::new(Self::LIST_FIELD_DEFAULT_NAME, data_type, nullable)
}
/// Creates a new field that has additional dictionary information
#[deprecated(
since = "54.0.0",
note = "The ability to preserve dictionary IDs will be removed. With the dict_id field disappearing this function signature will change by removing the dict_id parameter."
)]
pub fn new_dict(
name: impl Into<String>,
data_type: DataType,
nullable: bool,
dict_id: i64,
dict_is_ordered: bool,
) -> Self {
#[allow(deprecated)]
Field {
name: name.into(),
data_type,
nullable,
dict_id,
dict_is_ordered,
metadata: HashMap::default(),
}
}
/// Create a new [`Field`] with [`DataType::Dictionary`]
///
/// Use [`Self::new_dict`] for more advanced dictionary options
///
/// # Panics
///
/// Panics if [`!key.is_dictionary_key_type`][DataType::is_dictionary_key_type]
pub fn new_dictionary(
name: impl Into<String>,
key: DataType,
value: DataType,
nullable: bool,
) -> Self {
assert!(
key.is_dictionary_key_type(),
"{key} is not a valid dictionary key"
);
let data_type = DataType::Dictionary(Box::new(key), Box::new(value));
Self::new(name, data_type, nullable)
}
/// Create a new [`Field`] with [`DataType::Struct`]
///
/// - `name`: the name of the [`DataType::Struct`] field
/// - `fields`: the description of each struct element
/// - `nullable`: if the [`DataType::Struct`] array is nullable
pub fn new_struct(name: impl Into<String>, fields: impl Into<Fields>, nullable: bool) -> Self {
Self::new(name, DataType::Struct(fields.into()), nullable)
}
/// Create a new [`Field`] with [`DataType::List`]
///
/// - `name`: the name of the [`DataType::List`] field
/// - `value`: the description of each list element
/// - `nullable`: if the [`DataType::List`] array is nullable
pub fn new_list(name: impl Into<String>, value: impl Into<FieldRef>, nullable: bool) -> Self {
Self::new(name, DataType::List(value.into()), nullable)
}
/// Create a new [`Field`] with [`DataType::LargeList`]
///
/// - `name`: the name of the [`DataType::LargeList`] field
/// - `value`: the description of each list element
/// - `nullable`: if the [`DataType::LargeList`] array is nullable
pub fn new_large_list(
name: impl Into<String>,
value: impl Into<FieldRef>,
nullable: bool,
) -> Self {
Self::new(name, DataType::LargeList(value.into()), nullable)
}
/// Create a new [`Field`] with [`DataType::FixedSizeList`]
///
/// - `name`: the name of the [`DataType::FixedSizeList`] field
/// - `value`: the description of each list element
/// - `size`: the size of the fixed size list
/// - `nullable`: if the [`DataType::FixedSizeList`] array is nullable
pub fn new_fixed_size_list(
name: impl Into<String>,
value: impl Into<FieldRef>,
size: i32,
nullable: bool,
) -> Self {
Self::new(name, DataType::FixedSizeList(value.into(), size), nullable)
}
/// Create a new [`Field`] with [`DataType::Map`]
///
/// - `name`: the name of the [`DataType::Map`] field
/// - `entries`: the name of the inner [`DataType::Struct`] field
/// - `keys`: the map keys
/// - `values`: the map values
/// - `sorted`: if the [`DataType::Map`] array is sorted
/// - `nullable`: if the [`DataType::Map`] array is nullable
pub fn new_map(
name: impl Into<String>,
entries: impl Into<String>,
keys: impl Into<FieldRef>,
values: impl Into<FieldRef>,
sorted: bool,
nullable: bool,
) -> Self {
let data_type = DataType::Map(
Arc::new(Field::new(
entries.into(),
DataType::Struct(Fields::from([keys.into(), values.into()])),
false, // The inner map field is always non-nullable (#1697),
)),
sorted,
);
Self::new(name, data_type, nullable)
}
/// Create a new [`Field`] with [`DataType::Union`]
///
/// - `name`: the name of the [`DataType::Union`] field
/// - `type_ids`: the union type ids
/// - `fields`: the union fields
/// - `mode`: the union mode
pub fn new_union<S, F, T>(name: S, type_ids: T, fields: F, mode: UnionMode) -> Self
where
S: Into<String>,
F: IntoIterator,
F::Item: Into<FieldRef>,
T: IntoIterator<Item = i8>,
{
Self::new(
name,
DataType::Union(UnionFields::new(type_ids, fields), mode),
false, // Unions cannot be nullable
)
}
/// Sets the `Field`'s optional custom metadata.
#[inline]
pub fn set_metadata(&mut self, metadata: HashMap<String, String>) {
self.metadata = metadata;
}
/// Sets the metadata of this `Field` to be `metadata` and returns self
pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
self.set_metadata(metadata);
self
}
/// Returns the immutable reference to the `Field`'s optional custom metadata.
#[inline]
pub const fn metadata(&self) -> &HashMap<String, String> {
&self.metadata
}
/// Returns a mutable reference to the `Field`'s optional custom metadata.
#[inline]
pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> {
&mut self.metadata
}
/// Returns an immutable reference to the `Field`'s name.
#[inline]
pub const fn name(&self) -> &String {
&self.name
}
/// Set the name of this [`Field`]
#[inline]
pub fn set_name(&mut self, name: impl Into<String>) {
self.name = name.into();
}
/// Set the name of the [`Field`] and returns self.
///
/// ```
/// # use arrow_schema::*;
/// let field = Field::new("c1", DataType::Int64, false)
/// .with_name("c2");
///
/// assert_eq!(field.name(), "c2");
/// ```
pub fn with_name(mut self, name: impl Into<String>) -> Self {
self.set_name(name);
self
}
/// Returns an immutable reference to the [`Field`]'s [`DataType`].
#[inline]
pub const fn data_type(&self) -> &DataType {
&self.data_type
}
/// Set [`DataType`] of the [`Field`]
///
/// ```
/// # use arrow_schema::*;
/// let mut field = Field::new("c1", DataType::Int64, false);
/// field.set_data_type(DataType::Utf8);
///
/// assert_eq!(field.data_type(), &DataType::Utf8);
/// ```
#[inline]
pub fn set_data_type(&mut self, data_type: DataType) {
self.data_type = data_type;
}
/// Set [`DataType`] of the [`Field`] and returns self.
///
/// ```
/// # use arrow_schema::*;
/// let field = Field::new("c1", DataType::Int64, false)
/// .with_data_type(DataType::Utf8);
///
/// assert_eq!(field.data_type(), &DataType::Utf8);
/// ```
pub fn with_data_type(mut self, data_type: DataType) -> Self {
self.set_data_type(data_type);
self
}
/// Returns the extension type name of this [`Field`], if set.
///
/// This returns the value of [`EXTENSION_TYPE_NAME_KEY`], if set in
/// [`Field::metadata`]. If the key is missing, there is no extension type
/// name and this returns `None`.
///
/// # Example
///
/// ```
/// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_NAME_KEY, Field};
///
/// let field = Field::new("", DataType::Null, false);
/// assert_eq!(field.extension_type_name(), None);
///
/// let field = Field::new("", DataType::Null, false).with_metadata(
/// [(EXTENSION_TYPE_NAME_KEY.to_owned(), "example".to_owned())]
/// .into_iter()
/// .collect(),
/// );
/// assert_eq!(field.extension_type_name(), Some("example"));
/// ```
pub fn extension_type_name(&self) -> Option<&str> {
self.metadata()
.get(EXTENSION_TYPE_NAME_KEY)
.map(String::as_ref)
}
/// Returns the extension type metadata of this [`Field`], if set.
///
/// This returns the value of [`EXTENSION_TYPE_METADATA_KEY`], if set in
/// [`Field::metadata`]. If the key is missing, there is no extension type
/// metadata and this returns `None`.
///
/// # Example
///
/// ```
/// # use arrow_schema::{DataType, extension::EXTENSION_TYPE_METADATA_KEY, Field};
///
/// let field = Field::new("", DataType::Null, false);
/// assert_eq!(field.extension_type_metadata(), None);
///
/// let field = Field::new("", DataType::Null, false).with_metadata(
/// [(EXTENSION_TYPE_METADATA_KEY.to_owned(), "example".to_owned())]
/// .into_iter()
/// .collect(),
/// );
/// assert_eq!(field.extension_type_metadata(), Some("example"));
/// ```
pub fn extension_type_metadata(&self) -> Option<&str> {
self.metadata()
.get(EXTENSION_TYPE_METADATA_KEY)
.map(String::as_ref)
}
/// Returns an instance of the given [`ExtensionType`] of this [`Field`],
/// if set in the [`Field::metadata`].
///
/// # Error
///
/// Returns an error if
/// - this field does not have the name of this extension type
/// ([`ExtensionType::NAME`]) in the [`Field::metadata`] (mismatch or
/// missing)
/// - the deserialization of the metadata
/// ([`ExtensionType::deserialize_metadata`]) fails
/// - the construction of the extension type ([`ExtensionType::try_new`])
/// fail (for example when the [`Field::data_type`] is not supported by
/// the extension type ([`ExtensionType::supports_data_type`]))
pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> {
// Check the extension name in the metadata
match self.extension_type_name() {
// It should match the name of the given extension type
Some(name) if name == E::NAME => {
// Deserialize the metadata and try to construct the extension
// type
E::deserialize_metadata(self.extension_type_metadata())
.and_then(|metadata| E::try_new(self.data_type(), metadata))
}
// Name mismatch
Some(name) => Err(ArrowError::InvalidArgumentError(format!(
"Field extension type name mismatch, expected {}, found {name}",
E::NAME
))),
// Name missing
None => Err(ArrowError::InvalidArgumentError(
"Field extension type name missing".to_owned(),
)),
}
}
/// Returns an instance of the given [`ExtensionType`] of this [`Field`],
/// panics if this [`Field`] does not have this extension type.
///
/// # Panic
///
/// This calls [`Field::try_extension_type`] and panics when it returns an
/// error.
pub fn extension_type<E: ExtensionType>(&self) -> E {
self.try_extension_type::<E>()
.unwrap_or_else(|e| panic!("{e}"))
}
/// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
/// and [`ExtensionType::metadata`] of the given [`ExtensionType`], if the
/// given extension type supports the [`Field::data_type`] of this field
/// ([`ExtensionType::supports_data_type`]).
///
/// If the given extension type defines no metadata, a previously set
/// value of [`EXTENSION_TYPE_METADATA_KEY`] is cleared.
///
/// # Error
///
/// This functions returns an error if the data type of this field does not
/// match any of the supported storage types of the given extension type.
pub fn try_with_extension_type<E: ExtensionType>(
&mut self,
extension_type: E,
) -> Result<(), ArrowError> {
// Make sure the data type of this field is supported
extension_type.supports_data_type(&self.data_type)?;
self.metadata
.insert(EXTENSION_TYPE_NAME_KEY.to_owned(), E::NAME.to_owned());
match extension_type.serialize_metadata() {
Some(metadata) => self
.metadata
.insert(EXTENSION_TYPE_METADATA_KEY.to_owned(), metadata),
// If this extension type has no metadata, we make sure to
// clear previously set metadata.
None => self.metadata.remove(EXTENSION_TYPE_METADATA_KEY),
};
Ok(())
}
/// Updates the metadata of this [`Field`] with the [`ExtensionType::NAME`]
/// and [`ExtensionType::metadata`] of the given [`ExtensionType`].
///
/// # Panics
///
/// This calls [`Field::try_with_extension_type`] and panics when it
/// returns an error.
pub fn with_extension_type<E: ExtensionType>(mut self, extension_type: E) -> Self {
self.try_with_extension_type(extension_type)
.unwrap_or_else(|e| panic!("{e}"));
self
}
/// Returns the [`CanonicalExtensionType`] of this [`Field`], if set.
///
/// # Error
///
/// Returns an error if
/// - this field does have a canonical extension type (mismatch or missing)
/// - the canonical extension is not supported
/// - the construction of the extension type fails
#[cfg(feature = "canonical_extension_types")]
pub fn try_canonical_extension_type(&self) -> Result<CanonicalExtensionType, ArrowError> {
CanonicalExtensionType::try_from(self)
}
/// Indicates whether this [`Field`] supports null values.
///
/// If true, the field *may* contain null values.
#[inline]
pub const fn is_nullable(&self) -> bool {
self.nullable
}
/// Set the `nullable` of this [`Field`].
///
/// ```
/// # use arrow_schema::*;
/// let mut field = Field::new("c1", DataType::Int64, false);
/// field.set_nullable(true);
///
/// assert_eq!(field.is_nullable(), true);
/// ```
#[inline]
pub fn set_nullable(&mut self, nullable: bool) {
self.nullable = nullable;
}
/// Set `nullable` of the [`Field`] and returns self.
///
/// ```
/// # use arrow_schema::*;
/// let field = Field::new("c1", DataType::Int64, false)
/// .with_nullable(true);
///
/// assert_eq!(field.is_nullable(), true);
/// ```
pub fn with_nullable(mut self, nullable: bool) -> Self {
self.set_nullable(nullable);
self
}
/// Returns a (flattened) [`Vec`] containing all child [`Field`]s
/// within `self` contained within this field (including `self`)
pub(crate) fn fields(&self) -> Vec<&Field> {
let mut collected_fields = vec![self];
collected_fields.append(&mut Field::_fields(&self.data_type));
collected_fields
}
fn _fields(dt: &DataType) -> Vec<&Field> {
match dt {
DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(),
DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| f.fields()).collect(),
DataType::List(field)
| DataType::LargeList(field)
| DataType::FixedSizeList(field, _)
| DataType::Map(field, _) => field.fields(),
DataType::Dictionary(_, value_field) => Field::_fields(value_field.as_ref()),
DataType::RunEndEncoded(_, field) => field.fields(),
_ => vec![],
}
}
/// Returns a vector containing all (potentially nested) `Field` instances selected by the
/// dictionary ID they use
#[inline]
#[deprecated(
since = "54.0.0",
note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
)]
pub(crate) fn fields_with_dict_id(&self, id: i64) -> Vec<&Field> {
self.fields()
.into_iter()
.filter(|&field| {
#[allow(deprecated)]
let matching_dict_id = field.dict_id == id;
matches!(field.data_type(), DataType::Dictionary(_, _)) && matching_dict_id
})
.collect()
}
/// Returns the dictionary ID, if this is a dictionary type.
#[inline]
#[deprecated(
since = "54.0.0",
note = "The ability to preserve dictionary IDs will be removed. With it, all fields related to it."
)]
pub const fn dict_id(&self) -> Option<i64> {
match self.data_type {
#[allow(deprecated)]
DataType::Dictionary(_, _) => Some(self.dict_id),
_ => None,
}
}
/// Returns whether this `Field`'s dictionary is ordered, if this is a dictionary type.
///
/// # Example
/// ```
/// # use arrow_schema::{DataType, Field};
/// // non dictionaries do not have a dict is ordered flat
/// let field = Field::new("c1", DataType::Int64, false);
/// assert_eq!(field.dict_is_ordered(), None);
/// // by default dictionary is not ordered
/// let field = Field::new("c1", DataType::Dictionary(Box::new(DataType::Int64), Box::new(DataType::Utf8)), false);
/// assert_eq!(field.dict_is_ordered(), Some(false));
/// let field = field.with_dict_is_ordered(true);
/// assert_eq!(field.dict_is_ordered(), Some(true));
/// ```
#[inline]
pub const fn dict_is_ordered(&self) -> Option<bool> {
match self.data_type {
DataType::Dictionary(_, _) => Some(self.dict_is_ordered),
_ => None,
}
}
/// Set the is ordered field for this `Field`, if it is a dictionary.
///
/// Does nothing if this is not a dictionary type.
///
/// See [`Field::dict_is_ordered`] for more information.
pub fn with_dict_is_ordered(mut self, dict_is_ordered: bool) -> Self {
if matches!(self.data_type, DataType::Dictionary(_, _)) {
self.dict_is_ordered = dict_is_ordered;
};
self
}
/// Merge this field into self if it is compatible.
///
/// Struct fields are merged recursively.
///
/// NOTE: `self` may be updated to a partial / unexpected state in case of merge failure.
///
/// Example:
///
/// ```
/// # use arrow_schema::*;
/// let mut field = Field::new("c1", DataType::Int64, false);
/// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok());
/// assert!(field.is_nullable());
/// ```
pub fn try_merge(&mut self, from: &Field) -> Result<(), ArrowError> {
if from.dict_is_ordered != self.dict_is_ordered {
return Err(ArrowError::SchemaError(format!(
"Fail to merge schema field '{}' because from dict_is_ordered = {} does not match {}",
self.name, from.dict_is_ordered, self.dict_is_ordered
)));
}
// merge metadata
match (self.metadata().is_empty(), from.metadata().is_empty()) {
(false, false) => {
let mut merged = self.metadata().clone();
for (key, from_value) in from.metadata() {
if let Some(self_value) = self.metadata.get(key) {
if self_value != from_value {
return Err(ArrowError::SchemaError(format!(
"Fail to merge field '{}' due to conflicting metadata data value for key {}.
From value = {} does not match {}", self.name, key, from_value, self_value),
));
}
} else {
merged.insert(key.clone(), from_value.clone());
}
}
self.set_metadata(merged);
}
(true, false) => {
self.set_metadata(from.metadata().clone());
}
_ => {}
}
match &mut self.data_type {
DataType::Struct(nested_fields) => match &from.data_type {
DataType::Struct(from_nested_fields) => {
let mut builder = SchemaBuilder::new();
nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?;
*nested_fields = builder.finish().fields;
}
_ => {
return Err(ArrowError::SchemaError(
format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Struct",
self.name, from.data_type)
))}
},
DataType::Union(nested_fields, _) => match &from.data_type {
DataType::Union(from_nested_fields, _) => {
nested_fields.try_merge(from_nested_fields)?
}
_ => {
return Err(ArrowError::SchemaError(
format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::Union",
self.name, from.data_type)
));
}
},
DataType::List(field) => match &from.data_type {
DataType::List(from_field) => {
let mut f = (**field).clone();
f.try_merge(from_field)?;
(*field) = Arc::new(f);
},
_ => {
return Err(ArrowError::SchemaError(
format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::List",
self.name, from.data_type)
))}
},
DataType::LargeList(field) => match &from.data_type {
DataType::LargeList(from_field) => {
let mut f = (**field).clone();
f.try_merge(from_field)?;
(*field) = Arc::new(f);
},
_ => {
return Err(ArrowError::SchemaError(
format!("Fail to merge schema field '{}' because the from data_type = {} is not DataType::LargeList",
self.name, from.data_type)
))}
},
DataType::Null => {
self.nullable = true;
self.data_type = from.data_type.clone();
}
| DataType::Boolean
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Float16
| DataType::Float32
| DataType::Float64
| DataType::Timestamp(_, _)
| DataType::Date32
| DataType::Date64
| DataType::Time32(_)
| DataType::Time64(_)
| DataType::Duration(_)
| DataType::Binary
| DataType::LargeBinary
| DataType::BinaryView
| DataType::Interval(_)
| DataType::LargeListView(_)
| DataType::ListView(_)
| DataType::Map(_, _)
| DataType::Dictionary(_, _)
| DataType::RunEndEncoded(_, _)
| DataType::FixedSizeList(_, _)
| DataType::FixedSizeBinary(_)
| DataType::Utf8
| DataType::LargeUtf8
| DataType::Utf8View
| DataType::Decimal32(_, _)
| DataType::Decimal64(_, _)
| DataType::Decimal128(_, _)
| DataType::Decimal256(_, _) => {
if from.data_type == DataType::Null {
self.nullable = true;
} else if self.data_type != from.data_type {
return Err(ArrowError::SchemaError(
format!("Fail to merge schema field '{}' because the from data_type = {} does not equal {}",
self.name, from.data_type, self.data_type)
));
}
}
}
self.nullable |= from.nullable;
Ok(())
}
/// Check to see if `self` is a superset of `other` field. Superset is defined as:
///
/// * if nullability doesn't match, self needs to be nullable
/// * self.metadata is a superset of other.metadata
/// * all other fields are equal
pub fn contains(&self, other: &Field) -> bool {
self.name == other.name
&& self.data_type.contains(&other.data_type)
&& self.dict_is_ordered == other.dict_is_ordered
// self need to be nullable or both of them are not nullable
&& (self.nullable || !other.nullable)
// make sure self.metadata is a superset of other.metadata
&& other.metadata.iter().all(|(k, v1)| {
self.metadata.get(k).map(|v2| v1 == v2).unwrap_or_default()
})
}
/// Return size of this instance in bytes.
///
/// Includes the size of `Self`.
pub fn size(&self) -> usize {
std::mem::size_of_val(self) - std::mem::size_of_val(&self.data_type)
+ self.data_type.size()
+ self.name.capacity()
+ (std::mem::size_of::<(String, String)>() * self.metadata.capacity())
+ self
.metadata
.iter()
.map(|(k, v)| k.capacity() + v.capacity())
.sum::<usize>()
}
}
// TODO: improve display with crate https://crates.io/crates/derive_more ?
impl std::fmt::Display for Field {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "{self:?}")
}
}
#[cfg(test)]
mod test {
use super::*;
use std::collections::hash_map::DefaultHasher;
#[test]
fn test_new_with_string() {
// Fields should allow owned Strings to support reuse
let s = "c1";
Field::new(s, DataType::Int64, false);
}
#[test]
fn test_new_dict_with_string() {
// Fields should allow owned Strings to support reuse
let s = "c1";
#[allow(deprecated)]
Field::new_dict(s, DataType::Int64, false, 4, false);
}
#[test]
fn test_merge_incompatible_types() {
let mut field = Field::new("c1", DataType::Int64, false);
let result = field
.try_merge(&Field::new("c1", DataType::Float32, true))
.expect_err("should fail")
.to_string();
assert_eq!("Schema error: Fail to merge schema field 'c1' because the from data_type = Float32 does not equal Int64", result);
}
#[test]
fn test_merge_with_null() {
let mut field1 = Field::new("c1", DataType::Null, true);
field1
.try_merge(&Field::new("c1", DataType::Float32, false))
.expect("should widen type to nullable float");
assert_eq!(Field::new("c1", DataType::Float32, true), field1);
let mut field2 = Field::new("c2", DataType::Utf8, false);
field2
.try_merge(&Field::new("c2", DataType::Null, true))
.expect("should widen type to nullable utf8");
assert_eq!(Field::new("c2", DataType::Utf8, true), field2);
}
#[test]
fn test_merge_with_nested_null() {
let mut struct1 = Field::new(
"s1",
DataType::Struct(Fields::from(vec![Field::new(
"inner",
DataType::Float32,
false,
)])),
false,
);
let struct2 = Field::new(
"s2",
DataType::Struct(Fields::from(vec![Field::new(
"inner",
DataType::Null,
false,
)])),
true,
);
struct1
.try_merge(&struct2)
.expect("should widen inner field's type to nullable float");
assert_eq!(
Field::new(
"s1",
DataType::Struct(Fields::from(vec![Field::new(
"inner",
DataType::Float32,
true,
)])),
true,
),
struct1
);
let mut list1 = Field::new(
"l1",
DataType::List(Field::new("inner", DataType::Float32, false).into()),
false,
);
let list2 = Field::new(
"l2",
DataType::List(Field::new("inner", DataType::Null, false).into()),
true,
);
list1
.try_merge(&list2)
.expect("should widen inner field's type to nullable float");
assert_eq!(
Field::new(
"l1",
DataType::List(Field::new("inner", DataType::Float32, true).into()),
true,
),
list1
);
let mut large_list1 = Field::new(
"ll1",
DataType::LargeList(Field::new("inner", DataType::Float32, false).into()),
false,
);
let large_list2 = Field::new(
"ll2",
DataType::LargeList(Field::new("inner", DataType::Null, false).into()),
true,
);
large_list1
.try_merge(&large_list2)
.expect("should widen inner field's type to nullable float");
assert_eq!(
Field::new(
"ll1",
DataType::LargeList(Field::new("inner", DataType::Float32, true).into()),
true,
),
large_list1
);
}
#[test]
fn test_fields_with_dict_id() {
#[allow(deprecated)]
let dict1 = Field::new_dict(
"dict1",
DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
false,
10,
false,
);
#[allow(deprecated)]
let dict2 = Field::new_dict(
"dict2",
DataType::Dictionary(DataType::Int32.into(), DataType::Int8.into()),
false,
20,
false,
);
let field = Field::new(
"struct<dict1, list[struct<dict2, list[struct<dict1]>]>",
DataType::Struct(Fields::from(vec![
dict1.clone(),
Field::new(
"list[struct<dict1, list[struct<dict2>]>]",
DataType::List(Arc::new(Field::new(
"struct<dict1, list[struct<dict2>]>",
DataType::Struct(Fields::from(vec![
dict1.clone(),
Field::new(
"list[struct<dict2>]",
DataType::List(Arc::new(Field::new(
"struct<dict2>",
DataType::Struct(vec![dict2.clone()].into()),
false,
))),
false,
),
])),
false,
))),
false,
),
])),
false,
);
#[allow(deprecated)]
for field in field.fields_with_dict_id(10) {
assert_eq!(dict1, *field);
}
#[allow(deprecated)]
for field in field.fields_with_dict_id(20) {
assert_eq!(dict2, *field);
}
}
fn get_field_hash(field: &Field) -> u64 {
let mut s = DefaultHasher::new();
field.hash(&mut s);
s.finish()
}
#[test]
fn test_field_comparison_case() {
// dictionary-encoding properties not used for field comparison
#[allow(deprecated)]
let dict1 = Field::new_dict(
"dict1",
DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
false,
10,
false,
);
#[allow(deprecated)]
let dict2 = Field::new_dict(
"dict1",
DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
false,
20,
false,
);
assert_eq!(dict1, dict2);
assert_eq!(get_field_hash(&dict1), get_field_hash(&dict2));
#[allow(deprecated)]
let dict1 = Field::new_dict(
"dict0",
DataType::Dictionary(DataType::Utf8.into(), DataType::Int32.into()),
false,
10,
false,
);
assert_ne!(dict1, dict2);
assert_ne!(get_field_hash(&dict1), get_field_hash(&dict2));
}
#[test]
fn test_field_comparison_metadata() {
let f1 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k2"), String::from("v2")),
]));
let f2 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v3")),
]));
let f3 = Field::new("x", DataType::Binary, false).with_metadata(HashMap::from([
(String::from("k1"), String::from("v1")),
(String::from("k3"), String::from("v4")),
]));
assert!(f1.cmp(&f2).is_lt());
assert!(f2.cmp(&f3).is_lt());
assert!(f1.cmp(&f3).is_lt());
}
#[test]
fn test_contains_reflexivity() {
let mut field = Field::new("field1", DataType::Float16, false);
field.set_metadata(HashMap::from([
(String::from("k0"), String::from("v0")),
(String::from("k1"), String::from("v1")),
]));
assert!(field.contains(&field))
}
#[test]
fn test_contains_transitivity() {
let child_field = Field::new("child1", DataType::Float16, false);
let mut field1 = Field::new(
"field1",
DataType::Struct(Fields::from(vec![child_field])),
false,
);
field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));
let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true);
field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
field2.try_merge(&field1).unwrap();
let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false);
field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
field3.try_merge(&field2).unwrap();
assert!(field2.contains(&field1));
assert!(field3.contains(&field2));
assert!(field3.contains(&field1));
assert!(!field1.contains(&field2));
assert!(!field1.contains(&field3));
assert!(!field2.contains(&field3));
}
#[test]
fn test_contains_nullable() {
let field1 = Field::new("field1", DataType::Boolean, true);
let field2 = Field::new("field1", DataType::Boolean, false);
assert!(field1.contains(&field2));
assert!(!field2.contains(&field1));
}
#[test]
fn test_contains_must_have_same_fields() {
let child_field1 = Field::new("child1", DataType::Float16, false);
let child_field2 = Field::new("child2", DataType::Float16, false);
let field1 = Field::new(
"field1",
DataType::Struct(vec![child_field1.clone()].into()),
true,
);
let field2 = Field::new(
"field1",
DataType::Struct(vec![child_field1, child_field2].into()),
true,
);
assert!(!field1.contains(&field2));
assert!(!field2.contains(&field1));
// UnionFields with different type ID
let field1 = Field::new(
"field1",
DataType::Union(
UnionFields::new(
vec![1, 2],
vec![
Field::new("field1", DataType::UInt8, true),
Field::new("field3", DataType::Utf8, false),
],
),
UnionMode::Dense,
),
true,
);
let field2 = Field::new(
"field1",
DataType::Union(
UnionFields::new(
vec![1, 3],
vec![
Field::new("field1", DataType::UInt8, false),
Field::new("field3", DataType::Utf8, false),
],
),
UnionMode::Dense,
),
true,
);
assert!(!field1.contains(&field2));
// UnionFields with same type ID
let field1 = Field::new(
"field1",
DataType::Union(
UnionFields::new(
vec![1, 2],
vec![
Field::new("field1", DataType::UInt8, true),
Field::new("field3", DataType::Utf8, false),
],
),
UnionMode::Dense,
),
true,
);
let field2 = Field::new(
"field1",
DataType::Union(
UnionFields::new(
vec![1, 2],
vec![
Field::new("field1", DataType::UInt8, false),
Field::new("field3", DataType::Utf8, false),
],
),
UnionMode::Dense,
),
true,
);
assert!(field1.contains(&field2));
}
#[cfg(feature = "serde")]
fn assert_binary_serde_round_trip(field: Field) {
let serialized = bincode::serialize(&field).unwrap();
let deserialized: Field = bincode::deserialize(&serialized).unwrap();
assert_eq!(field, deserialized)
}
#[cfg(feature = "serde")]
#[test]
fn test_field_without_metadata_serde() {
let field = Field::new("name", DataType::Boolean, true);
assert_binary_serde_round_trip(field)
}
#[cfg(feature = "serde")]
#[test]
fn test_field_with_empty_metadata_serde() {
let field = Field::new("name", DataType::Boolean, false).with_metadata(HashMap::new());
assert_binary_serde_round_trip(field)
}
#[cfg(feature = "serde")]
#[test]
fn test_field_with_nonempty_metadata_serde() {
let mut metadata = HashMap::new();
metadata.insert("hi".to_owned(), "".to_owned());
let field = Field::new("name", DataType::Boolean, false).with_metadata(metadata);
assert_binary_serde_round_trip(field)
}
}