| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! Defines the logical data types of Arrow arrays. |
| //! |
| //! The most important things you might be looking for are: |
| //! * [`Schema`](crate::datatypes::Schema) to describe a schema. |
| //! * [`Field`](crate::datatypes::Field) to describe one field within a schema. |
| //! * [`DataType`](crate::datatypes::DataType) to describe the type of a field. |
| |
| use std::collections::HashMap; |
| use std::default::Default; |
| use std::fmt; |
| use std::mem::size_of; |
| #[cfg(feature = "simd")] |
| use std::ops::{Add, Div, Mul, Sub}; |
| use std::slice::from_raw_parts; |
| use std::str::FromStr; |
| use std::sync::Arc; |
| |
| #[cfg(feature = "simd")] |
| use packed_simd::*; |
| use serde_derive::{Deserialize, Serialize}; |
| use serde_json::{ |
| json, Number, Value, Value::Number as VNumber, Value::String as VString, |
| }; |
| |
| use crate::error::{ArrowError, Result}; |
| use crate::util::bit_util; |
| |
| /// The set of datatypes that are supported by this implementation of Apache Arrow. |
| /// |
| /// The Arrow specification on data types includes some more types. |
| /// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) |
| /// for Arrow's specification. |
| /// |
| /// The variants of this enum include primitive fixed size types as well as parametric or |
| /// nested types. |
| /// Currently the Rust implementation supports the following nested types: |
| /// - `List<T>` |
| /// - `Struct<T, U, V, ...>` |
| /// |
| /// Nested types can themselves be nested within other arrays. |
| /// For more information on these types please see |
| /// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum DataType { |
| /// Null type |
| Null, |
| /// A boolean datatype representing the values `true` and `false`. |
| Boolean, |
| /// A signed 8-bit integer. |
| Int8, |
| /// A signed 16-bit integer. |
| Int16, |
| /// A signed 32-bit integer. |
| Int32, |
| /// A signed 64-bit integer. |
| Int64, |
| /// An unsigned 8-bit integer. |
| UInt8, |
| /// An unsigned 16-bit integer. |
| UInt16, |
| /// An unsigned 32-bit integer. |
| UInt32, |
| /// An unsigned 64-bit integer. |
| UInt64, |
| /// A 16-bit floating point number. |
| Float16, |
| /// A 32-bit floating point number. |
| Float32, |
| /// A 64-bit floating point number. |
| Float64, |
| /// A timestamp with an optional timezone. |
| /// |
| /// Time is measured as a Unix epoch, counting the seconds from |
| /// 00:00:00.000 on 1 January 1970, excluding leap seconds, |
| /// as a 64-bit integer. |
| /// |
| /// The time zone is a string indicating the name of a time zone, one of: |
| /// |
| /// * As used in the Olson time zone database (the "tz database" or |
| /// "tzdata"), such as "America/New_York" |
| /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 |
| Timestamp(TimeUnit, Option<Arc<String>>), |
| /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
| /// in days (32 bits). |
| Date32(DateUnit), |
| /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
| /// in milliseconds (64 bits). |
| Date64(DateUnit), |
| /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
| Time32(TimeUnit), |
| /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
| Time64(TimeUnit), |
| /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. |
| Duration(TimeUnit), |
| /// A "calendar" interval which models types that don't necessarily |
| /// have a precise duration without the context of a base timestamp (e.g. |
| /// days can differ in length during day light savings time transitions). |
| Interval(IntervalUnit), |
| /// Opaque binary data of variable length. |
| Binary, |
| /// Opaque binary data of fixed size. |
| /// Enum parameter specifies the number of bytes per value. |
| FixedSizeBinary(i32), |
| /// Opaque binary data of variable length and 64-bit offsets. |
| LargeBinary, |
| /// A variable-length string in Unicode with UTF-8 encoding. |
| Utf8, |
| /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. |
| LargeUtf8, |
| /// A list of some logical data type with variable length. |
| List(Box<NullableDataType>), |
| /// A list of some logical data type with fixed length. |
| FixedSizeList(Box<NullableDataType>, i32), |
| /// A list of some logical data type with variable length and 64-bit offsets. |
| LargeList(Box<NullableDataType>), |
| /// A nested datatype that contains a number of sub-fields. |
| Struct(Vec<Field>), |
| /// A nested datatype that can represent slots of differing types. |
| Union(Vec<Field>), |
| /// A dictionary encoded array (`key_type`, `value_type`), where |
| /// each array element is an index of `key_type` into an |
| /// associated dictionary of `value_type`. |
| /// |
| /// Dictionary arrays are used to store columns of `value_type` |
| /// that contain many repeated values using less memory, but with |
| /// a higher CPU overhead for some operations. |
| /// |
| /// This type mostly used to represent low cardinality string |
| /// arrays or a limited set of primitive types as integers. |
| Dictionary(Box<DataType>, Box<DataType>), |
| /// Decimal value with precision and scale |
| Decimal(usize, usize), |
| } |
| |
| /// Extends data type with nullability |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub struct NullableDataType { |
| data_type: DataType, |
| nullable: bool, |
| } |
| |
| /// Date is either a 32-bit or 64-bit type representing elapsed time since UNIX |
| /// epoch (1970-01-01) in days or milliseconds. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum DateUnit { |
| /// Days since the UNIX epoch. |
| Day, |
| /// Milliseconds indicating UNIX time elapsed since the epoch (no |
| /// leap seconds), where the values are evenly divisible by 86400000. |
| Millisecond, |
| } |
| |
| /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum TimeUnit { |
| /// Time in seconds. |
| Second, |
| /// Time in milliseconds. |
| Millisecond, |
| /// Time in microseconds. |
| Microsecond, |
| /// Time in nanoseconds. |
| Nanosecond, |
| } |
| |
| /// YEAR_MONTH or DAY_TIME interval in SQL style. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum IntervalUnit { |
| /// Indicates the number of elapsed whole months, stored as 4-byte integers. |
| YearMonth, |
| /// Indicates the number of elapsed days and milliseconds, |
| /// stored as 2 contiguous 32-bit integers (8-bytes in total). |
| DayTime, |
| } |
| |
| /// Contains the meta-data for a single relative type. |
| /// |
| /// The `Schema` object is an ordered collection of `Field` objects. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub struct Field { |
| name: String, |
| data_type: NullableDataType, |
| dict_id: i64, |
| dict_is_ordered: bool, |
| } |
| |
| pub trait ArrowNativeType: |
| fmt::Debug + Send + Sync + Copy + PartialOrd + FromStr + Default + 'static |
| { |
| fn into_json_value(self) -> Option<Value>; |
| |
| /// Convert native type from usize. |
| fn from_usize(_: usize) -> Option<Self> { |
| None |
| } |
| |
| /// Convert native type to usize. |
| fn to_usize(&self) -> Option<usize> { |
| None |
| } |
| } |
| |
| /// Trait indicating a primitive fixed-width type (bool, ints and floats). |
| pub trait ArrowPrimitiveType: 'static { |
| /// Corresponding Rust native type for the primitive type. |
| type Native: ArrowNativeType; |
| |
| /// the corresponding Arrow data type of this primitive type. |
| const DATA_TYPE: DataType; |
| |
| /// Returns the bit width of this primitive type. |
| fn get_bit_width() -> usize { |
| size_of::<Self::Native>() * 8 |
| } |
| |
| /// Returns a default value of this primitive type. |
| /// |
| /// This is useful for aggregate array ops like `sum()`, `mean()`. |
| fn default_value() -> Self::Native { |
| Default::default() |
| } |
| |
| /// Returns a value offset from the given pointer by the given index. The default |
| /// implementation (used for all non-boolean types) is simply equivalent to pointer-arithmetic. |
| /// # Safety |
| /// Just like array-access in C: the raw_ptr must be the start of a valid array, and the index |
| /// must be less than the size of the array. |
| unsafe fn index(raw_ptr: *const Self::Native, i: usize) -> Self::Native { |
| *(raw_ptr.add(i)) |
| } |
| } |
| |
| impl ArrowNativeType for bool { |
| fn into_json_value(self) -> Option<Value> { |
| Some(self.into()) |
| } |
| } |
| |
| impl ArrowNativeType for i8 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for i16 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for i32 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for i64 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for u8 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for u16 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for u32 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for u64 { |
| fn into_json_value(self) -> Option<Value> { |
| Some(VNumber(Number::from(self))) |
| } |
| |
| fn from_usize(v: usize) -> Option<Self> { |
| num::FromPrimitive::from_usize(v) |
| } |
| |
| fn to_usize(&self) -> Option<usize> { |
| num::ToPrimitive::to_usize(self) |
| } |
| } |
| |
| impl ArrowNativeType for f32 { |
| fn into_json_value(self) -> Option<Value> { |
| Number::from_f64(f64::round(self as f64 * 1000.0) / 1000.0).map(VNumber) |
| } |
| } |
| |
| impl ArrowNativeType for f64 { |
| fn into_json_value(self) -> Option<Value> { |
| Number::from_f64(self).map(VNumber) |
| } |
| } |
| |
| // BooleanType is special: its bit-width is not the size of the primitive type, and its `index` |
| // operation assumes bit-packing. |
| #[derive(Debug)] |
| pub struct BooleanType {} |
| |
| impl ArrowPrimitiveType for BooleanType { |
| type Native = bool; |
| const DATA_TYPE: DataType = DataType::Boolean; |
| |
| fn get_bit_width() -> usize { |
| 1 |
| } |
| |
| /// # Safety |
| /// The pointer must be part of a bit-packed boolean array, and the index must be less than the |
| /// size of the array. |
| unsafe fn index(raw_ptr: *const Self::Native, i: usize) -> Self::Native { |
| bit_util::get_bit_raw(raw_ptr as *const u8, i) |
| } |
| } |
| |
| macro_rules! make_type { |
| ($name:ident, $native_ty:ty, $data_ty:expr) => { |
| #[derive(Debug)] |
| pub struct $name {} |
| |
| impl ArrowPrimitiveType for $name { |
| type Native = $native_ty; |
| const DATA_TYPE: DataType = $data_ty; |
| } |
| }; |
| } |
| |
| make_type!(Int8Type, i8, DataType::Int8); |
| make_type!(Int16Type, i16, DataType::Int16); |
| make_type!(Int32Type, i32, DataType::Int32); |
| make_type!(Int64Type, i64, DataType::Int64); |
| make_type!(UInt8Type, u8, DataType::UInt8); |
| make_type!(UInt16Type, u16, DataType::UInt16); |
| make_type!(UInt32Type, u32, DataType::UInt32); |
| make_type!(UInt64Type, u64, DataType::UInt64); |
| make_type!(Float32Type, f32, DataType::Float32); |
| make_type!(Float64Type, f64, DataType::Float64); |
| make_type!( |
| TimestampSecondType, |
| i64, |
| DataType::Timestamp(TimeUnit::Second, None) |
| ); |
| make_type!( |
| TimestampMillisecondType, |
| i64, |
| DataType::Timestamp(TimeUnit::Millisecond, None) |
| ); |
| make_type!( |
| TimestampMicrosecondType, |
| i64, |
| DataType::Timestamp(TimeUnit::Microsecond, None) |
| ); |
| make_type!( |
| TimestampNanosecondType, |
| i64, |
| DataType::Timestamp(TimeUnit::Nanosecond, None) |
| ); |
| make_type!(Date32Type, i32, DataType::Date32(DateUnit::Day)); |
| make_type!(Date64Type, i64, DataType::Date64(DateUnit::Millisecond)); |
| make_type!(Time32SecondType, i32, DataType::Time32(TimeUnit::Second)); |
| make_type!( |
| Time32MillisecondType, |
| i32, |
| DataType::Time32(TimeUnit::Millisecond) |
| ); |
| make_type!( |
| Time64MicrosecondType, |
| i64, |
| DataType::Time64(TimeUnit::Microsecond) |
| ); |
| make_type!( |
| Time64NanosecondType, |
| i64, |
| DataType::Time64(TimeUnit::Nanosecond) |
| ); |
| make_type!( |
| IntervalYearMonthType, |
| i32, |
| DataType::Interval(IntervalUnit::YearMonth) |
| ); |
| make_type!( |
| IntervalDayTimeType, |
| i64, |
| DataType::Interval(IntervalUnit::DayTime) |
| ); |
| make_type!( |
| DurationSecondType, |
| i64, |
| DataType::Duration(TimeUnit::Second) |
| ); |
| make_type!( |
| DurationMillisecondType, |
| i64, |
| DataType::Duration(TimeUnit::Millisecond) |
| ); |
| make_type!( |
| DurationMicrosecondType, |
| i64, |
| DataType::Duration(TimeUnit::Microsecond) |
| ); |
| make_type!( |
| DurationNanosecondType, |
| i64, |
| DataType::Duration(TimeUnit::Nanosecond) |
| ); |
| |
| /// A subtype of primitive type that represents legal dictionary keys. |
| /// See https://arrow.apache.org/docs/format/Columnar.html |
| pub trait ArrowDictionaryKeyType: ArrowPrimitiveType {} |
| |
| impl ArrowDictionaryKeyType for Int8Type {} |
| |
| impl ArrowDictionaryKeyType for Int16Type {} |
| |
| impl ArrowDictionaryKeyType for Int32Type {} |
| |
| impl ArrowDictionaryKeyType for Int64Type {} |
| |
| impl ArrowDictionaryKeyType for UInt8Type {} |
| |
| impl ArrowDictionaryKeyType for UInt16Type {} |
| |
| impl ArrowDictionaryKeyType for UInt32Type {} |
| |
| impl ArrowDictionaryKeyType for UInt64Type {} |
| |
| /// A subtype of primitive type that represents numeric values. |
| /// |
| /// SIMD operations are defined in this trait if available on the target system. |
| #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))] |
| pub trait ArrowNumericType: ArrowPrimitiveType |
| where |
| Self::Simd: Add<Output = Self::Simd> |
| + Sub<Output = Self::Simd> |
| + Mul<Output = Self::Simd> |
| + Div<Output = Self::Simd> |
| + Copy, |
| { |
| /// Defines the SIMD type that should be used for this numeric type |
| type Simd; |
| |
| /// Defines the SIMD Mask type that should be used for this numeric type |
| type SimdMask; |
| |
| /// The number of SIMD lanes available |
| fn lanes() -> usize; |
| |
| /// Initializes a SIMD register to a constant value |
| fn init(value: Self::Native) -> Self::Simd; |
| |
| /// Loads a slice into a SIMD register |
| fn load(slice: &[Self::Native]) -> Self::Simd; |
| |
| /// Creates a new SIMD mask for this SIMD type filling it with `value` |
| fn mask_init(value: bool) -> Self::SimdMask; |
| |
| /// Creates a new SIMD mask for this SIMD type from the lower-most bits of the given `mask` |
| fn mask_from_u64(mask: u64) -> Self::SimdMask; |
| |
| /// Gets the value of a single lane in a SIMD mask |
| fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool; |
| |
| /// Gets the bitmask for a SimdMask as a byte slice and passes it to the closure used as the action parameter |
| fn bitmask<T>(mask: &Self::SimdMask, action: T) |
| where |
| T: FnMut(&[u8]); |
| |
| /// Sets the value of a single lane of a SIMD mask |
| fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask; |
| |
| /// Selects elements of `a` and `b` using `mask` |
| fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd; |
| |
| /// Returns `true` if any of the lanes in the mask are `true` |
| fn mask_any(mask: Self::SimdMask) -> bool; |
| |
| /// Performs a SIMD binary operation |
| fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>( |
| left: Self::Simd, |
| right: Self::Simd, |
| op: F, |
| ) -> Self::Simd; |
| |
| /// SIMD version of equal |
| fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// SIMD version of not equal |
| fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// SIMD version of less than |
| fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// SIMD version of less than or equal to |
| fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// SIMD version of greater than |
| fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// SIMD version of greater than or equal to |
| fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask; |
| |
| /// Writes a SIMD result back to a slice |
| fn write(simd_result: Self::Simd, slice: &mut [Self::Native]); |
| } |
| |
| #[cfg(any( |
| not(any(target_arch = "x86", target_arch = "x86_64")), |
| not(feature = "simd") |
| ))] |
| pub trait ArrowNumericType: ArrowPrimitiveType {} |
| |
| macro_rules! make_numeric_type { |
| ($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident) => { |
| #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))] |
| impl ArrowNumericType for $impl_ty { |
| type Simd = $simd_ty; |
| |
| type SimdMask = $simd_mask_ty; |
| |
| #[inline] |
| fn lanes() -> usize { |
| Self::Simd::lanes() |
| } |
| |
| #[inline] |
| fn init(value: Self::Native) -> Self::Simd { |
| Self::Simd::splat(value) |
| } |
| |
| #[inline] |
| fn load(slice: &[Self::Native]) -> Self::Simd { |
| unsafe { Self::Simd::from_slice_unaligned_unchecked(slice) } |
| } |
| |
| #[inline] |
| fn mask_init(value: bool) -> Self::SimdMask { |
| Self::SimdMask::splat(value) |
| } |
| |
| #[inline] |
| fn mask_from_u64(mask: u64) -> Self::SimdMask { |
| match Self::lanes() { |
| 8 => { |
| let vecidx = i64x8::new(128, 64, 32, 16, 8, 4, 2, 1); |
| |
| let vecmask = i64x8::splat((mask & 0xFF) as i64); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| unsafe { std::mem::transmute(vecmask) } |
| } |
| 16 => { |
| let vecidx = i32x16::new( |
| 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, |
| 16, 8, 4, 2, 1, |
| ); |
| |
| let vecmask = i32x16::splat((mask & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| unsafe { std::mem::transmute(vecmask) } |
| } |
| 32 => { |
| let tmp = &mut [0_i16; 32]; |
| |
| let vecidx = i32x16::new( |
| 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, |
| 16, 8, 4, 2, 1, |
| ); |
| |
| let vecmask = i32x16::splat((mask & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i16x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[0..16]); |
| |
| let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i16x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[16..32]); |
| |
| unsafe { std::mem::transmute(i16x32::from_slice_unaligned(tmp)) } |
| } |
| 64 => { |
| let tmp = &mut [0_i8; 64]; |
| |
| let vecidx = i32x16::new( |
| 32768, 16384, 8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, |
| 16, 8, 4, 2, 1, |
| ); |
| |
| let vecmask = i32x16::splat((mask & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i8x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[0..16]); |
| |
| let vecmask = i32x16::splat(((mask >> 16) & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i8x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[16..32]); |
| |
| let vecmask = i32x16::splat(((mask >> 32) & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i8x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[32..48]); |
| |
| let vecmask = i32x16::splat(((mask >> 48) & 0xFFFF) as i32); |
| let vecmask = (vecidx & vecmask).eq(vecidx); |
| |
| i8x16::from_cast(vecmask) |
| .write_to_slice_unaligned(&mut tmp[48..64]); |
| |
| unsafe { std::mem::transmute(i8x64::from_slice_unaligned(tmp)) } |
| } |
| _ => panic!("Invalid number of vector lanes"), |
| } |
| } |
| |
| #[inline] |
| fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool { |
| unsafe { mask.extract_unchecked(idx) } |
| } |
| |
| fn bitmask<T>(mask: &Self::SimdMask, mut action: T) |
| where |
| T: FnMut(&[u8]), |
| { |
| action(mask.bitmask().to_byte_slice()); |
| } |
| |
| #[inline] |
| fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask { |
| unsafe { mask.replace_unchecked(idx, value) } |
| } |
| |
| /// Selects elements of `a` and `b` using `mask` |
| #[inline] |
| fn mask_select( |
| mask: Self::SimdMask, |
| a: Self::Simd, |
| b: Self::Simd, |
| ) -> Self::Simd { |
| mask.select(a, b) |
| } |
| |
| #[inline] |
| fn mask_any(mask: Self::SimdMask) -> bool { |
| mask.any() |
| } |
| |
| #[inline] |
| fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>( |
| left: Self::Simd, |
| right: Self::Simd, |
| op: F, |
| ) -> Self::Simd { |
| op(left, right) |
| } |
| |
| #[inline] |
| fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.eq(right) |
| } |
| |
| #[inline] |
| fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.ne(right) |
| } |
| |
| #[inline] |
| fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.lt(right) |
| } |
| |
| #[inline] |
| fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.le(right) |
| } |
| |
| #[inline] |
| fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.gt(right) |
| } |
| |
| #[inline] |
| fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask { |
| left.ge(right) |
| } |
| |
| #[inline] |
| fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) { |
| unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) }; |
| } |
| } |
| #[cfg(any( |
| not(any(target_arch = "x86", target_arch = "x86_64")), |
| not(feature = "simd") |
| ))] |
| impl ArrowNumericType for $impl_ty {} |
| }; |
| } |
| |
| make_numeric_type!(Int8Type, i8, i8x64, m8x64); |
| make_numeric_type!(Int16Type, i16, i16x32, m16x32); |
| make_numeric_type!(Int32Type, i32, i32x16, m32x16); |
| make_numeric_type!(Int64Type, i64, i64x8, m64x8); |
| make_numeric_type!(UInt8Type, u8, u8x64, m8x64); |
| make_numeric_type!(UInt16Type, u16, u16x32, m16x32); |
| make_numeric_type!(UInt32Type, u32, u32x16, m32x16); |
| make_numeric_type!(UInt64Type, u64, u64x8, m64x8); |
| make_numeric_type!(Float32Type, f32, f32x16, m32x16); |
| make_numeric_type!(Float64Type, f64, f64x8, m64x8); |
| |
| make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8); |
| make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8); |
| make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8); |
| make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8); |
| make_numeric_type!(Date32Type, i32, i32x16, m32x16); |
| make_numeric_type!(Date64Type, i64, i64x8, m64x8); |
| make_numeric_type!(Time32SecondType, i32, i32x16, m32x16); |
| make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16); |
| make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8); |
| make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8); |
| make_numeric_type!(IntervalYearMonthType, i32, i32x16, m32x16); |
| make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8); |
| make_numeric_type!(DurationSecondType, i64, i64x8, m64x8); |
| make_numeric_type!(DurationMillisecondType, i64, i64x8, m64x8); |
| make_numeric_type!(DurationMicrosecondType, i64, i64x8, m64x8); |
| make_numeric_type!(DurationNanosecondType, i64, i64x8, m64x8); |
| |
| /// A subtype of primitive type that represents temporal values. |
| pub trait ArrowTemporalType: ArrowPrimitiveType {} |
| |
| impl ArrowTemporalType for TimestampSecondType {} |
| impl ArrowTemporalType for TimestampMillisecondType {} |
| impl ArrowTemporalType for TimestampMicrosecondType {} |
| impl ArrowTemporalType for TimestampNanosecondType {} |
| impl ArrowTemporalType for Date32Type {} |
| impl ArrowTemporalType for Date64Type {} |
| impl ArrowTemporalType for Time32SecondType {} |
| impl ArrowTemporalType for Time32MillisecondType {} |
| impl ArrowTemporalType for Time64MicrosecondType {} |
| impl ArrowTemporalType for Time64NanosecondType {} |
| // impl ArrowTemporalType for IntervalYearMonthType {} |
| // impl ArrowTemporalType for IntervalDayTimeType {} |
| |
| /// A timestamp type allows us to create array builders that take a timestamp. |
| pub trait ArrowTimestampType: ArrowTemporalType { |
| /// Returns the `TimeUnit` of this timestamp. |
| fn get_time_unit() -> TimeUnit; |
| } |
| |
| impl ArrowTimestampType for TimestampSecondType { |
| fn get_time_unit() -> TimeUnit { |
| TimeUnit::Second |
| } |
| } |
| impl ArrowTimestampType for TimestampMillisecondType { |
| fn get_time_unit() -> TimeUnit { |
| TimeUnit::Millisecond |
| } |
| } |
| impl ArrowTimestampType for TimestampMicrosecondType { |
| fn get_time_unit() -> TimeUnit { |
| TimeUnit::Microsecond |
| } |
| } |
| impl ArrowTimestampType for TimestampNanosecondType { |
| fn get_time_unit() -> TimeUnit { |
| TimeUnit::Nanosecond |
| } |
| } |
| |
| /// Allows conversion from supported Arrow types to a byte slice. |
| pub trait ToByteSlice { |
| /// Converts this instance into a byte slice |
| fn to_byte_slice(&self) -> &[u8]; |
| } |
| |
| impl<T: ArrowNativeType> ToByteSlice for [T] { |
| fn to_byte_slice(&self) -> &[u8] { |
| let raw_ptr = self.as_ptr() as *const T as *const u8; |
| unsafe { from_raw_parts(raw_ptr, self.len() * size_of::<T>()) } |
| } |
| } |
| |
| impl<T: ArrowNativeType> ToByteSlice for T { |
| fn to_byte_slice(&self) -> &[u8] { |
| let raw_ptr = self as *const T as *const u8; |
| unsafe { from_raw_parts(raw_ptr, size_of::<T>()) } |
| } |
| } |
| |
| impl DataType { |
| /// Parse a data type from a JSON representation |
| pub(crate) fn from(json: &Value) -> Result<DataType> { |
| let default_dt_ctx = NullableDataType::new(DataType::Boolean, true); |
| match *json { |
| Value::Object(ref map) => match map.get("name") { |
| Some(s) if s == "null" => Ok(DataType::Null), |
| Some(s) if s == "bool" => Ok(DataType::Boolean), |
| Some(s) if s == "binary" => Ok(DataType::Binary), |
| Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), |
| Some(s) if s == "utf8" => Ok(DataType::Utf8), |
| Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), |
| Some(s) if s == "fixedsizebinary" => { |
| // return a list with any type as its child isn't defined in the map |
| if let Some(Value::Number(size)) = map.get("byteWidth") { |
| Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) |
| } else { |
| Err(ArrowError::ParseError( |
| "Expecting a byteWidth for fixedsizebinary".to_string(), |
| )) |
| } |
| } |
| Some(s) if s == "floatingpoint" => match map.get("precision") { |
| Some(p) if p == "HALF" => Ok(DataType::Float16), |
| Some(p) if p == "SINGLE" => Ok(DataType::Float32), |
| Some(p) if p == "DOUBLE" => Ok(DataType::Float64), |
| _ => Err(ArrowError::ParseError( |
| "floatingpoint precision missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "timestamp" => { |
| let unit = match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(TimeUnit::Second), |
| Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), |
| Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), |
| Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), |
| _ => Err(ArrowError::ParseError( |
| "timestamp unit missing or invalid".to_string(), |
| )), |
| }; |
| let tz = match map.get("timezone") { |
| None => Ok(None), |
| Some(VString(tz)) => Ok(Some(Arc::new(tz.to_string()))), |
| _ => Err(ArrowError::ParseError( |
| "timezone must be a string".to_string(), |
| )), |
| }; |
| Ok(DataType::Timestamp(unit?, tz?)) |
| } |
| Some(s) if s == "date" => match map.get("unit") { |
| Some(p) if p == "DAY" => Ok(DataType::Date32(DateUnit::Day)), |
| Some(p) if p == "MILLISECOND" => { |
| Ok(DataType::Date64(DateUnit::Millisecond)) |
| } |
| _ => Err(ArrowError::ParseError( |
| "date unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "time" => { |
| let unit = match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(TimeUnit::Second), |
| Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), |
| Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), |
| Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), |
| _ => Err(ArrowError::ParseError( |
| "time unit missing or invalid".to_string(), |
| )), |
| }; |
| match map.get("bitWidth") { |
| Some(p) if p == 32 => Ok(DataType::Time32(unit?)), |
| Some(p) if p == 64 => Ok(DataType::Time64(unit?)), |
| _ => Err(ArrowError::ParseError( |
| "time bitWidth missing or invalid".to_string(), |
| )), |
| } |
| } |
| Some(s) if s == "duration" => match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), |
| Some(p) if p == "MILLISECOND" => { |
| Ok(DataType::Duration(TimeUnit::Millisecond)) |
| } |
| Some(p) if p == "MICROSECOND" => { |
| Ok(DataType::Duration(TimeUnit::Microsecond)) |
| } |
| Some(p) if p == "NANOSECOND" => { |
| Ok(DataType::Duration(TimeUnit::Nanosecond)) |
| } |
| _ => Err(ArrowError::ParseError( |
| "time unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "interval" => match map.get("unit") { |
| Some(p) if p == "DAY_TIME" => { |
| Ok(DataType::Interval(IntervalUnit::DayTime)) |
| } |
| Some(p) if p == "YEAR_MONTH" => { |
| Ok(DataType::Interval(IntervalUnit::YearMonth)) |
| } |
| _ => Err(ArrowError::ParseError( |
| "interval unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "int" => match map.get("isSigned") { |
| Some(&Value::Bool(true)) => match map.get("bitWidth") { |
| Some(&Value::Number(ref n)) => match n.as_u64() { |
| Some(8) => Ok(DataType::Int8), |
| Some(16) => Ok(DataType::Int16), |
| Some(32) => Ok(DataType::Int32), |
| Some(64) => Ok(DataType::Int64), |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| Some(&Value::Bool(false)) => match map.get("bitWidth") { |
| Some(&Value::Number(ref n)) => match n.as_u64() { |
| Some(8) => Ok(DataType::UInt8), |
| Some(16) => Ok(DataType::UInt16), |
| Some(32) => Ok(DataType::UInt32), |
| Some(64) => Ok(DataType::UInt64), |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int signed missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "list" => { |
| // return a list with any type as its child isn't defined in the map |
| Ok(DataType::List(Box::new(default_dt_ctx))) |
| } |
| Some(s) if s == "largelist" => { |
| // return a largelist with any type as its child isn't defined in the map |
| Ok(DataType::LargeList(Box::new(default_dt_ctx))) |
| } |
| Some(s) if s == "fixedsizelist" => { |
| // return a list with any type as its child isn't defined in the map |
| if let Some(Value::Number(size)) = map.get("listSize") { |
| Ok(DataType::FixedSizeList( |
| Box::new(default_dt_ctx), |
| size.as_i64().unwrap() as i32, |
| )) |
| } else { |
| Err(ArrowError::ParseError( |
| "Expecting a listSize for fixedsizelist".to_string(), |
| )) |
| } |
| } |
| Some(s) if s == "struct" => { |
| // return an empty `struct` type as its children aren't defined in the map |
| Ok(DataType::Struct(vec![])) |
| } |
| Some(other) => Err(ArrowError::ParseError(format!( |
| "invalid or unsupported type name: {} in {:?}", |
| other, json |
| ))), |
| None => Err(ArrowError::ParseError("type name missing".to_string())), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "invalid json value type".to_string(), |
| )), |
| } |
| } |
| |
| /// Generate a JSON representation of the data type |
| pub fn to_json(&self) -> Value { |
| match self { |
| DataType::Null => json!({"name": "null"}), |
| DataType::Boolean => json!({"name": "bool"}), |
| DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), |
| DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), |
| DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), |
| DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), |
| DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), |
| DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), |
| DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), |
| DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), |
| DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), |
| DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), |
| DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), |
| DataType::Utf8 => json!({"name": "utf8"}), |
| DataType::LargeUtf8 => json!({"name": "largeutf8"}), |
| DataType::Binary => json!({"name": "binary"}), |
| DataType::LargeBinary => json!({"name": "largebinary"}), |
| DataType::FixedSizeBinary(byte_width) => { |
| json!({"name": "fixedsizebinary", "byteWidth": byte_width}) |
| } |
| DataType::Struct(_) => json!({"name": "struct"}), |
| DataType::Union(_) => json!({"name": "union"}), |
| DataType::List(_) => json!({ "name": "list"}), |
| DataType::LargeList(_) => json!({ "name": "largelist"}), |
| DataType::FixedSizeList(_, length) => { |
| json!({"name":"fixedsizelist", "listSize": length}) |
| } |
| DataType::Time32(unit) => { |
| json!({"name": "time", "bitWidth": 32, "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Time64(unit) => { |
| json!({"name": "time", "bitWidth": 64, "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Date32(unit) | DataType::Date64(unit) => { |
| json!({"name": "date", "unit": match unit { |
| DateUnit::Day => "DAY", |
| DateUnit::Millisecond => "MILLISECOND", |
| }}) |
| } |
| DataType::Timestamp(unit, None) => { |
| json!({"name": "timestamp", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Timestamp(unit, Some(tz)) => { |
| json!({"name": "timestamp", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }, "timezone": tz}) |
| } |
| DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { |
| IntervalUnit::YearMonth => "YEAR_MONTH", |
| IntervalUnit::DayTime => "DAY_TIME", |
| }}), |
| DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}), |
| DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), |
| DataType::Decimal(precision, scale) => { |
| json!({"name": "decimal", "precision": precision, "scale": scale}) |
| } |
| } |
| } |
| |
| /// Returns true if this type is numeric: (UInt*, Unit*, or Float*) |
| pub fn is_numeric(t: &DataType) -> bool { |
| use DataType::*; |
| matches!( |
| t, |
| UInt8 |
| | UInt16 |
| | UInt32 |
| | UInt64 |
| | Int8 |
| | Int16 |
| | Int32 |
| | Int64 |
| | Float32 |
| | Float64 |
| ) |
| } |
| } |
| |
| impl NullableDataType { |
| /// Creates a new data type context |
| pub fn new(data_type: DataType, nullable: bool) -> Self { |
| NullableDataType { |
| data_type, |
| nullable, |
| } |
| } |
| |
| /// Returns an immutable reference to the data type |
| #[inline] |
| pub const fn data_type(&self) -> &DataType { |
| &self.data_type |
| } |
| |
| /// Indicates whether in this data type context null values are eligible |
| #[inline] |
| pub const fn is_nullable(&self) -> bool { |
| self.nullable |
| } |
| } |
| |
| impl Field { |
| /// Creates a new field |
| pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self { |
| Field { |
| name: name.to_string(), |
| data_type: NullableDataType::new(data_type, nullable), |
| dict_id: 0, |
| dict_is_ordered: false, |
| } |
| } |
| |
| /// Creates a new field |
| pub fn new_dict( |
| name: &str, |
| data_type: DataType, |
| nullable: bool, |
| dict_id: i64, |
| dict_is_ordered: bool, |
| ) -> Self { |
| Field { |
| name: name.to_string(), |
| data_type: NullableDataType::new(data_type, nullable), |
| dict_id, |
| dict_is_ordered, |
| } |
| } |
| |
| /// Returns an immutable reference to the `Field`'s name |
| #[inline] |
| pub const fn name(&self) -> &String { |
| &self.name |
| } |
| |
| /// Returns an immutable reference to the `Field`'s data-type |
| #[inline] |
| pub const fn data_type(&self) -> &DataType { |
| self.data_type.data_type() |
| } |
| |
| /// Indicates whether this `Field` supports null values |
| #[inline] |
| pub const fn is_nullable(&self) -> bool { |
| self.data_type.nullable |
| } |
| |
| /// Returns the dictionary ID |
| #[inline] |
| pub const fn dict_id(&self) -> i64 { |
| self.dict_id |
| } |
| |
| /// Indicates whether this `Field`'s dictionary is ordered |
| #[inline] |
| pub const fn dict_is_ordered(&self) -> bool { |
| self.dict_is_ordered |
| } |
| |
| /// Parse a `Field` definition from a JSON representation |
| pub fn from(json: &Value) -> Result<Self> { |
| match *json { |
| Value::Object(ref map) => { |
| let name = match map.get("name") { |
| Some(&Value::String(ref name)) => name.to_string(), |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'name' attribute".to_string(), |
| )); |
| } |
| }; |
| let nullable = match map.get("nullable") { |
| Some(&Value::Bool(b)) => b, |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'nullable' attribute".to_string(), |
| )); |
| } |
| }; |
| let data_type = match map.get("type") { |
| Some(t) => DataType::from(t)?, |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'type' attribute".to_string(), |
| )); |
| } |
| }; |
| // if data_type is a struct or list, get its children |
| let data_type = match data_type { |
| DataType::List(_) |
| | DataType::LargeList(_) |
| | DataType::FixedSizeList(_, _) => match map.get("children") { |
| Some(Value::Array(values)) => { |
| if values.len() != 1 { |
| return Err(ArrowError::ParseError( |
| "Field 'children' must have one element for a list data type".to_string(), |
| )); |
| } |
| let nested_field = Self::from(&values[0])?; |
| let nexted_dt_ctx = NullableDataType::new( |
| nested_field.data_type.data_type, |
| nested_field.data_type.nullable, |
| ); |
| match data_type { |
| DataType::List(_) => DataType::List(Box::new( |
| nexted_dt_ctx, |
| )), |
| DataType::LargeList(_) => DataType::LargeList(Box::new( |
| nexted_dt_ctx, |
| )), |
| DataType::FixedSizeList(_, int) => { |
| DataType::FixedSizeList( |
| Box::new(nexted_dt_ctx), |
| int, |
| ) |
| } |
| _ => unreachable!( |
| "Data type should be a list, largelist or fixedsizelist" |
| ), |
| } |
| } |
| Some(_) => { |
| return Err(ArrowError::ParseError( |
| "Field 'children' must be an array".to_string(), |
| )) |
| } |
| None => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'children' attribute".to_string(), |
| )); |
| } |
| }, |
| DataType::Struct(mut fields) => match map.get("children") { |
| Some(Value::Array(values)) => { |
| let struct_fields: Result<Vec<Field>> = |
| values.iter().map(|v| Field::from(v)).collect(); |
| fields.append(&mut struct_fields?); |
| DataType::Struct(fields) |
| } |
| Some(_) => { |
| return Err(ArrowError::ParseError( |
| "Field 'children' must be an array".to_string(), |
| )) |
| } |
| None => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'children' attribute".to_string(), |
| )); |
| } |
| }, |
| _ => data_type, |
| }; |
| |
| let mut dict_id = 0; |
| let mut dict_is_ordered = false; |
| |
| let data_type = match map.get("dictionary") { |
| Some(dictionary) => { |
| let index_type = match dictionary.get("indexType") { |
| Some(t) => DataType::from(t)?, |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'indexType' attribute".to_string(), |
| )); |
| } |
| }; |
| dict_id = match dictionary.get("id") { |
| Some(Value::Number(n)) => n.as_i64().unwrap(), |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'id' attribute".to_string(), |
| )); |
| } |
| }; |
| dict_is_ordered = match dictionary.get("isOrdered") { |
| Some(&Value::Bool(n)) => n, |
| _ => { |
| return Err(ArrowError::ParseError( |
| "Field missing 'isOrdered' attribute".to_string(), |
| )); |
| } |
| }; |
| DataType::Dictionary(Box::new(index_type), Box::new(data_type)) |
| } |
| _ => data_type, |
| }; |
| Ok(Field { |
| name, |
| data_type: NullableDataType::new(data_type, nullable), |
| dict_id, |
| dict_is_ordered, |
| }) |
| } |
| _ => Err(ArrowError::ParseError( |
| "Invalid json value type for field".to_string(), |
| )), |
| } |
| } |
| |
| /// Generate a JSON representation of the `Field` |
| pub fn to_json(&self) -> Value { |
| let children: Vec<Value> = match self.data_type() { |
| DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(), |
| DataType::List(type_ctx) => { |
| let item = Field::new( |
| "item", |
| type_ctx.data_type().clone(), |
| type_ctx.is_nullable(), |
| ); |
| vec![item.to_json()] |
| } |
| DataType::LargeList(type_ctx) => { |
| let item = Field::new( |
| "item", |
| type_ctx.data_type().clone(), |
| type_ctx.is_nullable(), |
| ); |
| vec![item.to_json()] |
| } |
| DataType::FixedSizeList(type_ctx, _) => { |
| let item = Field::new( |
| "item", |
| type_ctx.data_type().clone(), |
| type_ctx.is_nullable(), |
| ); |
| vec![item.to_json()] |
| } |
| _ => vec![], |
| }; |
| match self.data_type() { |
| DataType::Dictionary(ref index_type, ref value_type) => json!({ |
| "name": self.name, |
| "nullable": self.data_type.nullable, |
| "type": value_type.to_json(), |
| "children": children, |
| "dictionary": { |
| "id": self.dict_id, |
| "indexType": index_type.to_json(), |
| "isOrdered": self.dict_is_ordered |
| } |
| }), |
| _ => json!({ |
| "name": self.name, |
| "nullable": self.data_type.is_nullable(), |
| "type": self.data_type.data_type().to_json(), |
| "children": children |
| }), |
| } |
| } |
| |
| /// Merge field into self if it is compatible. Struct will be merged recursively. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// use arrow::datatypes::*; |
| /// |
| /// let mut field = Field::new("c1", DataType::Int64, false); |
| /// assert!(field.try_merge(&Field::new("c1", DataType::Int64, true)).is_ok()); |
| /// assert!(field.is_nullable()); |
| /// ``` |
| pub fn try_merge(&mut self, from: &Field) -> Result<()> { |
| if from.dict_id != self.dict_id { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema Field due to conflicting dict_id".to_string(), |
| )); |
| } |
| if from.dict_is_ordered != self.dict_is_ordered { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema Field due to conflicting dict_is_ordered" |
| .to_string(), |
| )); |
| } |
| match &mut self.data_type.data_type { |
| DataType::Struct(nested_fields) => match &from.data_type.data_type { |
| DataType::Struct(from_nested_fields) => { |
| for from_field in from_nested_fields { |
| let mut is_new_field = true; |
| for self_field in nested_fields.iter_mut() { |
| if self_field.name != from_field.name { |
| continue; |
| } |
| is_new_field = false; |
| self_field.try_merge(&from_field)?; |
| } |
| if is_new_field { |
| nested_fields.push(from_field.clone()); |
| } |
| } |
| } |
| _ => { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema Field due to conflicting datatype" |
| .to_string(), |
| )); |
| } |
| }, |
| DataType::Union(nested_fields) => match &from.data_type.data_type { |
| DataType::Union(from_nested_fields) => { |
| for from_field in from_nested_fields { |
| let mut is_new_field = true; |
| for self_field in nested_fields.iter_mut() { |
| if from_field == self_field { |
| is_new_field = false; |
| break; |
| } |
| } |
| if is_new_field { |
| nested_fields.push(from_field.clone()); |
| } |
| } |
| } |
| _ => { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema Field due to conflicting datatype" |
| .to_string(), |
| )); |
| } |
| }, |
| DataType::Null |
| | DataType::Boolean |
| | DataType::Int8 |
| | DataType::Int16 |
| | DataType::Int32 |
| | DataType::Int64 |
| | DataType::UInt8 |
| | DataType::UInt16 |
| | DataType::UInt32 |
| | DataType::UInt64 |
| | DataType::Float16 |
| | DataType::Float32 |
| | DataType::Float64 |
| | DataType::Timestamp(_, _) |
| | DataType::Date32(_) |
| | DataType::Date64(_) |
| | DataType::Time32(_) |
| | DataType::Time64(_) |
| | DataType::Duration(_) |
| | DataType::Binary |
| | DataType::LargeBinary |
| | DataType::Interval(_) |
| | DataType::LargeList(_) |
| | DataType::List(_) |
| | DataType::Dictionary(_, _) |
| | DataType::FixedSizeList(_, _) |
| | DataType::FixedSizeBinary(_) |
| | DataType::Utf8 |
| | DataType::LargeUtf8 |
| | DataType::Decimal(_, _) => { |
| if self.data_type.data_type != from.data_type.data_type { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema Field due to conflicting datatype" |
| .to_string(), |
| )); |
| } |
| } |
| } |
| if from.data_type.nullable { |
| self.data_type.nullable = from.data_type.nullable; |
| } |
| |
| Ok(()) |
| } |
| } |
| |
| impl fmt::Display for Field { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "{}: {:?}", self.name, self.data_type.data_type) |
| } |
| } |
| |
| /// Describes the meta-data of an ordered sequence of relative types. |
| /// |
| /// Note that this information is only part of the meta-data and not part of the physical |
| /// memory layout. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] |
| pub struct Schema { |
| pub(crate) fields: Vec<Field>, |
| /// A map of key-value pairs containing additional meta data. |
| #[serde(skip_serializing_if = "HashMap::is_empty")] |
| pub(crate) metadata: HashMap<String, String>, |
| } |
| |
| impl Schema { |
| /// Creates an empty `Schema` |
| pub fn empty() -> Self { |
| Self { |
| fields: vec![], |
| metadata: HashMap::new(), |
| } |
| } |
| |
| /// Creates a new `Schema` from a sequence of `Field` values |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// # extern crate arrow; |
| /// # use arrow::datatypes::{Field, DataType, Schema}; |
| /// let field_a = Field::new("a", DataType::Int64, false); |
| /// let field_b = Field::new("b", DataType::Boolean, false); |
| /// |
| /// let schema = Schema::new(vec![field_a, field_b]); |
| /// ``` |
| pub fn new(fields: Vec<Field>) -> Self { |
| Self::new_with_metadata(fields, HashMap::new()) |
| } |
| |
| /// Creates a new `Schema` from a sequence of `Field` values |
| /// and adds additional metadata in form of key value pairs. |
| /// |
| /// # Example |
| /// |
| /// ``` |
| /// # extern crate arrow; |
| /// # use arrow::datatypes::{Field, DataType, Schema}; |
| /// # use std::collections::HashMap; |
| /// let field_a = Field::new("a", DataType::Int64, false); |
| /// let field_b = Field::new("b", DataType::Boolean, false); |
| /// |
| /// let mut metadata: HashMap<String, String> = HashMap::new(); |
| /// metadata.insert("row_count".to_string(), "100".to_string()); |
| /// |
| /// let schema = Schema::new_with_metadata(vec![field_a, field_b], metadata); |
| /// ``` |
| #[inline] |
| pub const fn new_with_metadata( |
| fields: Vec<Field>, |
| metadata: HashMap<String, String>, |
| ) -> Self { |
| Self { fields, metadata } |
| } |
| |
| /// Merge schema into self if it is compatible. Struct fields will be merged recursively. |
| /// |
| /// Example: |
| /// |
| /// ``` |
| /// use arrow::datatypes::*; |
| /// |
| /// let merged = Schema::try_merge(&vec![ |
| /// Schema::new(vec![ |
| /// Field::new("c1", DataType::Int64, false), |
| /// Field::new("c2", DataType::Utf8, false), |
| /// ]), |
| /// Schema::new(vec![ |
| /// Field::new("c1", DataType::Int64, true), |
| /// Field::new("c2", DataType::Utf8, false), |
| /// Field::new("c3", DataType::Utf8, false), |
| /// ]), |
| /// ]).unwrap(); |
| /// |
| /// assert_eq!( |
| /// merged, |
| /// Schema::new(vec![ |
| /// Field::new("c1", DataType::Int64, true), |
| /// Field::new("c2", DataType::Utf8, false), |
| /// Field::new("c3", DataType::Utf8, false), |
| /// ]), |
| /// ); |
| /// ``` |
| pub fn try_merge(schemas: &[Self]) -> Result<Self> { |
| let mut merged = Self::empty(); |
| |
| for schema in schemas { |
| for (key, value) in schema.metadata.iter() { |
| // merge metadata |
| match merged.metadata.get(key) { |
| Some(old_val) => { |
| if old_val != value { |
| return Err(ArrowError::SchemaError( |
| "Fail to merge schema due to conflicting metadata" |
| .to_string(), |
| )); |
| } |
| } |
| None => { |
| merged.metadata.insert(key.clone(), value.clone()); |
| } |
| } |
| } |
| // merge fileds |
| for field in &schema.fields { |
| let mut new_field = true; |
| for merged_field in &mut merged.fields { |
| if field.name != merged_field.name { |
| continue; |
| } |
| new_field = false; |
| merged_field.try_merge(field)? |
| } |
| // found a new field, add to field list |
| if new_field { |
| merged.fields.push(field.clone()); |
| } |
| } |
| } |
| |
| Ok(merged) |
| } |
| |
| /// Returns an immutable reference of the vector of `Field` instances |
| #[inline] |
| pub const fn fields(&self) -> &Vec<Field> { |
| &self.fields |
| } |
| |
| /// Returns an immutable reference of a specific `Field` instance selected using an |
| /// offset within the internal `fields` vector |
| pub fn field(&self, i: usize) -> &Field { |
| &self.fields[i] |
| } |
| |
| /// Returns an immutable reference of a specific `Field` instance selected by name |
| pub fn field_with_name(&self, name: &str) -> Result<&Field> { |
| Ok(&self.fields[self.index_of(name)?]) |
| } |
| |
| /// Find the index of the column with the given name |
| pub fn index_of(&self, name: &str) -> Result<usize> { |
| for i in 0..self.fields.len() { |
| if self.fields[i].name == name { |
| return Ok(i); |
| } |
| } |
| let valid_fields: Vec<String> = |
| self.fields.iter().map(|f| f.name().clone()).collect(); |
| Err(ArrowError::InvalidArgumentError(format!( |
| "Unable to get field named \"{}\". Valid fields: {:?}", |
| name, valid_fields |
| ))) |
| } |
| |
| /// Returns an immutable reference to the Map of custom metadata key-value pairs. |
| #[inline] |
| pub const fn metadata(&self) -> &HashMap<String, String> { |
| &self.metadata |
| } |
| |
| /// Look up a column by name and return a immutable reference to the column along with |
| /// it's index |
| pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> { |
| self.fields |
| .iter() |
| .enumerate() |
| .find(|&(_, c)| c.name == name) |
| } |
| |
| /// Generate a JSON representation of the `Schema` |
| pub fn to_json(&self) -> Value { |
| json!({ |
| "fields": self.fields.iter().map(|field| field.to_json()).collect::<Vec<Value>>(), |
| "metadata": serde_json::to_value(&self.metadata).unwrap() |
| }) |
| } |
| |
| /// Parse a `Schema` definition from a JSON representation |
| pub fn from(json: &Value) -> Result<Self> { |
| match *json { |
| Value::Object(ref schema) => { |
| let fields = if let Some(Value::Array(fields)) = schema.get("fields") { |
| fields |
| .iter() |
| .map(|f| Field::from(f)) |
| .collect::<Result<_>>()? |
| } else { |
| return Err(ArrowError::ParseError( |
| "Schema fields should be an array".to_string(), |
| )); |
| }; |
| |
| let metadata = if let Some(value) = schema.get("metadata") { |
| Self::from_metadata(value)? |
| } else { |
| HashMap::default() |
| }; |
| |
| Ok(Self { fields, metadata }) |
| } |
| _ => Err(ArrowError::ParseError( |
| "Invalid json value type for schema".to_string(), |
| )), |
| } |
| } |
| |
| /// Parse a `metadata` definition from a JSON representation |
| /// The JSON can either be an Object or an Array of Objects |
| fn from_metadata(json: &Value) -> Result<HashMap<String, String>> { |
| match json { |
| Value::Array(_) => { |
| let mut hashmap = HashMap::new(); |
| let values: Vec<MetadataKeyValue> = serde_json::from_value(json.clone()) |
| .map_err(|_| { |
| ArrowError::JsonError( |
| "Unable to parse object into key-value pair".to_string(), |
| ) |
| })?; |
| for meta in values { |
| hashmap.insert(meta.key.clone(), meta.value); |
| } |
| Ok(hashmap) |
| } |
| Value::Object(md) => md |
| .iter() |
| .map(|(k, v)| { |
| if let Value::String(v) = v { |
| Ok((k.to_string(), v.to_string())) |
| } else { |
| Err(ArrowError::ParseError( |
| "metadata `value` field must be a string".to_string(), |
| )) |
| } |
| }) |
| .collect::<Result<_>>(), |
| _ => Err(ArrowError::ParseError( |
| "`metadata` field must be an object".to_string(), |
| )), |
| } |
| } |
| } |
| |
| impl fmt::Display for Schema { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| f.write_str( |
| &self |
| .fields |
| .iter() |
| .map(|c| c.to_string()) |
| .collect::<Vec<String>>() |
| .join(", "), |
| ) |
| } |
| } |
| |
| /// A reference-counted reference to a [`Schema`](crate::datatypes::Schema). |
| pub type SchemaRef = Arc<Schema>; |
| |
| #[derive(Deserialize)] |
| struct MetadataKeyValue { |
| key: String, |
| value: String, |
| } |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use serde_json::Number; |
| use serde_json::Value::{Bool, Number as VNumber}; |
| use std::f32::NAN; |
| |
| #[test] |
| fn create_struct_type() { |
| let _person = DataType::Struct(vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, false), |
| Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("street", DataType::Utf8, false), |
| Field::new("zip", DataType::UInt16, false), |
| ]), |
| false, |
| ), |
| ]); |
| } |
| |
| #[test] |
| fn serde_struct_type() { |
| let person = DataType::Struct(vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, false), |
| Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("street", DataType::Utf8, false), |
| Field::new("zip", DataType::UInt16, false), |
| ]), |
| false, |
| ), |
| ]); |
| |
| let serialized = serde_json::to_string(&person).unwrap(); |
| |
| // NOTE that this is testing the default (derived) serialization format, not the |
| // JSON format specified in metadata.md |
| |
| assert_eq!( |
| "{\"Struct\":[\ |
| {\"name\":\"first_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ |
| {\"name\":\"last_name\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ |
| {\"name\":\"address\",\"data_type\":{\"data_type\":{\"Struct\":\ |
| [{\"name\":\"street\",\"data_type\":{\"data_type\":\"Utf8\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false},\ |
| {\"name\":\"zip\",\"data_type\":{\"data_type\":\"UInt16\",\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}\ |
| ]},\"nullable\":false},\"dict_id\":0,\"dict_is_ordered\":false}]}", |
| serialized |
| ); |
| |
| let deserialized = serde_json::from_str(&serialized).unwrap(); |
| |
| assert_eq!(person, deserialized); |
| } |
| |
| #[test] |
| fn struct_field_to_json() { |
| let f = Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("street", DataType::Utf8, false), |
| Field::new("zip", DataType::UInt16, false), |
| ]), |
| false, |
| ); |
| let value: Value = serde_json::from_str( |
| r#"{ |
| "name": "address", |
| "nullable": false, |
| "type": { |
| "name": "struct" |
| }, |
| "children": [ |
| { |
| "name": "street", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "zip", |
| "nullable": false, |
| "type": { |
| "name": "int", |
| "bitWidth": 16, |
| "isSigned": false |
| }, |
| "children": [] |
| } |
| ] |
| }"#, |
| ) |
| .unwrap(); |
| assert_eq!(value, f.to_json()); |
| } |
| |
| #[test] |
| fn primitive_field_to_json() { |
| let f = Field::new("first_name", DataType::Utf8, false); |
| let value: Value = serde_json::from_str( |
| r#"{ |
| "name": "first_name", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| }"#, |
| ) |
| .unwrap(); |
| assert_eq!(value, f.to_json()); |
| } |
| #[test] |
| fn parse_struct_from_json() { |
| let json = r#" |
| { |
| "name": "address", |
| "type": { |
| "name": "struct" |
| }, |
| "nullable": false, |
| "children": [ |
| { |
| "name": "street", |
| "type": { |
| "name": "utf8" |
| }, |
| "nullable": false, |
| "children": [] |
| }, |
| { |
| "name": "zip", |
| "type": { |
| "name": "int", |
| "isSigned": false, |
| "bitWidth": 16 |
| }, |
| "nullable": false, |
| "children": [] |
| } |
| ] |
| } |
| "#; |
| let value: Value = serde_json::from_str(json).unwrap(); |
| let dt = Field::from(&value).unwrap(); |
| |
| let expected = Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("street", DataType::Utf8, false), |
| Field::new("zip", DataType::UInt16, false), |
| ]), |
| false, |
| ); |
| |
| assert_eq!(expected, dt); |
| } |
| |
| #[test] |
| fn parse_utf8_from_json() { |
| let json = "{\"name\":\"utf8\"}"; |
| let value: Value = serde_json::from_str(json).unwrap(); |
| let dt = DataType::from(&value).unwrap(); |
| assert_eq!(DataType::Utf8, dt); |
| } |
| |
| #[test] |
| fn parse_int32_from_json() { |
| let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}"; |
| let value: Value = serde_json::from_str(json).unwrap(); |
| let dt = DataType::from(&value).unwrap(); |
| assert_eq!(DataType::Int32, dt); |
| } |
| |
| #[test] |
| fn schema_json() { |
| // Add some custom metadata |
| let metadata: HashMap<String, String> = |
| [("Key".to_string(), "Value".to_string())] |
| .iter() |
| .cloned() |
| .collect(); |
| |
| let schema = Schema::new_with_metadata( |
| vec![ |
| Field::new("c1", DataType::Utf8, false), |
| Field::new("c2", DataType::Binary, false), |
| Field::new("c3", DataType::FixedSizeBinary(3), false), |
| Field::new("c4", DataType::Boolean, false), |
| Field::new("c5", DataType::Date32(DateUnit::Day), false), |
| Field::new("c6", DataType::Date64(DateUnit::Millisecond), false), |
| Field::new("c7", DataType::Time32(TimeUnit::Second), false), |
| Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false), |
| Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false), |
| Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false), |
| Field::new("c11", DataType::Time64(TimeUnit::Second), false), |
| Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false), |
| Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false), |
| Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false), |
| Field::new("c15", DataType::Timestamp(TimeUnit::Second, None), false), |
| Field::new( |
| "c16", |
| DataType::Timestamp( |
| TimeUnit::Millisecond, |
| Some(Arc::new("UTC".to_string())), |
| ), |
| false, |
| ), |
| Field::new( |
| "c17", |
| DataType::Timestamp( |
| TimeUnit::Microsecond, |
| Some(Arc::new("Africa/Johannesburg".to_string())), |
| ), |
| false, |
| ), |
| Field::new( |
| "c18", |
| DataType::Timestamp(TimeUnit::Nanosecond, None), |
| false, |
| ), |
| Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false), |
| Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false), |
| Field::new( |
| "c21", |
| DataType::List(Box::new(NullableDataType::new( |
| DataType::Boolean, |
| true, |
| ))), |
| false, |
| ), |
| Field::new( |
| "c22", |
| DataType::FixedSizeList( |
| Box::new(NullableDataType::new(DataType::Boolean, false)), |
| 5, |
| ), |
| false, |
| ), |
| Field::new( |
| "c23", |
| DataType::List(Box::new(NullableDataType::new( |
| DataType::List(Box::new(NullableDataType::new( |
| DataType::Struct(vec![]), |
| true, |
| ))), |
| false, |
| ))), |
| true, |
| ), |
| Field::new( |
| "c24", |
| DataType::Struct(vec![ |
| Field::new("a", DataType::Utf8, false), |
| Field::new("b", DataType::UInt16, false), |
| ]), |
| false, |
| ), |
| Field::new("c25", DataType::Interval(IntervalUnit::YearMonth), true), |
| Field::new("c26", DataType::Interval(IntervalUnit::DayTime), true), |
| Field::new("c27", DataType::Duration(TimeUnit::Second), false), |
| Field::new("c28", DataType::Duration(TimeUnit::Millisecond), false), |
| Field::new("c29", DataType::Duration(TimeUnit::Microsecond), false), |
| Field::new("c30", DataType::Duration(TimeUnit::Nanosecond), false), |
| Field::new_dict( |
| "c31", |
| DataType::Dictionary( |
| Box::new(DataType::Int32), |
| Box::new(DataType::Utf8), |
| ), |
| true, |
| 123, |
| true, |
| ), |
| Field::new("c32", DataType::LargeBinary, true), |
| Field::new("c33", DataType::LargeUtf8, true), |
| Field::new( |
| "c34", |
| DataType::LargeList(Box::new(NullableDataType::new( |
| DataType::LargeList(Box::new(NullableDataType::new( |
| DataType::Struct(vec![]), |
| false, |
| ))), |
| true, |
| ))), |
| true, |
| ), |
| ], |
| metadata, |
| ); |
| |
| let expected = schema.to_json(); |
| let json = r#"{ |
| "fields": [ |
| { |
| "name": "c1", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c2", |
| "nullable": false, |
| "type": { |
| "name": "binary" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c3", |
| "nullable": false, |
| "type": { |
| "name": "fixedsizebinary", |
| "byteWidth": 3 |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c4", |
| "nullable": false, |
| "type": { |
| "name": "bool" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c5", |
| "nullable": false, |
| "type": { |
| "name": "date", |
| "unit": "DAY" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c6", |
| "nullable": false, |
| "type": { |
| "name": "date", |
| "unit": "MILLISECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c7", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 32, |
| "unit": "SECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c8", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 32, |
| "unit": "MILLISECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c9", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 32, |
| "unit": "MICROSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c10", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 32, |
| "unit": "NANOSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c11", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 64, |
| "unit": "SECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c12", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 64, |
| "unit": "MILLISECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c13", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 64, |
| "unit": "MICROSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c14", |
| "nullable": false, |
| "type": { |
| "name": "time", |
| "bitWidth": 64, |
| "unit": "NANOSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c15", |
| "nullable": false, |
| "type": { |
| "name": "timestamp", |
| "unit": "SECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c16", |
| "nullable": false, |
| "type": { |
| "name": "timestamp", |
| "unit": "MILLISECOND", |
| "timezone": "UTC" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c17", |
| "nullable": false, |
| "type": { |
| "name": "timestamp", |
| "unit": "MICROSECOND", |
| "timezone": "Africa/Johannesburg" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c18", |
| "nullable": false, |
| "type": { |
| "name": "timestamp", |
| "unit": "NANOSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c19", |
| "nullable": false, |
| "type": { |
| "name": "interval", |
| "unit": "DAY_TIME" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c20", |
| "nullable": false, |
| "type": { |
| "name": "interval", |
| "unit": "YEAR_MONTH" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c21", |
| "nullable": false, |
| "type": { |
| "name": "list" |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": true, |
| "type": { |
| "name": "bool" |
| }, |
| "children": [] |
| } |
| ] |
| }, |
| { |
| "name": "c22", |
| "nullable": false, |
| "type": { |
| "name": "fixedsizelist", |
| "listSize": 5 |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": false, |
| "type": { |
| "name": "bool" |
| }, |
| "children": [] |
| } |
| ] |
| }, |
| { |
| "name": "c23", |
| "nullable": true, |
| "type": { |
| "name": "list" |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": false, |
| "type": { |
| "name": "list" |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": true, |
| "type": { |
| "name": "struct" |
| }, |
| "children": [] |
| } |
| ] |
| } |
| ] |
| }, |
| { |
| "name": "c24", |
| "nullable": false, |
| "type": { |
| "name": "struct" |
| }, |
| "children": [ |
| { |
| "name": "a", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "b", |
| "nullable": false, |
| "type": { |
| "name": "int", |
| "bitWidth": 16, |
| "isSigned": false |
| }, |
| "children": [] |
| } |
| ] |
| }, |
| { |
| "name": "c25", |
| "nullable": true, |
| "type": { |
| "name": "interval", |
| "unit": "YEAR_MONTH" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c26", |
| "nullable": true, |
| "type": { |
| "name": "interval", |
| "unit": "DAY_TIME" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c27", |
| "nullable": false, |
| "type": { |
| "name": "duration", |
| "unit": "SECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c28", |
| "nullable": false, |
| "type": { |
| "name": "duration", |
| "unit": "MILLISECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c29", |
| "nullable": false, |
| "type": { |
| "name": "duration", |
| "unit": "MICROSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c30", |
| "nullable": false, |
| "type": { |
| "name": "duration", |
| "unit": "NANOSECOND" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c31", |
| "nullable": true, |
| "children": [], |
| "type": { |
| "name": "utf8" |
| }, |
| "dictionary": { |
| "id": 123, |
| "indexType": { |
| "name": "int", |
| "bitWidth": 32, |
| "isSigned": true |
| }, |
| "isOrdered": true |
| } |
| }, |
| { |
| "name": "c32", |
| "nullable": true, |
| "type": { |
| "name": "largebinary" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c33", |
| "nullable": true, |
| "type": { |
| "name": "largeutf8" |
| }, |
| "children": [] |
| }, |
| { |
| "name": "c34", |
| "nullable": true, |
| "type": { |
| "name": "largelist" |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": true, |
| "type": { |
| "name": "largelist" |
| }, |
| "children": [ |
| { |
| "name": "item", |
| "nullable": false, |
| "type": { |
| "name": "struct" |
| }, |
| "children": [] |
| } |
| ] |
| } |
| ] |
| } |
| ], |
| "metadata" : { |
| "Key": "Value" |
| } |
| }"#; |
| let value: Value = serde_json::from_str(&json).unwrap(); |
| assert_eq!(expected, value); |
| |
| // convert back to a schema |
| let value: Value = serde_json::from_str(&json).unwrap(); |
| let schema2 = Schema::from(&value).unwrap(); |
| |
| assert_eq!(schema, schema2); |
| |
| // Check that empty metadata produces empty value in JSON and can be parsed |
| let json = r#"{ |
| "fields": [ |
| { |
| "name": "c1", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| } |
| ], |
| "metadata": {} |
| }"#; |
| let value: Value = serde_json::from_str(&json).unwrap(); |
| let schema = Schema::from(&value).unwrap(); |
| assert!(schema.metadata.is_empty()); |
| |
| // Check that metadata field is not required in the JSON. |
| let json = r#"{ |
| "fields": [ |
| { |
| "name": "c1", |
| "nullable": false, |
| "type": { |
| "name": "utf8" |
| }, |
| "children": [] |
| } |
| ] |
| }"#; |
| let value: Value = serde_json::from_str(&json).unwrap(); |
| let schema = Schema::from(&value).unwrap(); |
| assert!(schema.metadata.is_empty()); |
| } |
| |
| #[test] |
| fn create_schema_string() { |
| let schema = person_schema(); |
| assert_eq!(schema.to_string(), "first_name: Utf8, \ |
| last_name: Utf8, \ |
| address: Struct([\ |
| Field { name: \"street\", data_type: NullableDataType { data_type: Utf8, nullable: false }, dict_id: 0, dict_is_ordered: false }, \ |
| Field { name: \"zip\", data_type: NullableDataType { data_type: UInt16, nullable: false }, dict_id: 0, dict_is_ordered: false }])") |
| } |
| |
| #[test] |
| fn schema_field_accessors() { |
| let schema = person_schema(); |
| |
| // test schema accessors |
| assert_eq!(schema.fields().len(), 3); |
| |
| // test field accessors |
| let first_name = &schema.fields()[0]; |
| assert_eq!(first_name.name(), "first_name"); |
| assert_eq!(first_name.data_type(), &DataType::Utf8); |
| assert_eq!(first_name.is_nullable(), false); |
| } |
| |
| #[test] |
| #[should_panic( |
| expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\"]" |
| )] |
| fn schema_index_of() { |
| let schema = person_schema(); |
| assert_eq!(schema.index_of("first_name").unwrap(), 0); |
| assert_eq!(schema.index_of("last_name").unwrap(), 1); |
| schema.index_of("nickname").unwrap(); |
| } |
| |
| #[test] |
| #[should_panic( |
| expected = "Unable to get field named \\\"nickname\\\". Valid fields: [\\\"first_name\\\", \\\"last_name\\\", \\\"address\\\"]" |
| )] |
| fn schema_field_with_name() { |
| let schema = person_schema(); |
| assert_eq!( |
| schema.field_with_name("first_name").unwrap().name(), |
| "first_name" |
| ); |
| assert_eq!( |
| schema.field_with_name("last_name").unwrap().name(), |
| "last_name" |
| ); |
| schema.field_with_name("nickname").unwrap(); |
| } |
| |
| #[test] |
| fn schema_equality() { |
| let schema1 = Schema::new(vec![ |
| Field::new("c1", DataType::Utf8, false), |
| Field::new("c2", DataType::Float64, true), |
| Field::new("c3", DataType::LargeBinary, true), |
| ]); |
| let schema2 = Schema::new(vec![ |
| Field::new("c1", DataType::Utf8, false), |
| Field::new("c2", DataType::Float64, true), |
| Field::new("c3", DataType::LargeBinary, true), |
| ]); |
| |
| assert_eq!(schema1, schema2); |
| |
| let schema3 = Schema::new(vec![ |
| Field::new("c1", DataType::Utf8, false), |
| Field::new("c2", DataType::Float32, true), |
| ]); |
| let schema4 = Schema::new(vec![ |
| Field::new("C1", DataType::Utf8, false), |
| Field::new("C2", DataType::Float64, true), |
| ]); |
| |
| assert!(schema1 != schema3); |
| assert!(schema1 != schema4); |
| assert!(schema2 != schema3); |
| assert!(schema2 != schema4); |
| assert!(schema3 != schema4); |
| } |
| |
| #[test] |
| fn test_arrow_native_type_to_json() { |
| assert_eq!(Some(Bool(true)), true.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value()); |
| assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value()); |
| assert_eq!( |
| Some(VNumber(Number::from_f64(0.01f64).unwrap())), |
| 0.01.into_json_value() |
| ); |
| assert_eq!( |
| Some(VNumber(Number::from_f64(0.01f64).unwrap())), |
| 0.01f64.into_json_value() |
| ); |
| assert_eq!(None, NAN.into_json_value()); |
| } |
| |
| fn person_schema() -> Schema { |
| Schema::new(vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, false), |
| Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("street", DataType::Utf8, false), |
| Field::new("zip", DataType::UInt16, false), |
| ]), |
| false, |
| ), |
| ]) |
| } |
| |
| #[test] |
| fn test_schema_merge() -> Result<()> { |
| let merged = Schema::try_merge(&[ |
| Schema::new(vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, false), |
| Field::new( |
| "address", |
| DataType::Struct(vec![Field::new("zip", DataType::UInt16, false)]), |
| false, |
| ), |
| ]), |
| Schema::new_with_metadata( |
| vec![ |
| // nullable merge |
| Field::new("last_name", DataType::Utf8, true), |
| Field::new( |
| "address", |
| DataType::Struct(vec![ |
| // add new nested field |
| Field::new("street", DataType::Utf8, false), |
| // nullable merge on nested field |
| Field::new("zip", DataType::UInt16, true), |
| ]), |
| false, |
| ), |
| // new field |
| Field::new("number", DataType::Utf8, true), |
| ], |
| [("foo".to_string(), "bar".to_string())] |
| .iter() |
| .cloned() |
| .collect::<HashMap<String, String>>(), |
| ), |
| ])?; |
| |
| assert_eq!( |
| merged, |
| Schema::new_with_metadata( |
| vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, true), |
| Field::new( |
| "address", |
| DataType::Struct(vec![ |
| Field::new("zip", DataType::UInt16, true), |
| Field::new("street", DataType::Utf8, false), |
| ]), |
| false, |
| ), |
| Field::new("number", DataType::Utf8, true), |
| ], |
| [("foo".to_string(), "bar".to_string())] |
| .iter() |
| .cloned() |
| .collect::<HashMap<String, String>>() |
| ) |
| ); |
| |
| // support merge union fields |
| assert_eq!( |
| Schema::try_merge(&[ |
| Schema::new(vec![Field::new( |
| "c1", |
| DataType::Union(vec![ |
| Field::new("c11", DataType::Utf8, true), |
| Field::new("c12", DataType::Utf8, true), |
| ]), |
| false |
| ),]), |
| Schema::new(vec![Field::new( |
| "c1", |
| DataType::Union(vec![ |
| Field::new("c12", DataType::Utf8, true), |
| Field::new("c13", DataType::Time64(TimeUnit::Second), true), |
| ]), |
| false |
| ),]) |
| ])?, |
| Schema::new(vec![Field::new( |
| "c1", |
| DataType::Union(vec![ |
| Field::new("c11", DataType::Utf8, true), |
| Field::new("c12", DataType::Utf8, true), |
| Field::new("c13", DataType::Time64(TimeUnit::Second), true), |
| ]), |
| false |
| ),]), |
| ); |
| |
| // incompatible field should throw error |
| assert!(Schema::try_merge(&[ |
| Schema::new(vec![ |
| Field::new("first_name", DataType::Utf8, false), |
| Field::new("last_name", DataType::Utf8, false), |
| ]), |
| Schema::new(vec![Field::new("last_name", DataType::Int64, false),]) |
| ]) |
| .is_err()); |
| |
| // incompatible metadata should throw error |
| assert!(Schema::try_merge(&[ |
| Schema::new_with_metadata( |
| vec![Field::new("first_name", DataType::Utf8, false)], |
| [("foo".to_string(), "bar".to_string()),] |
| .iter() |
| .cloned() |
| .collect::<HashMap<String, String>>() |
| ), |
| Schema::new_with_metadata( |
| vec![Field::new("last_name", DataType::Utf8, false)], |
| [("foo".to_string(), "baz".to_string()),] |
| .iter() |
| .cloned() |
| .collect::<HashMap<String, String>>() |
| ) |
| ]) |
| .is_err()); |
| |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_compare_nested_types() { |
| let list_type_a = &DataType::List(Box::new(NullableDataType::new( |
| DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), |
| true, |
| ))); |
| let list_type_b = &DataType::List(Box::new(NullableDataType::new( |
| DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), |
| true, |
| ))); |
| |
| assert_eq!(list_type_a, list_type_b); |
| } |
| |
| #[test] |
| fn test_compare_mismatching_types() { |
| let list_type_a = &DataType::LargeList(Box::new(NullableDataType::new( |
| DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), |
| true, |
| ))); |
| let list_type_b = &DataType::LargeList(Box::new(NullableDataType::new( |
| DataType::Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)), |
| false, |
| ))); |
| |
| assert_ne!(list_type_a, list_type_b); |
| } |
| } |
| |
| #[cfg(all( |
| test, |
| any(target_arch = "x86", target_arch = "x86_64"), |
| feature = "simd" |
| ))] |
| mod arrow_numeric_type_tests { |
| use crate::datatypes::{ |
| ArrowNumericType, Float32Type, Float64Type, Int32Type, Int64Type, Int8Type, |
| UInt16Type, |
| }; |
| use packed_simd::*; |
| use FromCast; |
| |
| #[test] |
| fn test_mask_f64() { |
| let mask = Float64Type::mask_from_u64(0b10101010); |
| |
| let expected = |
| m64x8::from_cast(i64x8::from_slice_unaligned(&[-1, 0, -1, 0, -1, 0, -1, 0])); |
| |
| assert_eq!(expected, mask); |
| } |
| |
| #[test] |
| fn test_mask_u64() { |
| let mask = Int64Type::mask_from_u64(0b01010101); |
| |
| let expected = |
| m64x8::from_cast(i64x8::from_slice_unaligned(&[0, -1, 0, -1, 0, -1, 0, -1])); |
| |
| assert_eq!(expected, mask); |
| } |
| |
| #[test] |
| fn test_mask_f32() { |
| let mask = Float32Type::mask_from_u64(0b10101010_10101010); |
| |
| let expected = m32x16::from_cast(i32x16::from_slice_unaligned(&[ |
| -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, |
| ])); |
| |
| assert_eq!(expected, mask); |
| } |
| |
| #[test] |
| fn test_mask_i32() { |
| let mask = Int32Type::mask_from_u64(0b01010101_01010101); |
| |
| let expected = m32x16::from_cast(i32x16::from_slice_unaligned(&[ |
| 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, |
| ])); |
| |
| assert_eq!(expected, mask); |
| } |
| |
| #[test] |
| fn test_mask_u16() { |
| let mask = UInt16Type::mask_from_u64(0b01010101_01010101_10101010_10101010); |
| |
| let expected = m16x32::from_cast(i16x32::from_slice_unaligned(&[ |
| -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, |
| 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, |
| ])); |
| |
| assert_eq!(expected, mask); |
| } |
| |
| #[test] |
| fn test_mask_i8() { |
| let mask = Int8Type::mask_from_u64( |
| 0b01010101_01010101_10101010_10101010_01010101_01010101_10101010_10101010, |
| ); |
| |
| let expected = m8x64::from_cast(i8x64::from_slice_unaligned(&[ |
| -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, |
| 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, |
| -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, |
| ])); |
| |
| assert_eq!(expected, mask); |
| } |
| } |