blob: abd526f839eb913151502110dfe43d9a2c26c6da [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines the data-types of Arrow arrays.
//!
//! For an overview of the terminology used within the arrow project and more general
//! information regarding data-types and memory layouts see
//! [here](https://arrow.apache.org/docs/memory_layout.html).
use std::fmt;
use std::mem::size_of;
#[cfg(feature = "simd")]
use std::ops::{Add, Div, Mul, Sub};
use std::slice::from_raw_parts;
use std::str::FromStr;
#[cfg(feature = "simd")]
use packed_simd::*;
use serde_derive::{Deserialize, Serialize};
use serde_json::{json, Number, Value, Value::Number as VNumber};
use crate::error::{ArrowError, Result};
use std::sync::Arc;
/// The possible relative types that are supported.
///
/// The variants of this enum include primitive fixed size types as well as parametric or
/// nested types.
/// Currently the Rust implementation supports the following nested types:
/// - `List<T>`
/// - `Struct<T, U, V, ...>`
///
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
/// [here](https://arrow.apache.org/docs/memory_layout.html).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum DataType {
Boolean,
Int8,
Int16,
Int32,
Int64,
UInt8,
UInt16,
UInt32,
UInt64,
Float16,
Float32,
Float64,
Timestamp(TimeUnit),
Date32(DateUnit),
Date64(DateUnit),
Time32(TimeUnit),
Time64(TimeUnit),
Interval(IntervalUnit),
Utf8,
List(Box<DataType>),
Struct(Vec<Field>),
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum DateUnit {
Day,
Millisecond,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum TimeUnit {
Second,
Millisecond,
Microsecond,
Nanosecond,
}
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum IntervalUnit {
YearMonth,
DayTime,
}
/// Contains the meta-data for a single relative type.
///
/// The `Schema` object is an ordered collection of `Field` objects.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct Field {
name: String,
data_type: DataType,
nullable: bool,
}
pub trait ArrowNativeType:
fmt::Debug + Send + Sync + Copy + PartialOrd + FromStr + 'static
{
fn into_json_value(self) -> Option<Value>;
}
/// Trait indicating a primitive fixed-width type (bool, ints and floats).
pub trait ArrowPrimitiveType: 'static {
/// Corresponding Rust native type for the primitive type.
type Native: ArrowNativeType;
/// Returns the corresponding Arrow data type of this primitive type.
fn get_data_type() -> DataType;
/// Returns the bit width of this primitive type.
fn get_bit_width() -> usize;
/// Returns a default value of this primitive type.
///
/// This is useful for aggregate array ops like `sum()`, `mean()`.
fn default_value() -> Self::Native;
}
impl ArrowNativeType for bool {
fn into_json_value(self) -> Option<Value> {
Some(self.into())
}
}
impl ArrowNativeType for i8 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for i16 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for i32 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for i64 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for u8 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for u16 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for u32 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for u64 {
fn into_json_value(self) -> Option<Value> {
Some(VNumber(Number::from(self)))
}
}
impl ArrowNativeType for f32 {
fn into_json_value(self) -> Option<Value> {
Number::from_f64(self as f64).map(|num| VNumber(num))
}
}
impl ArrowNativeType for f64 {
fn into_json_value(self) -> Option<Value> {
Number::from_f64(self).map(|num| VNumber(num))
}
}
macro_rules! make_type {
($name:ident, $native_ty:ty, $data_ty:expr, $bit_width:expr, $default_val:expr) => {
pub struct $name {}
impl ArrowPrimitiveType for $name {
type Native = $native_ty;
fn get_data_type() -> DataType {
$data_ty
}
fn get_bit_width() -> usize {
$bit_width
}
fn default_value() -> Self::Native {
$default_val
}
}
};
}
make_type!(BooleanType, bool, DataType::Boolean, 1, false);
make_type!(Int8Type, i8, DataType::Int8, 8, 0i8);
make_type!(Int16Type, i16, DataType::Int16, 16, 0i16);
make_type!(Int32Type, i32, DataType::Int32, 32, 0i32);
make_type!(Int64Type, i64, DataType::Int64, 64, 0i64);
make_type!(UInt8Type, u8, DataType::UInt8, 8, 0u8);
make_type!(UInt16Type, u16, DataType::UInt16, 16, 0u16);
make_type!(UInt32Type, u32, DataType::UInt32, 32, 0u32);
make_type!(UInt64Type, u64, DataType::UInt64, 64, 0u64);
make_type!(Float32Type, f32, DataType::Float32, 32, 0.0f32);
make_type!(Float64Type, f64, DataType::Float64, 64, 0.0f64);
make_type!(
TimestampSecondType,
i64,
DataType::Timestamp(TimeUnit::Second),
64,
0i64
);
make_type!(
TimestampMillisecondType,
i64,
DataType::Timestamp(TimeUnit::Millisecond),
64,
0i64
);
make_type!(
TimestampMicrosecondType,
i64,
DataType::Timestamp(TimeUnit::Microsecond),
64,
0i64
);
make_type!(
TimestampNanosecondType,
i64,
DataType::Timestamp(TimeUnit::Nanosecond),
64,
0i64
);
make_type!(Date32Type, i32, DataType::Date32(DateUnit::Day), 32, 0i32);
make_type!(
Date64Type,
i64,
DataType::Date64(DateUnit::Millisecond),
64,
0i64
);
make_type!(
Time32SecondType,
i32,
DataType::Time32(TimeUnit::Second),
32,
0i32
);
make_type!(
Time32MillisecondType,
i32,
DataType::Time32(TimeUnit::Millisecond),
32,
0i32
);
make_type!(
Time64MicrosecondType,
i64,
DataType::Time64(TimeUnit::Microsecond),
64,
0i64
);
make_type!(
Time64NanosecondType,
i64,
DataType::Time64(TimeUnit::Nanosecond),
64,
0i64
);
make_type!(
IntervalYearMonthType,
i64,
DataType::Interval(IntervalUnit::YearMonth),
64,
0i64
);
make_type!(
IntervalDayTimeType,
i64,
DataType::Interval(IntervalUnit::DayTime),
64,
0i64
);
/// A subtype of primitive type that represents numeric values.
///
/// SIMD operations are defined in this trait if available on the target system.
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
pub trait ArrowNumericType: ArrowPrimitiveType
where
Self::Simd: Add<Output = Self::Simd>
+ Sub<Output = Self::Simd>
+ Mul<Output = Self::Simd>
+ Div<Output = Self::Simd>,
{
/// Defines the SIMD type that should be used for this numeric type
type Simd;
/// Defines the SIMD Mask type that should be used for this numeric type
type SimdMask;
/// The number of SIMD lanes available
fn lanes() -> usize;
/// Initializes a SIMD register to a constant value
fn init(value: Self::Native) -> Self::Simd;
/// Loads a slice into a SIMD register
fn load(slice: &[Self::Native]) -> Self::Simd;
/// Creates a new SIMD mask for this SIMD type filling it with `value`
fn mask_init(value: bool) -> Self::SimdMask;
/// Gets the value of a single lane in a SIMD mask
fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool;
/// Sets the value of a single lane of a SIMD mask
fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask;
/// Selects elements of `a` and `b` using `mask`
fn mask_select(mask: Self::SimdMask, a: Self::Simd, b: Self::Simd) -> Self::Simd;
/// Returns `true` if any of the lanes in the mask are `true`
fn mask_any(mask: Self::SimdMask) -> bool;
/// Performs a SIMD binary operation
fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>(
left: Self::Simd,
right: Self::Simd,
op: F,
) -> Self::Simd;
// SIMD version of equal
fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
// SIMD version of not equal
fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
// SIMD version of less than
fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
// SIMD version of less than or equal to
fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
// SIMD version of greater than
fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
// SIMD version of greater than or equal to
fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask;
/// Writes a SIMD result back to a slice
fn write(simd_result: Self::Simd, slice: &mut [Self::Native]);
}
#[cfg(any(
not(any(target_arch = "x86", target_arch = "x86_64")),
not(feature = "simd")
))]
pub trait ArrowNumericType: ArrowPrimitiveType {}
macro_rules! make_numeric_type {
($impl_ty:ty, $native_ty:ty, $simd_ty:ident, $simd_mask_ty:ident) => {
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "simd"))]
impl ArrowNumericType for $impl_ty {
type Simd = $simd_ty;
type SimdMask = $simd_mask_ty;
fn lanes() -> usize {
Self::Simd::lanes()
}
fn init(value: Self::Native) -> Self::Simd {
Self::Simd::splat(value)
}
fn load(slice: &[Self::Native]) -> Self::Simd {
unsafe { Self::Simd::from_slice_unaligned_unchecked(slice) }
}
fn mask_init(value: bool) -> Self::SimdMask {
Self::SimdMask::splat(value)
}
fn mask_get(mask: &Self::SimdMask, idx: usize) -> bool {
unsafe { mask.extract_unchecked(idx) }
}
fn mask_set(mask: Self::SimdMask, idx: usize, value: bool) -> Self::SimdMask {
unsafe { mask.replace_unchecked(idx, value) }
}
/// Selects elements of `a` and `b` using `mask`
fn mask_select(
mask: Self::SimdMask,
a: Self::Simd,
b: Self::Simd,
) -> Self::Simd {
mask.select(a, b)
}
fn mask_any(mask: Self::SimdMask) -> bool {
mask.any()
}
fn bin_op<F: Fn(Self::Simd, Self::Simd) -> Self::Simd>(
left: Self::Simd,
right: Self::Simd,
op: F,
) -> Self::Simd {
op(left, right)
}
fn eq(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.eq(right)
}
fn ne(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.ne(right)
}
fn lt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.lt(right)
}
fn le(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.le(right)
}
fn gt(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.gt(right)
}
fn ge(left: Self::Simd, right: Self::Simd) -> Self::SimdMask {
left.ge(right)
}
fn write(simd_result: Self::Simd, slice: &mut [Self::Native]) {
unsafe { simd_result.write_to_slice_unaligned_unchecked(slice) };
}
}
#[cfg(any(
not(any(target_arch = "x86", target_arch = "x86_64")),
not(feature = "simd")
))]
impl ArrowNumericType for $impl_ty {}
};
}
make_numeric_type!(Int8Type, i8, i8x64, m8x64);
make_numeric_type!(Int16Type, i16, i16x32, m16x32);
make_numeric_type!(Int32Type, i32, i32x16, m32x16);
make_numeric_type!(Int64Type, i64, i64x8, m64x8);
make_numeric_type!(UInt8Type, u8, u8x64, m8x64);
make_numeric_type!(UInt16Type, u16, u16x32, m16x32);
make_numeric_type!(UInt32Type, u32, u32x16, m32x16);
make_numeric_type!(UInt64Type, u64, u64x8, m64x8);
make_numeric_type!(Float32Type, f32, f32x16, m32x16);
make_numeric_type!(Float64Type, f64, f64x8, m64x8);
make_numeric_type!(TimestampSecondType, i64, i64x8, m64x8);
make_numeric_type!(TimestampMillisecondType, i64, i64x8, m64x8);
make_numeric_type!(TimestampMicrosecondType, i64, i64x8, m64x8);
make_numeric_type!(TimestampNanosecondType, i64, i64x8, m64x8);
make_numeric_type!(Date32Type, i32, i32x16, m32x16);
make_numeric_type!(Date64Type, i64, i64x8, m64x8);
make_numeric_type!(Time32SecondType, i32, i32x16, m32x16);
make_numeric_type!(Time32MillisecondType, i32, i32x16, m32x16);
make_numeric_type!(Time64MicrosecondType, i64, i64x8, m64x8);
make_numeric_type!(Time64NanosecondType, i64, i64x8, m64x8);
make_numeric_type!(IntervalYearMonthType, i64, i64x8, m64x8);
make_numeric_type!(IntervalDayTimeType, i64, i64x8, m64x8);
/// A subtype of primitive type that represents temporal values.
pub trait ArrowTemporalType: ArrowPrimitiveType {}
impl ArrowTemporalType for TimestampSecondType {}
impl ArrowTemporalType for TimestampMillisecondType {}
impl ArrowTemporalType for TimestampMicrosecondType {}
impl ArrowTemporalType for TimestampNanosecondType {}
impl ArrowTemporalType for Date32Type {}
impl ArrowTemporalType for Date64Type {}
impl ArrowTemporalType for Time32SecondType {}
impl ArrowTemporalType for Time32MillisecondType {}
impl ArrowTemporalType for Time64MicrosecondType {}
impl ArrowTemporalType for Time64NanosecondType {}
impl ArrowTemporalType for IntervalYearMonthType {}
impl ArrowTemporalType for IntervalDayTimeType {}
/// Allows conversion from supported Arrow types to a byte slice.
pub trait ToByteSlice {
/// Converts this instance into a byte slice
fn to_byte_slice(&self) -> &[u8];
}
impl<T: ArrowNativeType> ToByteSlice for [T] {
fn to_byte_slice(&self) -> &[u8] {
let raw_ptr = self.as_ptr() as *const T as *const u8;
unsafe { from_raw_parts(raw_ptr, self.len() * size_of::<T>()) }
}
}
impl<T: ArrowNativeType> ToByteSlice for T {
fn to_byte_slice(&self) -> &[u8] {
let raw_ptr = self as *const T as *const u8;
unsafe { from_raw_parts(raw_ptr, size_of::<T>()) }
}
}
impl DataType {
/// Parse a data type from a JSON representation
fn from(json: &Value) -> Result<DataType> {
match *json {
Value::Object(ref map) => match map.get("name") {
Some(s) if s == "bool" => Ok(DataType::Boolean),
Some(s) if s == "utf8" => Ok(DataType::Utf8),
Some(s) if s == "floatingpoint" => match map.get("precision") {
Some(p) if p == "HALF" => Ok(DataType::Float16),
Some(p) if p == "SINGLE" => Ok(DataType::Float32),
Some(p) if p == "DOUBLE" => Ok(DataType::Float64),
_ => Err(ArrowError::ParseError(
"floatingpoint precision missing or invalid".to_string(),
)),
},
Some(s) if s == "timestamp" => match map.get("unit") {
Some(p) if p == "SECOND" => Ok(DataType::Timestamp(TimeUnit::Second)),
Some(p) if p == "MILLISECOND" => {
Ok(DataType::Timestamp(TimeUnit::Millisecond))
}
Some(p) if p == "MICROSECOND" => {
Ok(DataType::Timestamp(TimeUnit::Microsecond))
}
Some(p) if p == "NANOSECOND" => {
Ok(DataType::Timestamp(TimeUnit::Nanosecond))
}
_ => Err(ArrowError::ParseError(
"timestamp unit missing or invalid".to_string(),
)),
},
Some(s) if s == "date" => match map.get("unit") {
Some(p) if p == "DAY" => Ok(DataType::Date32(DateUnit::Day)),
Some(p) if p == "MILLISECOND" => {
Ok(DataType::Date64(DateUnit::Millisecond))
}
_ => Err(ArrowError::ParseError(
"date unit missing or invalid".to_string(),
)),
},
Some(s) if s == "time" => {
let unit = match map.get("unit") {
Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond),
Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond),
Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond),
_ => Err(ArrowError::ParseError(
"time unit missing or invalid".to_string(),
)),
};
match map.get("bitWidth") {
Some(p) if p == "32" => Ok(DataType::Time32(unit?)),
Some(p) if p == "64" => Ok(DataType::Time32(unit?)),
_ => Err(ArrowError::ParseError(
"time bitWidth missing or invalid".to_string(),
)),
}
}
Some(s) if s == "interval" => match map.get("unit") {
Some(p) if p == "DAY_TIME" => {
Ok(DataType::Interval(IntervalUnit::DayTime))
}
Some(p) if p == "YEAR_MONTH" => {
Ok(DataType::Interval(IntervalUnit::YearMonth))
}
_ => Err(ArrowError::ParseError(
"interval unit missing or invalid".to_string(),
)),
},
Some(s) if s == "int" => match map.get("isSigned") {
Some(&Value::Bool(true)) => match map.get("bitWidth") {
Some(&Value::Number(ref n)) => match n.as_u64() {
Some(8) => Ok(DataType::Int8),
Some(16) => Ok(DataType::Int16),
Some(32) => Ok(DataType::Int32),
Some(64) => Ok(DataType::Int32),
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
Some(&Value::Bool(false)) => match map.get("bitWidth") {
Some(&Value::Number(ref n)) => match n.as_u64() {
Some(8) => Ok(DataType::UInt8),
Some(16) => Ok(DataType::UInt16),
Some(32) => Ok(DataType::UInt32),
Some(64) => Ok(DataType::UInt64),
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int signed missing or invalid".to_string(),
)),
},
Some(other) => Err(ArrowError::ParseError(format!(
"invalid type name: {}",
other
))),
None => match map.get("fields") {
Some(&Value::Array(ref fields_array)) => {
let fields = fields_array
.iter()
.map(|f| Field::from(f))
.collect::<Result<Vec<Field>>>();
Ok(DataType::Struct(fields?))
}
_ => Err(ArrowError::ParseError("empty type".to_string())),
},
},
_ => Err(ArrowError::ParseError(
"invalid json value type".to_string(),
)),
}
}
/// Generate a JSON representation of the data type
pub fn to_json(&self) -> Value {
match self {
DataType::Boolean => json!({"name": "bool"}),
DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}),
DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}),
DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}),
DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}),
DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}),
DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}),
DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}),
DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}),
DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}),
DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}),
DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}),
DataType::Utf8 => json!({"name": "utf8"}),
DataType::Struct(ref fields) => {
let field_json_array = Value::Array(
fields.iter().map(|f| f.to_json()).collect::<Vec<Value>>(),
);
json!({ "fields": field_json_array })
}
DataType::List(ref t) => {
let child_json = t.to_json();
json!({ "name": "list", "children": child_json })
}
DataType::Time32(unit) => {
json!({"name": "time", "bitWidth": "32", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}})
}
DataType::Time64(unit) => {
json!({"name": "time", "bitWidth": "64", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}})
}
DataType::Date32(unit) | DataType::Date64(unit) => {
json!({"name": "date", "unit": match unit {
DateUnit::Day => "DAY",
DateUnit::Millisecond => "MILLISECOND",
}})
}
DataType::Timestamp(unit) => json!({"name": "timestamp", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}}),
DataType::Interval(unit) => json!({"name": "interval", "unit": match unit {
IntervalUnit::YearMonth => "YEAR_MONTH",
IntervalUnit::DayTime => "DAY_TIME",
}}),
}
}
}
impl Field {
/// Creates a new field
pub fn new(name: &str, data_type: DataType, nullable: bool) -> Self {
Field {
name: name.to_string(),
data_type,
nullable,
}
}
/// Returns an immutable reference to the `Field`'s name
pub fn name(&self) -> &String {
&self.name
}
/// Returns an immutable reference to the `Field`'s data-type
pub fn data_type(&self) -> &DataType {
&self.data_type
}
/// Indicates whether this `Field` supports null values
pub fn is_nullable(&self) -> bool {
self.nullable
}
/// Parse a `Field` definition from a JSON representation
pub fn from(json: &Value) -> Result<Self> {
match *json {
Value::Object(ref map) => {
let name = match map.get("name") {
Some(&Value::String(ref name)) => name.to_string(),
_ => {
return Err(ArrowError::ParseError(
"Field missing 'name' attribute".to_string(),
));
}
};
let nullable = match map.get("nullable") {
Some(&Value::Bool(b)) => b,
_ => {
return Err(ArrowError::ParseError(
"Field missing 'nullable' attribute".to_string(),
));
}
};
let data_type = match map.get("type") {
Some(t) => DataType::from(t)?,
_ => {
return Err(ArrowError::ParseError(
"Field missing 'type' attribute".to_string(),
));
}
};
Ok(Field {
name,
nullable,
data_type,
})
}
_ => Err(ArrowError::ParseError(
"Invalid json value type for field".to_string(),
)),
}
}
/// Generate a JSON representation of the `Field`
pub fn to_json(&self) -> Value {
json!({
"name": self.name,
"nullable": self.nullable,
"type": self.data_type.to_json(),
})
}
/// Converts to a `String` representation of the `Field`
pub fn to_string(&self) -> String {
format!("{}: {:?}", self.name, self.data_type)
}
}
impl fmt::Display for Field {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", self.to_string())
}
}
/// Describes the meta-data of an ordered sequence of relative types.
///
/// Note that this information is only part of the meta-data and not part of the physical
/// memory layout.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct Schema {
pub(crate) fields: Vec<Field>,
}
impl Schema {
/// Creates an empty `Schema`
pub fn empty() -> Self {
Self { fields: vec![] }
}
/// Creates a new `Schema` from a sequence of `Field` values
///
/// # Example
///
/// ```
/// # extern crate arrow;
/// # use arrow::datatypes::{Field, DataType, Schema};
/// let field_a = Field::new("a", DataType::Int64, false);
/// let field_b = Field::new("b", DataType::Boolean, false);
///
/// let schema = Schema::new(vec![field_a, field_b]);
/// ```
pub fn new(fields: Vec<Field>) -> Self {
Self { fields }
}
/// Returns an immutable reference of the vector of `Field` instances
pub fn fields(&self) -> &Vec<Field> {
&self.fields
}
/// Returns an immutable reference of a specific `Field` instance selected using an
/// offset within the internal `fields` vector
pub fn field(&self, i: usize) -> &Field {
&self.fields[i]
}
/// Look up a column by name and return a immutable reference to the column along with
/// it's index
pub fn column_with_name(&self, name: &str) -> Option<(usize, &Field)> {
self.fields
.iter()
.enumerate()
.find(|&(_, c)| c.name == name)
}
/// Generate a JSON representation of the `Field`
pub fn to_json(&self) -> Value {
json!({
"fields": self.fields.iter().map(|field| field.to_json()).collect::<Vec<Value>>(),
})
}
}
impl fmt::Display for Schema {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str(
&self
.fields
.iter()
.map(|c| c.to_string())
.collect::<Vec<String>>()
.join(", "),
)
}
}
pub type SchemaRef = Arc<Schema>;
#[cfg(test)]
mod tests {
use super::*;
use serde_json;
use serde_json::Number;
use serde_json::Value::{Bool, Number as VNumber};
use std::f32::NAN;
#[test]
fn create_struct_type() {
let _person = DataType::Struct(vec![
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
),
]);
}
#[test]
fn serde_struct_type() {
let person = DataType::Struct(vec![
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
),
]);
let serialized = serde_json::to_string(&person).unwrap();
// NOTE that this is testing the default (derived) serialization format, not the
// JSON format specified in metadata.md
assert_eq!(
"{\"Struct\":[\
{\"name\":\"first_name\",\"data_type\":\"Utf8\",\"nullable\":false},\
{\"name\":\"last_name\",\"data_type\":\"Utf8\",\"nullable\":false},\
{\"name\":\"address\",\"data_type\":{\"Struct\":\
[{\"name\":\"street\",\"data_type\":\"Utf8\",\"nullable\":false},\
{\"name\":\"zip\",\"data_type\":\"UInt16\",\"nullable\":false}\
]},\"nullable\":false}]}",
serialized
);
let deserialized = serde_json::from_str(&serialized).unwrap();
assert_eq!(person, deserialized);
}
#[test]
fn struct_field_to_json() {
let f = Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
);
assert_eq!(
"{\"name\":\"address\",\"nullable\":false,\"type\":{\"fields\":[\
{\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
{\"name\":\"zip\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}",
f.to_json().to_string()
);
}
#[test]
fn primitive_field_to_json() {
let f = Field::new("first_name", DataType::Utf8, false);
assert_eq!(
"{\"name\":\"first_name\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}}",
f.to_json().to_string()
);
}
#[test]
fn parse_struct_from_json() {
let json = "{\"name\":\"address\",\"nullable\":false,\"type\":{\"fields\":[\
{\"name\":\"street\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
{\"name\":\"zip\",\"nullable\":false,\"type\":{\"bitWidth\":16,\"isSigned\":false,\"name\":\"int\"}}]}}";
let value: Value = serde_json::from_str(json).unwrap();
let dt = Field::from(&value).unwrap();
let expected = Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
);
assert_eq!(expected, dt);
}
#[test]
fn parse_utf8_from_json() {
let json = "{\"name\":\"utf8\"}";
let value: Value = serde_json::from_str(json).unwrap();
let dt = DataType::from(&value).unwrap();
assert_eq!(DataType::Utf8, dt);
}
#[test]
fn parse_int32_from_json() {
let json = "{\"name\": \"int\", \"isSigned\": true, \"bitWidth\": 32}";
let value: Value = serde_json::from_str(json).unwrap();
let dt = DataType::from(&value).unwrap();
assert_eq!(DataType::Int32, dt);
}
#[test]
fn schema_json() {
let schema = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Date32(DateUnit::Day), false),
Field::new("c3", DataType::Date64(DateUnit::Millisecond), false),
Field::new("c7", DataType::Time32(TimeUnit::Second), false),
Field::new("c8", DataType::Time32(TimeUnit::Millisecond), false),
Field::new("c9", DataType::Time32(TimeUnit::Microsecond), false),
Field::new("c10", DataType::Time32(TimeUnit::Nanosecond), false),
Field::new("c11", DataType::Time64(TimeUnit::Second), false),
Field::new("c12", DataType::Time64(TimeUnit::Millisecond), false),
Field::new("c13", DataType::Time64(TimeUnit::Microsecond), false),
Field::new("c14", DataType::Time64(TimeUnit::Nanosecond), false),
Field::new("c15", DataType::Timestamp(TimeUnit::Second), false),
Field::new("c16", DataType::Timestamp(TimeUnit::Millisecond), false),
Field::new("c17", DataType::Timestamp(TimeUnit::Microsecond), false),
Field::new("c18", DataType::Timestamp(TimeUnit::Nanosecond), false),
Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
Field::new(
"c21",
DataType::Struct(vec![
Field::new("a", DataType::Utf8, false),
Field::new("b", DataType::UInt16, false),
]),
false,
),
]);
let json = schema.to_json().to_string();
assert_eq!(json, "{\"fields\":[{\"name\":\"c1\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
{\"name\":\"c2\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"DAY\"}},\
{\"name\":\"c3\",\"nullable\":false,\"type\":{\"name\":\"date\",\"unit\":\"MILLISECOND\"}},\
{\"name\":\"c7\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"SECOND\"}},\
{\"name\":\"c8\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MILLISECOND\"}},\
{\"name\":\"c9\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"MICROSECOND\"}},\
{\"name\":\"c10\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"32\",\"unit\":\"NANOSECOND\"}},\
{\"name\":\"c11\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"SECOND\"}},\
{\"name\":\"c12\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MILLISECOND\"}},\
{\"name\":\"c13\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"MICROSECOND\"}},\
{\"name\":\"c14\",\"nullable\":false,\"type\":{\"name\":\"time\",\"bitWidth\":\"64\",\"unit\":\"NANOSECOND\"}},\
{\"name\":\"c15\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"SECOND\"}},\
{\"name\":\"c16\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MILLISECOND\"}},\
{\"name\":\"c17\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"MICROSECOND\"}},\
{\"name\":\"c18\",\"nullable\":false,\"type\":{\"name\":\"timestamp\",\"unit\":\"NANOSECOND\"}},\
{\"name\":\"c19\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"DAY_TIME\"}},\
{\"name\":\"c20\",\"nullable\":false,\"type\":{\"name\":\"interval\",\"unit\":\"YEAR_MONTH\"}},\
{\"name\":\"c21\",\"nullable\":false,\"type\":{\"fields\":[\
{\"name\":\"a\",\"nullable\":false,\"type\":{\"name\":\"utf8\"}},\
{\"name\":\"b\",\"nullable\":false,\"type\":{\"name\":\"int\",\"bitWidth\":16,\"isSigned\":false}}]}}]}");
// convert back to a schema
let value: Value = serde_json::from_str(&json).unwrap();
let schema2 = DataType::from(&value).unwrap();
match schema2 {
DataType::Struct(fields) => {
assert_eq!(schema.fields().len(), fields.len());
}
_ => panic!(),
}
}
#[test]
fn create_schema_string() {
let _person = Schema::new(vec![
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
),
]);
assert_eq!(_person.to_string(), "first_name: Utf8, last_name: Utf8, address: Struct([Field { name: \"street\", data_type: Utf8, nullable: false }, Field { name: \"zip\", data_type: UInt16, nullable: false }])")
}
#[test]
fn schema_field_accessors() {
let _person = Schema::new(vec![
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new(
"address",
DataType::Struct(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
false,
),
]);
// test schema accessors
assert_eq!(_person.fields().len(), 3);
// test field accessors
assert_eq!(_person.fields()[0].name(), "first_name");
assert_eq!(_person.fields()[0].data_type(), &DataType::Utf8);
assert_eq!(_person.fields()[0].is_nullable(), false);
}
#[test]
fn schema_equality() {
let schema1 = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
]);
let schema2 = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
]);
assert_eq!(schema1, schema2);
let schema3 = Schema::new(vec![
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float32, true),
]);
let schema4 = Schema::new(vec![
Field::new("C1", DataType::Utf8, false),
Field::new("C2", DataType::Float64, true),
]);
assert!(schema1 != schema3);
assert!(schema1 != schema4);
assert!(schema2 != schema3);
assert!(schema2 != schema4);
assert!(schema3 != schema4);
}
#[test]
fn test_arrow_native_type_to_json() {
assert_eq!(Some(Bool(true)), true.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1i8.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1i16.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1i32.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1i64.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1u8.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1u16.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1u32.into_json_value());
assert_eq!(Some(VNumber(Number::from(1))), 1u64.into_json_value());
assert_eq!(
Some(VNumber(Number::from_f64(0.01 as f64).unwrap())),
0.01.into_json_value()
);
assert_eq!(
Some(VNumber(Number::from_f64(0.01f64).unwrap())),
0.01f64.into_json_value()
);
assert_eq!(None, NAN.into_json_value());
}
}