| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| use std::fmt; |
| |
| use serde_derive::{Deserialize, Serialize}; |
| use serde_json::{json, Value, Value::String as VString}; |
| |
| use crate::error::{ArrowError, Result}; |
| |
| use super::Field; |
| |
| /// The set of datatypes that are supported by this implementation of Apache Arrow. |
| /// |
| /// The Arrow specification on data types includes some more types. |
| /// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs) |
| /// for Arrow's specification. |
| /// |
| /// The variants of this enum include primitive fixed size types as well as parametric or |
| /// nested types. |
| /// Currently the Rust implementation supports the following nested types: |
| /// - `List<T>` |
| /// - `Struct<T, U, V, ...>` |
| /// |
| /// Nested types can themselves be nested within other arrays. |
| /// For more information on these types please see |
| /// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout). |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum DataType { |
| /// Null type |
| Null, |
| /// A boolean datatype representing the values `true` and `false`. |
| Boolean, |
| /// A signed 8-bit integer. |
| Int8, |
| /// A signed 16-bit integer. |
| Int16, |
| /// A signed 32-bit integer. |
| Int32, |
| /// A signed 64-bit integer. |
| Int64, |
| /// An unsigned 8-bit integer. |
| UInt8, |
| /// An unsigned 16-bit integer. |
| UInt16, |
| /// An unsigned 32-bit integer. |
| UInt32, |
| /// An unsigned 64-bit integer. |
| UInt64, |
| /// A 16-bit floating point number. |
| Float16, |
| /// A 32-bit floating point number. |
| Float32, |
| /// A 64-bit floating point number. |
| Float64, |
| /// A timestamp with an optional timezone. |
| /// |
| /// Time is measured as a Unix epoch, counting the seconds from |
| /// 00:00:00.000 on 1 January 1970, excluding leap seconds, |
| /// as a 64-bit integer. |
| /// |
| /// The time zone is a string indicating the name of a time zone, one of: |
| /// |
| /// * As used in the Olson time zone database (the "tz database" or |
| /// "tzdata"), such as "America/New_York" |
| /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 |
| Timestamp(TimeUnit, Option<String>), |
| /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
| /// in days (32 bits). |
| Date32, |
| /// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01) |
| /// in milliseconds (64 bits). Values are evenly divisible by 86400000. |
| Date64, |
| /// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
| Time32(TimeUnit), |
| /// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`. |
| Time64(TimeUnit), |
| /// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds. |
| Duration(TimeUnit), |
| /// A "calendar" interval which models types that don't necessarily |
| /// have a precise duration without the context of a base timestamp (e.g. |
| /// days can differ in length during day light savings time transitions). |
| Interval(IntervalUnit), |
| /// Opaque binary data of variable length. |
| Binary, |
| /// Opaque binary data of fixed size. |
| /// Enum parameter specifies the number of bytes per value. |
| FixedSizeBinary(i32), |
| /// Opaque binary data of variable length and 64-bit offsets. |
| LargeBinary, |
| /// A variable-length string in Unicode with UTF-8 encoding. |
| Utf8, |
| /// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets. |
| LargeUtf8, |
| /// A list of some logical data type with variable length. |
| List(Box<Field>), |
| /// A list of some logical data type with fixed length. |
| FixedSizeList(Box<Field>, i32), |
| /// A list of some logical data type with variable length and 64-bit offsets. |
| LargeList(Box<Field>), |
| /// A nested datatype that contains a number of sub-fields. |
| Struct(Vec<Field>), |
| /// A nested datatype that can represent slots of differing types. |
| Union(Vec<Field>), |
| /// A dictionary encoded array (`key_type`, `value_type`), where |
| /// each array element is an index of `key_type` into an |
| /// associated dictionary of `value_type`. |
| /// |
| /// Dictionary arrays are used to store columns of `value_type` |
| /// that contain many repeated values using less memory, but with |
| /// a higher CPU overhead for some operations. |
| /// |
| /// This type mostly used to represent low cardinality string |
| /// arrays or a limited set of primitive types as integers. |
| Dictionary(Box<DataType>, Box<DataType>), |
| /// Decimal value with precision and scale |
| Decimal(usize, usize), |
| } |
| |
| /// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum TimeUnit { |
| /// Time in seconds. |
| Second, |
| /// Time in milliseconds. |
| Millisecond, |
| /// Time in microseconds. |
| Microsecond, |
| /// Time in nanoseconds. |
| Nanosecond, |
| } |
| |
| /// YEAR_MONTH or DAY_TIME interval in SQL style. |
| #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] |
| pub enum IntervalUnit { |
| /// Indicates the number of elapsed whole months, stored as 4-byte integers. |
| YearMonth, |
| /// Indicates the number of elapsed days and milliseconds, |
| /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). |
| DayTime, |
| } |
| |
| impl fmt::Display for DataType { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| write!(f, "{:?}", self) |
| } |
| } |
| |
| impl DataType { |
| /// Parse a data type from a JSON representation. |
| pub(crate) fn from(json: &Value) -> Result<DataType> { |
| let default_field = Field::new("", DataType::Boolean, true); |
| match *json { |
| Value::Object(ref map) => match map.get("name") { |
| Some(s) if s == "null" => Ok(DataType::Null), |
| Some(s) if s == "bool" => Ok(DataType::Boolean), |
| Some(s) if s == "binary" => Ok(DataType::Binary), |
| Some(s) if s == "largebinary" => Ok(DataType::LargeBinary), |
| Some(s) if s == "utf8" => Ok(DataType::Utf8), |
| Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8), |
| Some(s) if s == "fixedsizebinary" => { |
| // return a list with any type as its child isn't defined in the map |
| if let Some(Value::Number(size)) = map.get("byteWidth") { |
| Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32)) |
| } else { |
| Err(ArrowError::ParseError( |
| "Expecting a byteWidth for fixedsizebinary".to_string(), |
| )) |
| } |
| } |
| Some(s) if s == "decimal" => { |
| // return a list with any type as its child isn't defined in the map |
| let precision = match map.get("precision") { |
| Some(p) => Ok(p.as_u64().unwrap() as usize), |
| None => Err(ArrowError::ParseError( |
| "Expecting a precision for decimal".to_string(), |
| )), |
| }; |
| let scale = match map.get("scale") { |
| Some(s) => Ok(s.as_u64().unwrap() as usize), |
| _ => Err(ArrowError::ParseError( |
| "Expecting a scale for decimal".to_string(), |
| )), |
| }; |
| |
| Ok(DataType::Decimal(precision?, scale?)) |
| } |
| Some(s) if s == "floatingpoint" => match map.get("precision") { |
| Some(p) if p == "HALF" => Ok(DataType::Float16), |
| Some(p) if p == "SINGLE" => Ok(DataType::Float32), |
| Some(p) if p == "DOUBLE" => Ok(DataType::Float64), |
| _ => Err(ArrowError::ParseError( |
| "floatingpoint precision missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "timestamp" => { |
| let unit = match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(TimeUnit::Second), |
| Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), |
| Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), |
| Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), |
| _ => Err(ArrowError::ParseError( |
| "timestamp unit missing or invalid".to_string(), |
| )), |
| }; |
| let tz = match map.get("timezone") { |
| None => Ok(None), |
| Some(VString(tz)) => Ok(Some(tz.clone())), |
| _ => Err(ArrowError::ParseError( |
| "timezone must be a string".to_string(), |
| )), |
| }; |
| Ok(DataType::Timestamp(unit?, tz?)) |
| } |
| Some(s) if s == "date" => match map.get("unit") { |
| Some(p) if p == "DAY" => Ok(DataType::Date32), |
| Some(p) if p == "MILLISECOND" => Ok(DataType::Date64), |
| _ => Err(ArrowError::ParseError( |
| "date unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "time" => { |
| let unit = match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(TimeUnit::Second), |
| Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond), |
| Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond), |
| Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond), |
| _ => Err(ArrowError::ParseError( |
| "time unit missing or invalid".to_string(), |
| )), |
| }; |
| match map.get("bitWidth") { |
| Some(p) if p == 32 => Ok(DataType::Time32(unit?)), |
| Some(p) if p == 64 => Ok(DataType::Time64(unit?)), |
| _ => Err(ArrowError::ParseError( |
| "time bitWidth missing or invalid".to_string(), |
| )), |
| } |
| } |
| Some(s) if s == "duration" => match map.get("unit") { |
| Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)), |
| Some(p) if p == "MILLISECOND" => { |
| Ok(DataType::Duration(TimeUnit::Millisecond)) |
| } |
| Some(p) if p == "MICROSECOND" => { |
| Ok(DataType::Duration(TimeUnit::Microsecond)) |
| } |
| Some(p) if p == "NANOSECOND" => { |
| Ok(DataType::Duration(TimeUnit::Nanosecond)) |
| } |
| _ => Err(ArrowError::ParseError( |
| "time unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "interval" => match map.get("unit") { |
| Some(p) if p == "DAY_TIME" => { |
| Ok(DataType::Interval(IntervalUnit::DayTime)) |
| } |
| Some(p) if p == "YEAR_MONTH" => { |
| Ok(DataType::Interval(IntervalUnit::YearMonth)) |
| } |
| _ => Err(ArrowError::ParseError( |
| "interval unit missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "int" => match map.get("isSigned") { |
| Some(&Value::Bool(true)) => match map.get("bitWidth") { |
| Some(&Value::Number(ref n)) => match n.as_u64() { |
| Some(8) => Ok(DataType::Int8), |
| Some(16) => Ok(DataType::Int16), |
| Some(32) => Ok(DataType::Int32), |
| Some(64) => Ok(DataType::Int64), |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| Some(&Value::Bool(false)) => match map.get("bitWidth") { |
| Some(&Value::Number(ref n)) => match n.as_u64() { |
| Some(8) => Ok(DataType::UInt8), |
| Some(16) => Ok(DataType::UInt16), |
| Some(32) => Ok(DataType::UInt32), |
| Some(64) => Ok(DataType::UInt64), |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int bitWidth missing or invalid".to_string(), |
| )), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "int signed missing or invalid".to_string(), |
| )), |
| }, |
| Some(s) if s == "list" => { |
| // return a list with any type as its child isn't defined in the map |
| Ok(DataType::List(Box::new(default_field))) |
| } |
| Some(s) if s == "largelist" => { |
| // return a largelist with any type as its child isn't defined in the map |
| Ok(DataType::LargeList(Box::new(default_field))) |
| } |
| Some(s) if s == "fixedsizelist" => { |
| // return a list with any type as its child isn't defined in the map |
| if let Some(Value::Number(size)) = map.get("listSize") { |
| Ok(DataType::FixedSizeList( |
| Box::new(default_field), |
| size.as_i64().unwrap() as i32, |
| )) |
| } else { |
| Err(ArrowError::ParseError( |
| "Expecting a listSize for fixedsizelist".to_string(), |
| )) |
| } |
| } |
| Some(s) if s == "struct" => { |
| // return an empty `struct` type as its children aren't defined in the map |
| Ok(DataType::Struct(vec![])) |
| } |
| Some(other) => Err(ArrowError::ParseError(format!( |
| "invalid or unsupported type name: {} in {:?}", |
| other, json |
| ))), |
| None => Err(ArrowError::ParseError("type name missing".to_string())), |
| }, |
| _ => Err(ArrowError::ParseError( |
| "invalid json value type".to_string(), |
| )), |
| } |
| } |
| |
| /// Generate a JSON representation of the data type. |
| pub fn to_json(&self) -> Value { |
| match self { |
| DataType::Null => json!({"name": "null"}), |
| DataType::Boolean => json!({"name": "bool"}), |
| DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}), |
| DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}), |
| DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}), |
| DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}), |
| DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}), |
| DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}), |
| DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}), |
| DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}), |
| DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}), |
| DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}), |
| DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}), |
| DataType::Utf8 => json!({"name": "utf8"}), |
| DataType::LargeUtf8 => json!({"name": "largeutf8"}), |
| DataType::Binary => json!({"name": "binary"}), |
| DataType::LargeBinary => json!({"name": "largebinary"}), |
| DataType::FixedSizeBinary(byte_width) => { |
| json!({"name": "fixedsizebinary", "byteWidth": byte_width}) |
| } |
| DataType::Struct(_) => json!({"name": "struct"}), |
| DataType::Union(_) => json!({"name": "union"}), |
| DataType::List(_) => json!({ "name": "list"}), |
| DataType::LargeList(_) => json!({ "name": "largelist"}), |
| DataType::FixedSizeList(_, length) => { |
| json!({"name":"fixedsizelist", "listSize": length}) |
| } |
| DataType::Time32(unit) => { |
| json!({"name": "time", "bitWidth": 32, "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Time64(unit) => { |
| json!({"name": "time", "bitWidth": 64, "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Date32 => { |
| json!({"name": "date", "unit": "DAY"}) |
| } |
| DataType::Date64 => { |
| json!({"name": "date", "unit": "MILLISECOND"}) |
| } |
| DataType::Timestamp(unit, None) => { |
| json!({"name": "timestamp", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}) |
| } |
| DataType::Timestamp(unit, Some(tz)) => { |
| json!({"name": "timestamp", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }, "timezone": tz}) |
| } |
| DataType::Interval(unit) => json!({"name": "interval", "unit": match unit { |
| IntervalUnit::YearMonth => "YEAR_MONTH", |
| IntervalUnit::DayTime => "DAY_TIME", |
| }}), |
| DataType::Duration(unit) => json!({"name": "duration", "unit": match unit { |
| TimeUnit::Second => "SECOND", |
| TimeUnit::Millisecond => "MILLISECOND", |
| TimeUnit::Microsecond => "MICROSECOND", |
| TimeUnit::Nanosecond => "NANOSECOND", |
| }}), |
| DataType::Dictionary(_, _) => json!({ "name": "dictionary"}), |
| DataType::Decimal(precision, scale) => { |
| json!({"name": "decimal", "precision": precision, "scale": scale}) |
| } |
| } |
| } |
| |
| /// Returns true if this type is numeric: (UInt*, Unit*, or Float*). |
| pub fn is_numeric(t: &DataType) -> bool { |
| use DataType::*; |
| matches!( |
| t, |
| UInt8 |
| | UInt16 |
| | UInt32 |
| | UInt64 |
| | Int8 |
| | Int16 |
| | Int32 |
| | Int64 |
| | Float32 |
| | Float64 |
| ) |
| } |
| |
| /// Compares the datatype with another, ignoring nested field names |
| /// and metadata. |
| pub(crate) fn equals_datatype(&self, other: &DataType) -> bool { |
| match (&self, other) { |
| (DataType::List(a), DataType::List(b)) |
| | (DataType::LargeList(a), DataType::LargeList(b)) => { |
| a.is_nullable() == b.is_nullable() |
| && a.data_type().equals_datatype(b.data_type()) |
| } |
| (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => { |
| a_size == b_size |
| && a.is_nullable() == b.is_nullable() |
| && a.data_type().equals_datatype(b.data_type()) |
| } |
| (DataType::Struct(a), DataType::Struct(b)) => { |
| a.len() == b.len() |
| && a.iter().zip(b).all(|(a, b)| { |
| a.is_nullable() == b.is_nullable() |
| && a.data_type().equals_datatype(b.data_type()) |
| }) |
| } |
| _ => self == other, |
| } |
| } |
| } |