blob: 122cbdd5e47d952dd628068c66480f94eaabf06f [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::fmt;
use serde_derive::{Deserialize, Serialize};
use serde_json::{json, Value, Value::String as VString};
use crate::error::{ArrowError, Result};
use super::Field;
/// The set of datatypes that are supported by this implementation of Apache Arrow.
///
/// The Arrow specification on data types includes some more types.
/// See also [`Schema.fbs`](https://github.com/apache/arrow/blob/master/format/Schema.fbs)
/// for Arrow's specification.
///
/// The variants of this enum include primitive fixed size types as well as parametric or
/// nested types.
/// Currently the Rust implementation supports the following nested types:
/// - `List<T>`
/// - `Struct<T, U, V, ...>`
///
/// Nested types can themselves be nested within other arrays.
/// For more information on these types please see
/// [the physical memory layout of Apache Arrow](https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum DataType {
/// Null type
Null,
/// A boolean datatype representing the values `true` and `false`.
Boolean,
/// A signed 8-bit integer.
Int8,
/// A signed 16-bit integer.
Int16,
/// A signed 32-bit integer.
Int32,
/// A signed 64-bit integer.
Int64,
/// An unsigned 8-bit integer.
UInt8,
/// An unsigned 16-bit integer.
UInt16,
/// An unsigned 32-bit integer.
UInt32,
/// An unsigned 64-bit integer.
UInt64,
/// A 16-bit floating point number.
Float16,
/// A 32-bit floating point number.
Float32,
/// A 64-bit floating point number.
Float64,
/// A timestamp with an optional timezone.
///
/// Time is measured as a Unix epoch, counting the seconds from
/// 00:00:00.000 on 1 January 1970, excluding leap seconds,
/// as a 64-bit integer.
///
/// The time zone is a string indicating the name of a time zone, one of:
///
/// * As used in the Olson time zone database (the "tz database" or
/// "tzdata"), such as "America/New_York"
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
Timestamp(TimeUnit, Option<String>),
/// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in days (32 bits).
Date32,
/// A 64-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in milliseconds (64 bits). Values are evenly divisible by 86400000.
Date64,
/// A 32-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
Time32(TimeUnit),
/// A 64-bit time representing the elapsed time since midnight in the unit of `TimeUnit`.
Time64(TimeUnit),
/// Measure of elapsed time in either seconds, milliseconds, microseconds or nanoseconds.
Duration(TimeUnit),
/// A "calendar" interval which models types that don't necessarily
/// have a precise duration without the context of a base timestamp (e.g.
/// days can differ in length during day light savings time transitions).
Interval(IntervalUnit),
/// Opaque binary data of variable length.
Binary,
/// Opaque binary data of fixed size.
/// Enum parameter specifies the number of bytes per value.
FixedSizeBinary(i32),
/// Opaque binary data of variable length and 64-bit offsets.
LargeBinary,
/// A variable-length string in Unicode with UTF-8 encoding.
Utf8,
/// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets.
LargeUtf8,
/// A list of some logical data type with variable length.
List(Box<Field>),
/// A list of some logical data type with fixed length.
FixedSizeList(Box<Field>, i32),
/// A list of some logical data type with variable length and 64-bit offsets.
LargeList(Box<Field>),
/// A nested datatype that contains a number of sub-fields.
Struct(Vec<Field>),
/// A nested datatype that can represent slots of differing types.
Union(Vec<Field>),
/// A dictionary encoded array (`key_type`, `value_type`), where
/// each array element is an index of `key_type` into an
/// associated dictionary of `value_type`.
///
/// Dictionary arrays are used to store columns of `value_type`
/// that contain many repeated values using less memory, but with
/// a higher CPU overhead for some operations.
///
/// This type mostly used to represent low cardinality string
/// arrays or a limited set of primitive types as integers.
Dictionary(Box<DataType>, Box<DataType>),
/// Decimal value with precision and scale
Decimal(usize, usize),
}
/// An absolute length of time in seconds, milliseconds, microseconds or nanoseconds.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum TimeUnit {
/// Time in seconds.
Second,
/// Time in milliseconds.
Millisecond,
/// Time in microseconds.
Microsecond,
/// Time in nanoseconds.
Nanosecond,
}
/// YEAR_MONTH or DAY_TIME interval in SQL style.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub enum IntervalUnit {
/// Indicates the number of elapsed whole months, stored as 4-byte integers.
YearMonth,
/// Indicates the number of elapsed days and milliseconds,
/// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total).
DayTime,
}
impl fmt::Display for DataType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self)
}
}
impl DataType {
/// Parse a data type from a JSON representation.
pub(crate) fn from(json: &Value) -> Result<DataType> {
let default_field = Field::new("", DataType::Boolean, true);
match *json {
Value::Object(ref map) => match map.get("name") {
Some(s) if s == "null" => Ok(DataType::Null),
Some(s) if s == "bool" => Ok(DataType::Boolean),
Some(s) if s == "binary" => Ok(DataType::Binary),
Some(s) if s == "largebinary" => Ok(DataType::LargeBinary),
Some(s) if s == "utf8" => Ok(DataType::Utf8),
Some(s) if s == "largeutf8" => Ok(DataType::LargeUtf8),
Some(s) if s == "fixedsizebinary" => {
// return a list with any type as its child isn't defined in the map
if let Some(Value::Number(size)) = map.get("byteWidth") {
Ok(DataType::FixedSizeBinary(size.as_i64().unwrap() as i32))
} else {
Err(ArrowError::ParseError(
"Expecting a byteWidth for fixedsizebinary".to_string(),
))
}
}
Some(s) if s == "decimal" => {
// return a list with any type as its child isn't defined in the map
let precision = match map.get("precision") {
Some(p) => Ok(p.as_u64().unwrap() as usize),
None => Err(ArrowError::ParseError(
"Expecting a precision for decimal".to_string(),
)),
};
let scale = match map.get("scale") {
Some(s) => Ok(s.as_u64().unwrap() as usize),
_ => Err(ArrowError::ParseError(
"Expecting a scale for decimal".to_string(),
)),
};
Ok(DataType::Decimal(precision?, scale?))
}
Some(s) if s == "floatingpoint" => match map.get("precision") {
Some(p) if p == "HALF" => Ok(DataType::Float16),
Some(p) if p == "SINGLE" => Ok(DataType::Float32),
Some(p) if p == "DOUBLE" => Ok(DataType::Float64),
_ => Err(ArrowError::ParseError(
"floatingpoint precision missing or invalid".to_string(),
)),
},
Some(s) if s == "timestamp" => {
let unit = match map.get("unit") {
Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond),
Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond),
Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond),
_ => Err(ArrowError::ParseError(
"timestamp unit missing or invalid".to_string(),
)),
};
let tz = match map.get("timezone") {
None => Ok(None),
Some(VString(tz)) => Ok(Some(tz.clone())),
_ => Err(ArrowError::ParseError(
"timezone must be a string".to_string(),
)),
};
Ok(DataType::Timestamp(unit?, tz?))
}
Some(s) if s == "date" => match map.get("unit") {
Some(p) if p == "DAY" => Ok(DataType::Date32),
Some(p) if p == "MILLISECOND" => Ok(DataType::Date64),
_ => Err(ArrowError::ParseError(
"date unit missing or invalid".to_string(),
)),
},
Some(s) if s == "time" => {
let unit = match map.get("unit") {
Some(p) if p == "SECOND" => Ok(TimeUnit::Second),
Some(p) if p == "MILLISECOND" => Ok(TimeUnit::Millisecond),
Some(p) if p == "MICROSECOND" => Ok(TimeUnit::Microsecond),
Some(p) if p == "NANOSECOND" => Ok(TimeUnit::Nanosecond),
_ => Err(ArrowError::ParseError(
"time unit missing or invalid".to_string(),
)),
};
match map.get("bitWidth") {
Some(p) if p == 32 => Ok(DataType::Time32(unit?)),
Some(p) if p == 64 => Ok(DataType::Time64(unit?)),
_ => Err(ArrowError::ParseError(
"time bitWidth missing or invalid".to_string(),
)),
}
}
Some(s) if s == "duration" => match map.get("unit") {
Some(p) if p == "SECOND" => Ok(DataType::Duration(TimeUnit::Second)),
Some(p) if p == "MILLISECOND" => {
Ok(DataType::Duration(TimeUnit::Millisecond))
}
Some(p) if p == "MICROSECOND" => {
Ok(DataType::Duration(TimeUnit::Microsecond))
}
Some(p) if p == "NANOSECOND" => {
Ok(DataType::Duration(TimeUnit::Nanosecond))
}
_ => Err(ArrowError::ParseError(
"time unit missing or invalid".to_string(),
)),
},
Some(s) if s == "interval" => match map.get("unit") {
Some(p) if p == "DAY_TIME" => {
Ok(DataType::Interval(IntervalUnit::DayTime))
}
Some(p) if p == "YEAR_MONTH" => {
Ok(DataType::Interval(IntervalUnit::YearMonth))
}
_ => Err(ArrowError::ParseError(
"interval unit missing or invalid".to_string(),
)),
},
Some(s) if s == "int" => match map.get("isSigned") {
Some(&Value::Bool(true)) => match map.get("bitWidth") {
Some(&Value::Number(ref n)) => match n.as_u64() {
Some(8) => Ok(DataType::Int8),
Some(16) => Ok(DataType::Int16),
Some(32) => Ok(DataType::Int32),
Some(64) => Ok(DataType::Int64),
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
Some(&Value::Bool(false)) => match map.get("bitWidth") {
Some(&Value::Number(ref n)) => match n.as_u64() {
Some(8) => Ok(DataType::UInt8),
Some(16) => Ok(DataType::UInt16),
Some(32) => Ok(DataType::UInt32),
Some(64) => Ok(DataType::UInt64),
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int bitWidth missing or invalid".to_string(),
)),
},
_ => Err(ArrowError::ParseError(
"int signed missing or invalid".to_string(),
)),
},
Some(s) if s == "list" => {
// return a list with any type as its child isn't defined in the map
Ok(DataType::List(Box::new(default_field)))
}
Some(s) if s == "largelist" => {
// return a largelist with any type as its child isn't defined in the map
Ok(DataType::LargeList(Box::new(default_field)))
}
Some(s) if s == "fixedsizelist" => {
// return a list with any type as its child isn't defined in the map
if let Some(Value::Number(size)) = map.get("listSize") {
Ok(DataType::FixedSizeList(
Box::new(default_field),
size.as_i64().unwrap() as i32,
))
} else {
Err(ArrowError::ParseError(
"Expecting a listSize for fixedsizelist".to_string(),
))
}
}
Some(s) if s == "struct" => {
// return an empty `struct` type as its children aren't defined in the map
Ok(DataType::Struct(vec![]))
}
Some(other) => Err(ArrowError::ParseError(format!(
"invalid or unsupported type name: {} in {:?}",
other, json
))),
None => Err(ArrowError::ParseError("type name missing".to_string())),
},
_ => Err(ArrowError::ParseError(
"invalid json value type".to_string(),
)),
}
}
/// Generate a JSON representation of the data type.
pub fn to_json(&self) -> Value {
match self {
DataType::Null => json!({"name": "null"}),
DataType::Boolean => json!({"name": "bool"}),
DataType::Int8 => json!({"name": "int", "bitWidth": 8, "isSigned": true}),
DataType::Int16 => json!({"name": "int", "bitWidth": 16, "isSigned": true}),
DataType::Int32 => json!({"name": "int", "bitWidth": 32, "isSigned": true}),
DataType::Int64 => json!({"name": "int", "bitWidth": 64, "isSigned": true}),
DataType::UInt8 => json!({"name": "int", "bitWidth": 8, "isSigned": false}),
DataType::UInt16 => json!({"name": "int", "bitWidth": 16, "isSigned": false}),
DataType::UInt32 => json!({"name": "int", "bitWidth": 32, "isSigned": false}),
DataType::UInt64 => json!({"name": "int", "bitWidth": 64, "isSigned": false}),
DataType::Float16 => json!({"name": "floatingpoint", "precision": "HALF"}),
DataType::Float32 => json!({"name": "floatingpoint", "precision": "SINGLE"}),
DataType::Float64 => json!({"name": "floatingpoint", "precision": "DOUBLE"}),
DataType::Utf8 => json!({"name": "utf8"}),
DataType::LargeUtf8 => json!({"name": "largeutf8"}),
DataType::Binary => json!({"name": "binary"}),
DataType::LargeBinary => json!({"name": "largebinary"}),
DataType::FixedSizeBinary(byte_width) => {
json!({"name": "fixedsizebinary", "byteWidth": byte_width})
}
DataType::Struct(_) => json!({"name": "struct"}),
DataType::Union(_) => json!({"name": "union"}),
DataType::List(_) => json!({ "name": "list"}),
DataType::LargeList(_) => json!({ "name": "largelist"}),
DataType::FixedSizeList(_, length) => {
json!({"name":"fixedsizelist", "listSize": length})
}
DataType::Time32(unit) => {
json!({"name": "time", "bitWidth": 32, "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}})
}
DataType::Time64(unit) => {
json!({"name": "time", "bitWidth": 64, "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}})
}
DataType::Date32 => {
json!({"name": "date", "unit": "DAY"})
}
DataType::Date64 => {
json!({"name": "date", "unit": "MILLISECOND"})
}
DataType::Timestamp(unit, None) => {
json!({"name": "timestamp", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}})
}
DataType::Timestamp(unit, Some(tz)) => {
json!({"name": "timestamp", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}, "timezone": tz})
}
DataType::Interval(unit) => json!({"name": "interval", "unit": match unit {
IntervalUnit::YearMonth => "YEAR_MONTH",
IntervalUnit::DayTime => "DAY_TIME",
}}),
DataType::Duration(unit) => json!({"name": "duration", "unit": match unit {
TimeUnit::Second => "SECOND",
TimeUnit::Millisecond => "MILLISECOND",
TimeUnit::Microsecond => "MICROSECOND",
TimeUnit::Nanosecond => "NANOSECOND",
}}),
DataType::Dictionary(_, _) => json!({ "name": "dictionary"}),
DataType::Decimal(precision, scale) => {
json!({"name": "decimal", "precision": precision, "scale": scale})
}
}
}
/// Returns true if this type is numeric: (UInt*, Unit*, or Float*).
pub fn is_numeric(t: &DataType) -> bool {
use DataType::*;
matches!(
t,
UInt8
| UInt16
| UInt32
| UInt64
| Int8
| Int16
| Int32
| Int64
| Float32
| Float64
)
}
/// Compares the datatype with another, ignoring nested field names
/// and metadata.
pub(crate) fn equals_datatype(&self, other: &DataType) -> bool {
match (&self, other) {
(DataType::List(a), DataType::List(b))
| (DataType::LargeList(a), DataType::LargeList(b)) => {
a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
}
(DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
a_size == b_size
&& a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
}
(DataType::Struct(a), DataType::Struct(b)) => {
a.len() == b.len()
&& a.iter().zip(b).all(|(a, b)| {
a.is_nullable() == b.is_nullable()
&& a.data_type().equals_datatype(b.data_type())
})
}
_ => self == other,
}
}
}