blob: 463da7c6fdfe8809b633c854e63b2894fc84829a [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines cast kernels for `ArrayRef`, to convert `Array`s between
//! supported datatypes.
//!
//! Example:
//!
//! ```
//! use arrow::array::*;
//! use arrow::compute::cast;
//! use arrow::datatypes::DataType;
//! use std::sync::Arc;
//!
//! let a = Int32Array::from(vec![5, 6, 7]);
//! let array = Arc::new(a) as ArrayRef;
//! let b = cast(&array, &DataType::Float64).unwrap();
//! let c = b.as_any().downcast_ref::<Float64Array>().unwrap();
//! assert_eq!(5.0, c.value(0));
//! assert_eq!(6.0, c.value(1));
//! assert_eq!(7.0, c.value(2));
//! ```
use std::str;
use std::sync::Arc;
use crate::buffer::MutableBuffer;
use crate::compute::kernels::arithmetic::{divide, multiply};
use crate::compute::kernels::arity::unary;
use crate::compute::kernels::cast_utils::string_to_timestamp_nanos;
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::{array::*, compute::take};
use crate::{buffer::Buffer, util::serialization::lexical_to_string};
use num::{NumCast, ToPrimitive};
/// `CastOptions` provides a way to override the default cast behaviors.
///
/// The type is cheap to duplicate and can be compared, so callers may store
/// it alongside other configuration and check it for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CastOptions {
    /// How to handle cast failures: return NULL for the failing slot
    /// (`safe == true`) or return an error (`safe == false`).
    pub safe: bool,
}
/// Options used by [`cast`]: `safe` is `true`, i.e. cast failures produce
/// NULL values instead of an error.
pub const DEFAULT_CAST_OPTIONS: CastOptions = CastOptions { safe: true };
/// Return true if a value of type `from_type` can be cast into a
/// value of `to_type`. Note that such a cast may be lossy.
///
/// The answer given here is meant to mirror the `cast` kernel below: whenever
/// an arm is added to or removed from one of them, the other must be updated
/// so that the two stay consistent.
///
/// Identical types are always castable. Struct types are never castable to or
/// from any other type.
pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
    use self::DataType::*;
    if from_type == to_type {
        return true;
    }

    match (from_type, to_type) {
        (Struct(_), _) => false,
        (_, Struct(_)) => false,
        (LargeList(list_from), LargeList(list_to)) => {
            can_cast_types(list_from.data_type(), list_to.data_type())
        }
        (List(list_from), List(list_to)) => {
            can_cast_types(list_from.data_type(), list_to.data_type())
        }
        // Changing only the offset width is supported by the kernel in both
        // directions, provided the child type is left untouched.
        (List(list_from), LargeList(list_to)) => {
            list_from.data_type() == list_to.data_type()
        }
        (LargeList(list_from), List(list_to)) => {
            list_from.data_type() == list_to.data_type()
        }
        (List(_), _) => false,
        (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()),
        (_, LargeList(list_to)) => can_cast_types(from_type, list_to.data_type()),
        (Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
            can_cast_types(from_value_type, to_value_type)
        }
        (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type),
        (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type),

        (_, Boolean) => DataType::is_numeric(from_type),
        (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8,

        (Utf8, LargeUtf8) => true,
        (LargeUtf8, Utf8) => true,
        (Utf8, Date32) => true,
        (Utf8, Date64) => true,
        (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
        (Utf8, _) => DataType::is_numeric(to_type),
        (LargeUtf8, Date32) => true,
        (LargeUtf8, Date64) => true,
        (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
        (LargeUtf8, _) => DataType::is_numeric(to_type),
        (_, Utf8) | (_, LargeUtf8) => {
            DataType::is_numeric(from_type) || from_type == &Binary
        }

        // start numeric casts
        (UInt8, UInt16) => true,
        (UInt8, UInt32) => true,
        (UInt8, UInt64) => true,
        (UInt8, Int8) => true,
        (UInt8, Int16) => true,
        (UInt8, Int32) => true,
        (UInt8, Int64) => true,
        (UInt8, Float32) => true,
        (UInt8, Float64) => true,

        (UInt16, UInt8) => true,
        (UInt16, UInt32) => true,
        (UInt16, UInt64) => true,
        (UInt16, Int8) => true,
        (UInt16, Int16) => true,
        (UInt16, Int32) => true,
        (UInt16, Int64) => true,
        (UInt16, Float32) => true,
        (UInt16, Float64) => true,

        (UInt32, UInt8) => true,
        (UInt32, UInt16) => true,
        (UInt32, UInt64) => true,
        (UInt32, Int8) => true,
        (UInt32, Int16) => true,
        (UInt32, Int32) => true,
        (UInt32, Int64) => true,
        (UInt32, Float32) => true,
        (UInt32, Float64) => true,

        (UInt64, UInt8) => true,
        (UInt64, UInt16) => true,
        (UInt64, UInt32) => true,
        (UInt64, Int8) => true,
        (UInt64, Int16) => true,
        (UInt64, Int32) => true,
        (UInt64, Int64) => true,
        (UInt64, Float32) => true,
        (UInt64, Float64) => true,

        (Int8, UInt8) => true,
        (Int8, UInt16) => true,
        (Int8, UInt32) => true,
        (Int8, UInt64) => true,
        (Int8, Int16) => true,
        (Int8, Int32) => true,
        (Int8, Int64) => true,
        (Int8, Float32) => true,
        (Int8, Float64) => true,

        (Int16, UInt8) => true,
        (Int16, UInt16) => true,
        (Int16, UInt32) => true,
        (Int16, UInt64) => true,
        (Int16, Int8) => true,
        (Int16, Int32) => true,
        (Int16, Int64) => true,
        (Int16, Float32) => true,
        (Int16, Float64) => true,

        (Int32, UInt8) => true,
        (Int32, UInt16) => true,
        (Int32, UInt32) => true,
        (Int32, UInt64) => true,
        (Int32, Int8) => true,
        (Int32, Int16) => true,
        (Int32, Int64) => true,
        (Int32, Float32) => true,
        (Int32, Float64) => true,

        (Int64, UInt8) => true,
        (Int64, UInt16) => true,
        (Int64, UInt32) => true,
        (Int64, UInt64) => true,
        (Int64, Int8) => true,
        (Int64, Int16) => true,
        (Int64, Int32) => true,
        (Int64, Float32) => true,
        (Int64, Float64) => true,

        (Float32, UInt8) => true,
        (Float32, UInt16) => true,
        (Float32, UInt32) => true,
        (Float32, UInt64) => true,
        (Float32, Int8) => true,
        (Float32, Int16) => true,
        (Float32, Int32) => true,
        (Float32, Int64) => true,
        (Float32, Float64) => true,

        (Float64, UInt8) => true,
        (Float64, UInt16) => true,
        (Float64, UInt32) => true,
        (Float64, UInt64) => true,
        (Float64, Int8) => true,
        (Float64, Int16) => true,
        (Float64, Int32) => true,
        (Float64, Int64) => true,
        (Float64, Float32) => true,
        // end numeric casts

        // temporal casts
        (Int32, Date32) => true,
        (Int32, Date64) => true,
        // The kernel only reinterprets i32 as second/millisecond Time32.
        (Int32, Time32(to_unit)) => {
            matches!(to_unit, TimeUnit::Second | TimeUnit::Millisecond)
        }
        (Date32, Int32) => true,
        (Date32, Int64) => true,
        (Time32(_), Int32) => true,
        (Int64, Date64) => true,
        (Int64, Date32) => true,
        // The kernel only reinterprets i64 as microsecond/nanosecond Time64.
        (Int64, Time64(to_unit)) => {
            matches!(to_unit, TimeUnit::Microsecond | TimeUnit::Nanosecond)
        }
        (Date64, Int64) => true,
        (Date64, Int32) => true,
        (Time64(_), Int64) => true,
        (Date32, Date64) => true,
        (Date64, Date32) => true,
        (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => true,
        (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => true,
        // Only microsecond/nanosecond Time64 targets are produced by the kernel.
        (Time32(_), Time64(to_unit)) => {
            matches!(to_unit, TimeUnit::Microsecond | TimeUnit::Nanosecond)
        }
        (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => true,
        (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => true,
        (Time64(_), Time32(to_unit)) => {
            matches!(to_unit, TimeUnit::Second | TimeUnit::Millisecond)
        }
        (Timestamp(_, _), Int64) => true,
        (Int64, Timestamp(_, _)) => true,
        (Timestamp(_, _), Timestamp(_, _)) => true,
        (Timestamp(_, _), Date32) => true,
        (Timestamp(_, _), Date64) => true,
        // date64 to timestamp might not make sense,
        (Int64, Duration(_)) => true,
        (Null, Int32) => true,
        (_, _) => false,
    }
}
/// Convert `array` into a new array of type `to_type`, when that conversion
/// is supported. This is [`cast_with_options`] called with
/// `DEFAULT_CAST_OPTIONS`.
///
/// Behavior:
/// * Boolean to Utf8: `true` => '1', `false` => `0`
/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings
///   in integer casts return null
/// * Numeric to boolean: 0 returns `false`, any other value returns `true`
/// * List to List: the underlying data type is cast
/// * Primitive to List: a list array with 1 value per slot is created
/// * Date32 and Date64: precision lost when going to higher interval
/// * Time32 and Time64: precision lost when going to higher interval
/// * Timestamp and Date{32|64}: precision lost when going to higher interval
/// * Temporal to/from backing primitive: zero-copy with data type change
///
/// Unsupported Casts
/// * To or from `StructArray`
/// * List to primitive
/// * Utf8 to boolean
/// * Interval and duration
pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
    let default_options = &DEFAULT_CAST_OPTIONS;
    cast_with_options(array, to_type, default_options)
}
/// Cast `array` to the provided data type and return a new Array with
/// type `to_type`, if possible. It accepts `CastOptions` to allow consumers
/// to configure cast behavior.
///
/// When `array` already has type `to_type` the input is returned as a cheap
/// clone of the `ArrayRef`.
///
/// Behavior:
/// * Boolean to Utf8: `true` => '1', `false` => `0`
/// * Utf8 to numeric: strings that can't be parsed to numbers return null, float strings
///   in integer casts return null
/// * Numeric to boolean: 0 returns `false`, any other value returns `true`
/// * List to List: the underlying data type is cast
/// * Primitive to List: a list array with 1 value per slot is created
/// * Date32 and Date64: precision lost when going to higher interval
/// * Time32 and Time64: precision lost when going to higher interval
/// * Timestamp and Date{32|64}: precision lost when going to higher interval
/// * Temporal to/from backing primitive: zero-copy with data type change
///
/// Unsupported Casts
/// * To or from `StructArray`
/// * List to primitive
/// * Utf8 to boolean
/// * Interval and duration
///
/// # Errors
///
/// Returns `ArrowError::CastError` when the `(from, to)` type pair is not
/// supported. With `cast_options.safe == false`, a Binary value that is not
/// valid UTF-8 also yields a `CastError` when casting to a string type; with
/// `safe == true` such a value becomes null instead.
pub fn cast_with_options(
    array: &ArrayRef,
    to_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    use DataType::*;
    let from_type = array.data_type();

    // clone array if types are the same
    if from_type == to_type {
        return Ok(array.clone());
    }
    match (from_type, to_type) {
        (Struct(_), _) => Err(ArrowError::CastError(
            "Cannot cast from struct to other types".to_string(),
        )),
        (_, Struct(_)) => Err(ArrowError::CastError(
            "Cannot cast to struct from other types".to_string(),
        )),
        (List(_), List(ref to)) => {
            cast_list_inner::<i32>(array, to, to_type, cast_options)
        }
        (LargeList(_), LargeList(ref to)) => {
            cast_list_inner::<i64>(array, to, to_type, cast_options)
        }
        (List(list_from), LargeList(list_to)) => {
            if list_to.data_type() != list_from.data_type() {
                Err(ArrowError::CastError(
                    "cannot cast list to large-list with different child data".into(),
                ))
            } else {
                cast_list_container::<i32, i64>(&**array, cast_options)
            }
        }
        (LargeList(list_from), List(list_to)) => {
            if list_to.data_type() != list_from.data_type() {
                Err(ArrowError::CastError(
                    "cannot cast large-list to list with different child data".into(),
                ))
            } else {
                cast_list_container::<i64, i32>(&**array, cast_options)
            }
        }
        (List(_), _) => Err(ArrowError::CastError(
            "Cannot cast list to non-list data types".to_string(),
        )),
        (_, List(ref to)) => {
            cast_primitive_to_list::<i32>(array, to, to_type, cast_options)
        }
        (_, LargeList(ref to)) => {
            cast_primitive_to_list::<i64>(array, to, to_type, cast_options)
        }
        // Dispatch on the dictionary key type; the helper handles the values.
        (Dictionary(index_type, _), _) => match **index_type {
            DataType::Int8 => dictionary_cast::<Int8Type>(array, to_type, cast_options),
            DataType::Int16 => dictionary_cast::<Int16Type>(array, to_type, cast_options),
            DataType::Int32 => dictionary_cast::<Int32Type>(array, to_type, cast_options),
            DataType::Int64 => dictionary_cast::<Int64Type>(array, to_type, cast_options),
            DataType::UInt8 => dictionary_cast::<UInt8Type>(array, to_type, cast_options),
            DataType::UInt16 => {
                dictionary_cast::<UInt16Type>(array, to_type, cast_options)
            }
            DataType::UInt32 => {
                dictionary_cast::<UInt32Type>(array, to_type, cast_options)
            }
            DataType::UInt64 => {
                dictionary_cast::<UInt64Type>(array, to_type, cast_options)
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from dictionary type {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (_, Dictionary(index_type, value_type)) => match **index_type {
            DataType::Int8 => {
                cast_to_dictionary::<Int8Type>(array, value_type, cast_options)
            }
            DataType::Int16 => {
                cast_to_dictionary::<Int16Type>(array, value_type, cast_options)
            }
            DataType::Int32 => {
                cast_to_dictionary::<Int32Type>(array, value_type, cast_options)
            }
            DataType::Int64 => {
                cast_to_dictionary::<Int64Type>(array, value_type, cast_options)
            }
            DataType::UInt8 => {
                cast_to_dictionary::<UInt8Type>(array, value_type, cast_options)
            }
            DataType::UInt16 => {
                cast_to_dictionary::<UInt16Type>(array, value_type, cast_options)
            }
            DataType::UInt32 => {
                cast_to_dictionary::<UInt32Type>(array, value_type, cast_options)
            }
            DataType::UInt64 => {
                cast_to_dictionary::<UInt64Type>(array, value_type, cast_options)
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from type {:?} to dictionary type {:?} not supported",
                from_type, to_type,
            ))),
        },
        (_, Boolean) => match from_type {
            UInt8 => cast_numeric_to_bool::<UInt8Type>(array),
            UInt16 => cast_numeric_to_bool::<UInt16Type>(array),
            UInt32 => cast_numeric_to_bool::<UInt32Type>(array),
            UInt64 => cast_numeric_to_bool::<UInt64Type>(array),
            Int8 => cast_numeric_to_bool::<Int8Type>(array),
            Int16 => cast_numeric_to_bool::<Int16Type>(array),
            Int32 => cast_numeric_to_bool::<Int32Type>(array),
            Int64 => cast_numeric_to_bool::<Int64Type>(array),
            Float32 => cast_numeric_to_bool::<Float32Type>(array),
            Float64 => cast_numeric_to_bool::<Float64Type>(array),
            // Everything else, including Utf8, cannot be cast to boolean.
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (Boolean, _) => match to_type {
            UInt8 => cast_bool_to_numeric::<UInt8Type>(array, cast_options),
            UInt16 => cast_bool_to_numeric::<UInt16Type>(array, cast_options),
            UInt32 => cast_bool_to_numeric::<UInt32Type>(array, cast_options),
            UInt64 => cast_bool_to_numeric::<UInt64Type>(array, cast_options),
            Int8 => cast_bool_to_numeric::<Int8Type>(array, cast_options),
            Int16 => cast_bool_to_numeric::<Int16Type>(array, cast_options),
            Int32 => cast_bool_to_numeric::<Int32Type>(array, cast_options),
            Int64 => cast_bool_to_numeric::<Int64Type>(array, cast_options),
            Float32 => cast_bool_to_numeric::<Float32Type>(array, cast_options),
            Float64 => cast_bool_to_numeric::<Float64Type>(array, cast_options),
            Utf8 => {
                let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
                Ok(Arc::new(
                    array
                        .iter()
                        .map(|value| value.map(|value| if value { "1" } else { "0" }))
                        .collect::<StringArray>(),
                ))
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (Utf8, _) => match to_type {
            LargeUtf8 => cast_str_container::<i32, i64>(&**array),
            UInt8 => cast_string_to_numeric::<UInt8Type, i32>(array, cast_options),
            UInt16 => cast_string_to_numeric::<UInt16Type, i32>(array, cast_options),
            UInt32 => cast_string_to_numeric::<UInt32Type, i32>(array, cast_options),
            UInt64 => cast_string_to_numeric::<UInt64Type, i32>(array, cast_options),
            Int8 => cast_string_to_numeric::<Int8Type, i32>(array, cast_options),
            Int16 => cast_string_to_numeric::<Int16Type, i32>(array, cast_options),
            Int32 => cast_string_to_numeric::<Int32Type, i32>(array, cast_options),
            Int64 => cast_string_to_numeric::<Int64Type, i32>(array, cast_options),
            Float32 => cast_string_to_numeric::<Float32Type, i32>(array, cast_options),
            Float64 => cast_string_to_numeric::<Float64Type, i32>(array, cast_options),
            Date32 => cast_string_to_date32::<i32>(&**array, cast_options),
            Date64 => cast_string_to_date64::<i32>(&**array, cast_options),
            Timestamp(TimeUnit::Nanosecond, None) => {
                cast_string_to_timestamp_ns::<i32>(&**array, cast_options)
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (_, Utf8) => match from_type {
            LargeUtf8 => cast_str_container::<i64, i32>(&**array),
            UInt8 => cast_numeric_to_string::<UInt8Type, i32>(array),
            UInt16 => cast_numeric_to_string::<UInt16Type, i32>(array),
            UInt32 => cast_numeric_to_string::<UInt32Type, i32>(array),
            UInt64 => cast_numeric_to_string::<UInt64Type, i32>(array),
            Int8 => cast_numeric_to_string::<Int8Type, i32>(array),
            Int16 => cast_numeric_to_string::<Int16Type, i32>(array),
            Int32 => cast_numeric_to_string::<Int32Type, i32>(array),
            Int64 => cast_numeric_to_string::<Int64Type, i32>(array),
            Float32 => cast_numeric_to_string::<Float32Type, i32>(array),
            Float64 => cast_numeric_to_string::<Float64Type, i32>(array),
            Binary => {
                let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
                Ok(Arc::new(
                    array
                        .iter()
                        .map(|maybe_value| match maybe_value {
                            Some(value) => {
                                let result = str::from_utf8(value);
                                if cast_options.safe {
                                    // invalid UTF-8 becomes a null slot
                                    Ok(result.ok())
                                } else {
                                    // invalid UTF-8 aborts the whole cast
                                    Some(result.map_err(|_| {
                                        ArrowError::CastError(
                                            "Cannot cast binary to string".to_string(),
                                        )
                                    }))
                                    .transpose()
                                }
                            }
                            None => Ok(None),
                        })
                        .collect::<Result<StringArray>>()?,
                ))
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (_, LargeUtf8) => match from_type {
            UInt8 => cast_numeric_to_string::<UInt8Type, i64>(array),
            UInt16 => cast_numeric_to_string::<UInt16Type, i64>(array),
            UInt32 => cast_numeric_to_string::<UInt32Type, i64>(array),
            UInt64 => cast_numeric_to_string::<UInt64Type, i64>(array),
            Int8 => cast_numeric_to_string::<Int8Type, i64>(array),
            Int16 => cast_numeric_to_string::<Int16Type, i64>(array),
            Int32 => cast_numeric_to_string::<Int32Type, i64>(array),
            Int64 => cast_numeric_to_string::<Int64Type, i64>(array),
            Float32 => cast_numeric_to_string::<Float32Type, i64>(array),
            Float64 => cast_numeric_to_string::<Float64Type, i64>(array),
            Binary => {
                let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
                Ok(Arc::new(
                    array
                        .iter()
                        .map(|maybe_value| match maybe_value {
                            Some(value) => {
                                let result = str::from_utf8(value);
                                if cast_options.safe {
                                    // invalid UTF-8 becomes a null slot
                                    Ok(result.ok())
                                } else {
                                    // invalid UTF-8 aborts the whole cast
                                    Some(result.map_err(|_| {
                                        ArrowError::CastError(
                                            "Cannot cast binary to string".to_string(),
                                        )
                                    }))
                                    .transpose()
                                }
                            }
                            None => Ok(None),
                        })
                        .collect::<Result<LargeStringArray>>()?,
                ))
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },
        (LargeUtf8, _) => match to_type {
            UInt8 => cast_string_to_numeric::<UInt8Type, i64>(array, cast_options),
            UInt16 => cast_string_to_numeric::<UInt16Type, i64>(array, cast_options),
            UInt32 => cast_string_to_numeric::<UInt32Type, i64>(array, cast_options),
            UInt64 => cast_string_to_numeric::<UInt64Type, i64>(array, cast_options),
            Int8 => cast_string_to_numeric::<Int8Type, i64>(array, cast_options),
            Int16 => cast_string_to_numeric::<Int16Type, i64>(array, cast_options),
            Int32 => cast_string_to_numeric::<Int32Type, i64>(array, cast_options),
            Int64 => cast_string_to_numeric::<Int64Type, i64>(array, cast_options),
            Float32 => cast_string_to_numeric::<Float32Type, i64>(array, cast_options),
            Float64 => cast_string_to_numeric::<Float64Type, i64>(array, cast_options),
            Date32 => cast_string_to_date32::<i64>(&**array, cast_options),
            Date64 => cast_string_to_date64::<i64>(&**array, cast_options),
            Timestamp(TimeUnit::Nanosecond, None) => {
                cast_string_to_timestamp_ns::<i64>(&**array, cast_options)
            }
            _ => Err(ArrowError::CastError(format!(
                "Casting from {:?} to {:?} not supported",
                from_type, to_type,
            ))),
        },

        // start numeric casts
        (UInt8, UInt16) => cast_numeric_arrays::<UInt8Type, UInt16Type>(array),
        (UInt8, UInt32) => cast_numeric_arrays::<UInt8Type, UInt32Type>(array),
        (UInt8, UInt64) => cast_numeric_arrays::<UInt8Type, UInt64Type>(array),
        (UInt8, Int8) => cast_numeric_arrays::<UInt8Type, Int8Type>(array),
        (UInt8, Int16) => cast_numeric_arrays::<UInt8Type, Int16Type>(array),
        (UInt8, Int32) => cast_numeric_arrays::<UInt8Type, Int32Type>(array),
        (UInt8, Int64) => cast_numeric_arrays::<UInt8Type, Int64Type>(array),
        (UInt8, Float32) => cast_numeric_arrays::<UInt8Type, Float32Type>(array),
        (UInt8, Float64) => cast_numeric_arrays::<UInt8Type, Float64Type>(array),

        (UInt16, UInt8) => cast_numeric_arrays::<UInt16Type, UInt8Type>(array),
        (UInt16, UInt32) => cast_numeric_arrays::<UInt16Type, UInt32Type>(array),
        (UInt16, UInt64) => cast_numeric_arrays::<UInt16Type, UInt64Type>(array),
        (UInt16, Int8) => cast_numeric_arrays::<UInt16Type, Int8Type>(array),
        (UInt16, Int16) => cast_numeric_arrays::<UInt16Type, Int16Type>(array),
        (UInt16, Int32) => cast_numeric_arrays::<UInt16Type, Int32Type>(array),
        (UInt16, Int64) => cast_numeric_arrays::<UInt16Type, Int64Type>(array),
        (UInt16, Float32) => cast_numeric_arrays::<UInt16Type, Float32Type>(array),
        (UInt16, Float64) => cast_numeric_arrays::<UInt16Type, Float64Type>(array),

        (UInt32, UInt8) => cast_numeric_arrays::<UInt32Type, UInt8Type>(array),
        (UInt32, UInt16) => cast_numeric_arrays::<UInt32Type, UInt16Type>(array),
        (UInt32, UInt64) => cast_numeric_arrays::<UInt32Type, UInt64Type>(array),
        (UInt32, Int8) => cast_numeric_arrays::<UInt32Type, Int8Type>(array),
        (UInt32, Int16) => cast_numeric_arrays::<UInt32Type, Int16Type>(array),
        (UInt32, Int32) => cast_numeric_arrays::<UInt32Type, Int32Type>(array),
        (UInt32, Int64) => cast_numeric_arrays::<UInt32Type, Int64Type>(array),
        (UInt32, Float32) => cast_numeric_arrays::<UInt32Type, Float32Type>(array),
        (UInt32, Float64) => cast_numeric_arrays::<UInt32Type, Float64Type>(array),

        (UInt64, UInt8) => cast_numeric_arrays::<UInt64Type, UInt8Type>(array),
        (UInt64, UInt16) => cast_numeric_arrays::<UInt64Type, UInt16Type>(array),
        (UInt64, UInt32) => cast_numeric_arrays::<UInt64Type, UInt32Type>(array),
        (UInt64, Int8) => cast_numeric_arrays::<UInt64Type, Int8Type>(array),
        (UInt64, Int16) => cast_numeric_arrays::<UInt64Type, Int16Type>(array),
        (UInt64, Int32) => cast_numeric_arrays::<UInt64Type, Int32Type>(array),
        (UInt64, Int64) => cast_numeric_arrays::<UInt64Type, Int64Type>(array),
        (UInt64, Float32) => cast_numeric_arrays::<UInt64Type, Float32Type>(array),
        (UInt64, Float64) => cast_numeric_arrays::<UInt64Type, Float64Type>(array),

        (Int8, UInt8) => cast_numeric_arrays::<Int8Type, UInt8Type>(array),
        (Int8, UInt16) => cast_numeric_arrays::<Int8Type, UInt16Type>(array),
        (Int8, UInt32) => cast_numeric_arrays::<Int8Type, UInt32Type>(array),
        (Int8, UInt64) => cast_numeric_arrays::<Int8Type, UInt64Type>(array),
        (Int8, Int16) => cast_numeric_arrays::<Int8Type, Int16Type>(array),
        (Int8, Int32) => cast_numeric_arrays::<Int8Type, Int32Type>(array),
        (Int8, Int64) => cast_numeric_arrays::<Int8Type, Int64Type>(array),
        (Int8, Float32) => cast_numeric_arrays::<Int8Type, Float32Type>(array),
        (Int8, Float64) => cast_numeric_arrays::<Int8Type, Float64Type>(array),

        (Int16, UInt8) => cast_numeric_arrays::<Int16Type, UInt8Type>(array),
        (Int16, UInt16) => cast_numeric_arrays::<Int16Type, UInt16Type>(array),
        (Int16, UInt32) => cast_numeric_arrays::<Int16Type, UInt32Type>(array),
        (Int16, UInt64) => cast_numeric_arrays::<Int16Type, UInt64Type>(array),
        (Int16, Int8) => cast_numeric_arrays::<Int16Type, Int8Type>(array),
        (Int16, Int32) => cast_numeric_arrays::<Int16Type, Int32Type>(array),
        (Int16, Int64) => cast_numeric_arrays::<Int16Type, Int64Type>(array),
        (Int16, Float32) => cast_numeric_arrays::<Int16Type, Float32Type>(array),
        (Int16, Float64) => cast_numeric_arrays::<Int16Type, Float64Type>(array),

        (Int32, UInt8) => cast_numeric_arrays::<Int32Type, UInt8Type>(array),
        (Int32, UInt16) => cast_numeric_arrays::<Int32Type, UInt16Type>(array),
        (Int32, UInt32) => cast_numeric_arrays::<Int32Type, UInt32Type>(array),
        (Int32, UInt64) => cast_numeric_arrays::<Int32Type, UInt64Type>(array),
        (Int32, Int8) => cast_numeric_arrays::<Int32Type, Int8Type>(array),
        (Int32, Int16) => cast_numeric_arrays::<Int32Type, Int16Type>(array),
        (Int32, Int64) => cast_numeric_arrays::<Int32Type, Int64Type>(array),
        (Int32, Float32) => cast_numeric_arrays::<Int32Type, Float32Type>(array),
        (Int32, Float64) => cast_numeric_arrays::<Int32Type, Float64Type>(array),

        (Int64, UInt8) => cast_numeric_arrays::<Int64Type, UInt8Type>(array),
        (Int64, UInt16) => cast_numeric_arrays::<Int64Type, UInt16Type>(array),
        (Int64, UInt32) => cast_numeric_arrays::<Int64Type, UInt32Type>(array),
        (Int64, UInt64) => cast_numeric_arrays::<Int64Type, UInt64Type>(array),
        (Int64, Int8) => cast_numeric_arrays::<Int64Type, Int8Type>(array),
        (Int64, Int16) => cast_numeric_arrays::<Int64Type, Int16Type>(array),
        (Int64, Int32) => cast_numeric_arrays::<Int64Type, Int32Type>(array),
        (Int64, Float32) => cast_numeric_arrays::<Int64Type, Float32Type>(array),
        (Int64, Float64) => cast_numeric_arrays::<Int64Type, Float64Type>(array),

        (Float32, UInt8) => cast_numeric_arrays::<Float32Type, UInt8Type>(array),
        (Float32, UInt16) => cast_numeric_arrays::<Float32Type, UInt16Type>(array),
        (Float32, UInt32) => cast_numeric_arrays::<Float32Type, UInt32Type>(array),
        (Float32, UInt64) => cast_numeric_arrays::<Float32Type, UInt64Type>(array),
        (Float32, Int8) => cast_numeric_arrays::<Float32Type, Int8Type>(array),
        (Float32, Int16) => cast_numeric_arrays::<Float32Type, Int16Type>(array),
        (Float32, Int32) => cast_numeric_arrays::<Float32Type, Int32Type>(array),
        (Float32, Int64) => cast_numeric_arrays::<Float32Type, Int64Type>(array),
        (Float32, Float64) => cast_numeric_arrays::<Float32Type, Float64Type>(array),

        (Float64, UInt8) => cast_numeric_arrays::<Float64Type, UInt8Type>(array),
        (Float64, UInt16) => cast_numeric_arrays::<Float64Type, UInt16Type>(array),
        (Float64, UInt32) => cast_numeric_arrays::<Float64Type, UInt32Type>(array),
        (Float64, UInt64) => cast_numeric_arrays::<Float64Type, UInt64Type>(array),
        (Float64, Int8) => cast_numeric_arrays::<Float64Type, Int8Type>(array),
        (Float64, Int16) => cast_numeric_arrays::<Float64Type, Int16Type>(array),
        (Float64, Int32) => cast_numeric_arrays::<Float64Type, Int32Type>(array),
        (Float64, Int64) => cast_numeric_arrays::<Float64Type, Int64Type>(array),
        (Float64, Float32) => cast_numeric_arrays::<Float64Type, Float32Type>(array),
        // end numeric casts

        // temporal casts
        (Int32, Date32) => cast_array_data::<Date32Type>(array, to_type.clone()),
        // two-step cast through Date32
        (Int32, Date64) => cast_with_options(
            &cast_with_options(array, &DataType::Date32, cast_options)?,
            &DataType::Date64,
            cast_options,
        ),
        (Int32, Time32(TimeUnit::Second)) => {
            cast_array_data::<Time32SecondType>(array, to_type.clone())
        }
        (Int32, Time32(TimeUnit::Millisecond)) => {
            cast_array_data::<Time32MillisecondType>(array, to_type.clone())
        }
        // No support for microsecond/nanosecond with i32
        (Date32, Int32) => cast_array_data::<Int32Type>(array, to_type.clone()),
        // two-step cast through Int32
        (Date32, Int64) => cast_with_options(
            &cast_with_options(array, &DataType::Int32, cast_options)?,
            &DataType::Int64,
            cast_options,
        ),
        (Time32(_), Int32) => cast_array_data::<Int32Type>(array, to_type.clone()),
        (Int64, Date64) => cast_array_data::<Date64Type>(array, to_type.clone()),
        // two-step cast through Int32
        (Int64, Date32) => cast_with_options(
            &cast_with_options(array, &DataType::Int32, cast_options)?,
            &DataType::Date32,
            cast_options,
        ),
        // No support for second/milliseconds with i64
        (Int64, Time64(TimeUnit::Microsecond)) => {
            cast_array_data::<Time64MicrosecondType>(array, to_type.clone())
        }
        (Int64, Time64(TimeUnit::Nanosecond)) => {
            cast_array_data::<Time64NanosecondType>(array, to_type.clone())
        }

        (Date64, Int64) => cast_array_data::<Int64Type>(array, to_type.clone()),
        // two-step cast through Int64
        (Date64, Int32) => cast_with_options(
            &cast_with_options(array, &DataType::Int64, cast_options)?,
            &DataType::Int32,
            cast_options,
        ),
        (Time64(_), Int64) => cast_array_data::<Int64Type>(array, to_type.clone()),
        (Date32, Date64) => {
            let date_array = array.as_any().downcast_ref::<Date32Array>().unwrap();

            let values =
                unary::<_, _, Date64Type>(date_array, |x| x as i64 * MILLISECONDS_IN_DAY);

            Ok(Arc::new(values) as ArrayRef)
        }
        (Date64, Date32) => {
            let date_array = array.as_any().downcast_ref::<Date64Array>().unwrap();

            let values = unary::<_, _, Date32Type>(date_array, |x| {
                (x / MILLISECONDS_IN_DAY) as i32
            });

            Ok(Arc::new(values) as ArrayRef)
        }
        (Time32(TimeUnit::Second), Time32(TimeUnit::Millisecond)) => {
            let time_array = array.as_any().downcast_ref::<Time32SecondArray>().unwrap();

            let values = unary::<_, _, Time32MillisecondType>(time_array, |x| {
                x * MILLISECONDS as i32
            });

            Ok(Arc::new(values) as ArrayRef)
        }
        (Time32(TimeUnit::Millisecond), Time32(TimeUnit::Second)) => {
            let time_array = array
                .as_any()
                .downcast_ref::<Time32MillisecondArray>()
                .unwrap();

            let values = unary::<_, _, Time32SecondType>(time_array, |x| {
                x / (MILLISECONDS as i32)
            });

            Ok(Arc::new(values) as ArrayRef)
        }
        //(Time32(TimeUnit::Second), Time64(_)) => {},
        (Time32(from_unit), Time64(to_unit)) => {
            let time_array = Int32Array::from(array.data().clone());
            // note: (numeric_cast + SIMD multiply) is faster than (cast & multiply)
            let c: Int64Array = numeric_cast(&time_array);
            let from_size = time_unit_multiple(from_unit);
            let to_size = time_unit_multiple(to_unit);
            // from is only smaller than to if 64milli/64second don't exist
            let mult = Int64Array::from(vec![to_size / from_size; array.len()]);
            let converted = multiply(&c, &mult)?;
            let array_ref = Arc::new(converted) as ArrayRef;
            use TimeUnit::*;
            // The result must be wrapped as a Time64 array of the target unit
            // (not a Timestamp array) so that downcasts by callers succeed.
            match to_unit {
                Microsecond => cast_array_data::<Time64MicrosecondType>(
                    &array_ref,
                    to_type.clone(),
                ),
                Nanosecond => cast_array_data::<Time64NanosecondType>(
                    &array_ref,
                    to_type.clone(),
                ),
                // Time64 is only defined for micro/nanoseconds; report other
                // units as an unsupported cast rather than panicking.
                _ => Err(ArrowError::CastError(format!(
                    "Casting from {:?} to {:?} not supported",
                    from_type, to_type,
                ))),
            }
        }
        (Time64(TimeUnit::Microsecond), Time64(TimeUnit::Nanosecond)) => {
            let time_array = array
                .as_any()
                .downcast_ref::<Time64MicrosecondArray>()
                .unwrap();

            // 1 microsecond == 1_000 nanoseconds (same factor as MILLISECONDS)
            let values =
                unary::<_, _, Time64NanosecondType>(time_array, |x| x * MILLISECONDS);
            Ok(Arc::new(values) as ArrayRef)
        }
        (Time64(TimeUnit::Nanosecond), Time64(TimeUnit::Microsecond)) => {
            let time_array = array
                .as_any()
                .downcast_ref::<Time64NanosecondArray>()
                .unwrap();

            // 1_000 nanoseconds == 1 microsecond (same factor as MILLISECONDS)
            let values =
                unary::<_, _, Time64MicrosecondType>(time_array, |x| x / MILLISECONDS);
            Ok(Arc::new(values) as ArrayRef)
        }
        (Time64(from_unit), Time32(to_unit)) => {
            let time_array = Int64Array::from(array.data().clone());
            let from_size = time_unit_multiple(from_unit);
            let to_size = time_unit_multiple(to_unit);
            let divisor = from_size / to_size;
            match to_unit {
                TimeUnit::Second => {
                    let values = unary::<_, _, Time32SecondType>(&time_array, |x| {
                        (x as i64 / divisor) as i32
                    });
                    Ok(Arc::new(values) as ArrayRef)
                }
                TimeUnit::Millisecond => {
                    let values = unary::<_, _, Time32MillisecondType>(&time_array, |x| {
                        (x as i64 / divisor) as i32
                    });
                    Ok(Arc::new(values) as ArrayRef)
                }
                // Time32 is only defined for seconds/milliseconds; report other
                // units as an unsupported cast rather than panicking.
                _ => Err(ArrowError::CastError(format!(
                    "Casting from {:?} to {:?} not supported",
                    from_type, to_type,
                ))),
            }
        }
        (Timestamp(_, _), Int64) => cast_array_data::<Int64Type>(array, to_type.clone()),
        (Int64, Timestamp(to_unit, _)) => {
            use TimeUnit::*;
            match to_unit {
                Second => cast_array_data::<TimestampSecondType>(array, to_type.clone()),
                Millisecond => {
                    cast_array_data::<TimestampMillisecondType>(array, to_type.clone())
                }
                Microsecond => {
                    cast_array_data::<TimestampMicrosecondType>(array, to_type.clone())
                }
                Nanosecond => {
                    cast_array_data::<TimestampNanosecondType>(array, to_type.clone())
                }
            }
        }
        (Timestamp(from_unit, _), Timestamp(to_unit, _)) => {
            let time_array = Int64Array::from(array.data().clone());
            let from_size = time_unit_multiple(from_unit);
            let to_size = time_unit_multiple(to_unit);
            // we either divide or multiply, depending on size of each unit;
            // when both units are equal (only the timezone differs) this
            // divides by 1 and leaves the values unchanged
            let converted = if from_size >= to_size {
                divide(
                    &time_array,
                    &Int64Array::from(vec![from_size / to_size; array.len()]),
                )?
            } else {
                multiply(
                    &time_array,
                    &Int64Array::from(vec![to_size / from_size; array.len()]),
                )?
            };
            let array_ref = Arc::new(converted) as ArrayRef;
            use TimeUnit::*;
            match to_unit {
                Second => {
                    cast_array_data::<TimestampSecondType>(&array_ref, to_type.clone())
                }
                Millisecond => cast_array_data::<TimestampMillisecondType>(
                    &array_ref,
                    to_type.clone(),
                ),
                Microsecond => cast_array_data::<TimestampMicrosecondType>(
                    &array_ref,
                    to_type.clone(),
                ),
                Nanosecond => cast_array_data::<TimestampNanosecondType>(
                    &array_ref,
                    to_type.clone(),
                ),
            }
        }
        (Timestamp(from_unit, _), Date32) => {
            let time_array = Int64Array::from(array.data().clone());
            // number of `from_unit` ticks in one day
            let from_size = time_unit_multiple(from_unit) * SECONDS_IN_DAY;
            let mut b = Date32Builder::new(array.len());
            for i in 0..array.len() {
                if array.is_null(i) {
                    b.append_null()?;
                } else {
                    b.append_value((time_array.value(i) / from_size) as i32)?;
                }
            }

            Ok(Arc::new(b.finish()) as ArrayRef)
        }
        (Timestamp(from_unit, _), Date64) => {
            let from_size = time_unit_multiple(from_unit);
            let to_size = MILLISECONDS;

            // Scale time_array by (to_size / from_size) using a
            // single integer operation, but need to avoid integer
            // math rounding down to zero

            match to_size.cmp(&from_size) {
                std::cmp::Ordering::Less => {
                    let time_array = Date64Array::from(array.data().clone());
                    Ok(Arc::new(divide(
                        &time_array,
                        &Date64Array::from(vec![from_size / to_size; array.len()]),
                    )?) as ArrayRef)
                }
                std::cmp::Ordering::Equal => {
                    cast_array_data::<Date64Type>(array, to_type.clone())
                }
                std::cmp::Ordering::Greater => {
                    let time_array = Date64Array::from(array.data().clone());
                    Ok(Arc::new(multiply(
                        &time_array,
                        &Date64Array::from(vec![to_size / from_size; array.len()]),
                    )?) as ArrayRef)
                }
            }
        }
        // date64 to timestamp might not make sense,
        (Int64, Duration(to_unit)) => {
            use TimeUnit::*;
            match to_unit {
                Second => cast_array_data::<DurationSecondType>(array, to_type.clone()),
                Millisecond => {
                    cast_array_data::<DurationMillisecondType>(array, to_type.clone())
                }
                Microsecond => {
                    cast_array_data::<DurationMicrosecondType>(array, to_type.clone())
                }
                Nanosecond => {
                    cast_array_data::<DurationNanosecondType>(array, to_type.clone())
                }
            }
        }

        // null to primitive/flat types
        (Null, Int32) => Ok(Arc::new(Int32Array::from(vec![None; array.len()]))),

        (_, _) => Err(ArrowError::CastError(format!(
            "Casting from {:?} to {:?} not supported",
            from_type, to_type,
        ))),
    }
}
/// Get the time unit as a multiple of a second
const fn time_unit_multiple(unit: &TimeUnit) -> i64 {
match unit {
TimeUnit::Second => 1,
TimeUnit::Millisecond => MILLISECONDS,
TimeUnit::Microsecond => MICROSECONDS,
TimeUnit::Nanosecond => NANOSECONDS,
}
}
/// Number of seconds in a day
const SECONDS_IN_DAY: i64 = 86_400;
/// Number of milliseconds in a second
const MILLISECONDS: i64 = 1_000;
/// Number of microseconds in a second
const MICROSECONDS: i64 = 1_000_000;
/// Number of nanoseconds in a second
const NANOSECONDS: i64 = 1_000_000_000;
/// Number of milliseconds in a day
const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
/// Day number of 1970-01-01 as reported by chrono's `num_days_from_ce()`,
/// which counts 0001-01-01 as day 1. Subtracting this constant from
/// `num_days_from_ce()` rebases a date onto days since the UNIX epoch.
const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// Re-labels an array as `to_type` without touching its buffers.
///
/// The length, null count, validity bitmap, offset and value buffers of
/// `array` are carried over unchanged; only the data type is replaced, and the
/// result is wrapped as a `PrimitiveArray<TO>`.
///
/// The caller must make sure the source and target share the same native
/// representation. No check is performed here because the function is only
/// used internally, where that is guaranteed.
#[allow(clippy::unnecessary_wraps)]
fn cast_array_data<TO>(array: &ArrayRef, to_type: DataType) -> Result<ArrayRef>
where
    TO: ArrowNumericType,
{
    let source = array.data();
    let validity = source.null_bitmap().clone().map(|bitmap| bitmap.bits);
    let retyped = ArrayData::new(
        to_type,
        array.len(),
        Some(array.null_count()),
        validity,
        source.offset(),
        source.buffers().to_vec(),
        // primitive arrays have no child data
        vec![],
    );
    let primitive = PrimitiveArray::<TO>::from(retyped);
    Ok(Arc::new(primitive) as ArrayRef)
}
/// Downcasts `from` to a `PrimitiveArray<FROM>` and numerically casts every
/// element to `TO`, returning the result as an `ArrayRef`.
///
/// # Panics
///
/// Panics if `from` is not actually a `PrimitiveArray<FROM>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_numeric_arrays<FROM, TO>(from: &ArrayRef) -> Result<ArrayRef>
where
    FROM: ArrowNumericType,
    TO: ArrowNumericType,
    FROM::Native: num::NumCast,
    TO::Native: num::NumCast,
{
    let source = from
        .as_any()
        .downcast_ref::<PrimitiveArray<FROM>>()
        .unwrap();
    let converted = numeric_cast::<FROM, TO>(source);
    Ok(Arc::new(converted))
}
/// Casts each element of `from` to the native type of `R` using `num`'s
/// checked conversion.
///
/// Null inputs stay null, and any value for which the conversion returns
/// `None` is emitted as `None` as well.
fn numeric_cast<T, R>(from: &PrimitiveArray<T>) -> PrimitiveArray<R>
where
    T: ArrowNumericType,
    R: ArrowNumericType,
    T::Native: num::NumCast,
    R::Native: num::NumCast,
{
    let converted = from.iter().map(|slot| match slot {
        Some(native) => num::cast::cast::<T::Native, R::Native>(native),
        None => None,
    });
    // SAFETY: the iterator is derived from a `PrimitiveArray`, whose iterator
    // reports an exact length, and `map` does not change that length.
    unsafe { PrimitiveArray::<R>::from_trusted_len_iter(converted) }
}
/// Formats every element of a numeric array as a string, producing a
/// `GenericStringArray<OffsetSize>` wrapped in an `ArrayRef`.
///
/// # Panics
///
/// Panics if `array` is not a `PrimitiveArray<FROM>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_numeric_to_string<FROM, OffsetSize>(array: &ArrayRef) -> Result<ArrayRef>
where
    FROM: ArrowNumericType,
    FROM::Native: lexical_core::ToLexical,
    OffsetSize: StringOffsetSizeTrait,
{
    let numbers = array
        .as_any()
        .downcast_ref::<PrimitiveArray<FROM>>()
        .unwrap();
    let strings = numeric_to_string_cast::<FROM, OffsetSize>(numbers);
    Ok(Arc::new(strings))
}
/// Builds a string array by running `lexical_to_string` over every non-null
/// element of `from`; `None` entries are passed through to the collected
/// array untouched.
fn numeric_to_string_cast<T, OffsetSize>(
    from: &PrimitiveArray<T>,
) -> GenericStringArray<OffsetSize>
where
    T: ArrowPrimitiveType + ArrowNumericType,
    T::Native: lexical_core::ToLexical,
    OffsetSize: StringOffsetSizeTrait,
{
    let formatted = from.iter().map(|slot| slot.map(lexical_to_string));
    formatted.collect::<GenericStringArray<OffsetSize>>()
}
/// Cast Utf8 / LargeUtf8 strings to a numeric type.
///
/// Downcasts `from` to a `GenericStringArray<Offset>` and parses every value
/// into `T`. How unparseable strings are treated depends on
/// `cast_options.safe`; see `string_to_numeric_cast`.
///
/// # Panics
///
/// Panics if `from` is not a `GenericStringArray<Offset>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_string_to_numeric<T, Offset: StringOffsetSizeTrait>(
    from: &ArrayRef,
    cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    T: ArrowNumericType,
    <T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
{
    let strings = from
        .as_any()
        .downcast_ref::<GenericStringArray<Offset>>()
        .unwrap();
    let numbers = string_to_numeric_cast::<T, Offset>(strings, cast_options)?;
    Ok(Arc::new(numbers))
}
/// Parses every string in `from` into the native type of `T`.
///
/// * With `cast_options.safe == true`, strings that fail to parse become null.
/// * With `cast_options.safe == false`, the first string that fails to parse
///   aborts the cast with an `ArrowError::CastError` naming the string and
///   the target type.
///
/// Null inputs are null in the output in both modes.
fn string_to_numeric_cast<T, Offset: StringOffsetSizeTrait>(
    from: &GenericStringArray<Offset>,
    cast_options: &CastOptions,
) -> Result<PrimitiveArray<T>>
where
    T: ArrowNumericType,
    <T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
{
    if !cast_options.safe {
        // Strict mode: gather into a Vec first so that a parse failure can be
        // reported before any array is built.
        let parsed = (0..from.len())
            .map(|row| {
                if from.is_valid(row) {
                    let text = from.value(row);
                    lexical_core::parse::<T::Native>(text.as_bytes())
                        .map(Some)
                        .map_err(|_| {
                            ArrowError::CastError(format!(
                                "Cannot cast string '{}' to value of {} type",
                                text,
                                std::any::type_name::<T>()
                            ))
                        })
                } else {
                    Ok(None)
                }
            })
            .collect::<Result<Vec<Option<T::Native>>>>()?;
        // SAFETY: a slice iterator over the Vec reports its exact length.
        return Ok(unsafe { PrimitiveArray::<T>::from_trusted_len_iter(parsed.iter()) });
    }

    // Lenient mode: parse failures silently turn into nulls.
    let parsed = (0..from.len()).map(|row| {
        if from.is_valid(row) {
            lexical_core::parse::<T::Native>(from.value(row).as_bytes()).ok()
        } else {
            None
        }
    });
    // SAFETY: the iterator is a `map` over a `Range`, whose length is exact.
    Ok(unsafe { PrimitiveArray::<T>::from_trusted_len_iter(parsed) })
}
/// Parses a string array into a `Date32Array` (days since the UNIX epoch).
///
/// Each value is parsed as a `chrono::NaiveDate`. With `cast_options.safe`
/// set, unparseable strings become null; otherwise the first failure is
/// returned as an `ArrowError::CastError`.
///
/// # Panics
///
/// Panics if `array` is not a `GenericStringArray<Offset>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_string_to_date32<Offset: StringOffsetSizeTrait>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    use chrono::Datelike;

    let strings = array
        .as_any()
        .downcast_ref::<GenericStringArray<Offset>>()
        .unwrap();

    // Shared by both modes: parse the text and rebase it onto the UNIX epoch.
    let days_since_epoch = |text: &str| {
        text.parse::<chrono::NaiveDate>()
            .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
    };

    let dates = if cast_options.safe {
        let lenient = (0..strings.len()).map(|row| {
            if strings.is_valid(row) {
                days_since_epoch(strings.value(row)).ok()
            } else {
                None
            }
        });
        // SAFETY: the iterator is a `map` over a `Range`, whose length is exact.
        unsafe { Date32Array::from_trusted_len_iter(lenient) }
    } else {
        let strict = (0..strings.len())
            .map(|row| {
                if strings.is_valid(row) {
                    let text = strings.value(row);
                    days_since_epoch(text).map(Some).map_err(|_| {
                        ArrowError::CastError(
                            format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date32Type type", text),
                        )
                    })
                } else {
                    Ok(None)
                }
            })
            .collect::<Result<Vec<Option<i32>>>>()?;
        // SAFETY: a slice iterator over the Vec reports its exact length.
        unsafe { Date32Array::from_trusted_len_iter(strict.iter()) }
    };

    Ok(Arc::new(dates) as ArrayRef)
}
/// Parses a string array into a `Date64Array` (milliseconds since the UNIX
/// epoch).
///
/// Each value is parsed as a `chrono::NaiveDateTime` and converted with
/// `timestamp_millis()`. With `cast_options.safe` set, unparseable strings
/// become null; otherwise the first failure is returned as an
/// `ArrowError::CastError`.
///
/// # Panics
///
/// Panics if `array` is not a `GenericStringArray<Offset>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_string_to_date64<Offset: StringOffsetSizeTrait>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    let strings = array
        .as_any()
        .downcast_ref::<GenericStringArray<Offset>>()
        .unwrap();

    // Shared by both modes: parse the text and convert it to epoch milliseconds.
    let epoch_millis = |text: &str| {
        text.parse::<chrono::NaiveDateTime>()
            .map(|datetime| datetime.timestamp_millis())
    };

    let dates = if cast_options.safe {
        let lenient = (0..strings.len()).map(|row| {
            if strings.is_valid(row) {
                epoch_millis(strings.value(row)).ok()
            } else {
                None
            }
        });
        // SAFETY: the iterator is a `map` over a `Range`, whose length is exact.
        unsafe { Date64Array::from_trusted_len_iter(lenient) }
    } else {
        let strict = (0..strings.len())
            .map(|row| {
                if strings.is_valid(row) {
                    let text = strings.value(row);
                    epoch_millis(text).map(Some).map_err(|_| {
                        ArrowError::CastError(
                            format!("Cannot cast string '{}' to value of arrow::datatypes::types::Date64Type type", text),
                        )
                    })
                } else {
                    Ok(None)
                }
            })
            .collect::<Result<Vec<Option<i64>>>>()?;
        // SAFETY: a slice iterator over the Vec reports its exact length.
        unsafe { Date64Array::from_trusted_len_iter(strict.iter()) }
    };

    Ok(Arc::new(dates) as ArrayRef)
}
/// Parses a string array into a `TimestampNanosecondArray` using
/// `string_to_timestamp_nanos`.
///
/// With `cast_options.safe` set, strings that fail to parse become null;
/// otherwise the first error produced by `string_to_timestamp_nanos` is
/// returned as-is.
///
/// # Panics
///
/// Panics if `array` is not a `GenericStringArray<Offset>`.
#[allow(clippy::unnecessary_wraps)]
fn cast_string_to_timestamp_ns<Offset: StringOffsetSizeTrait>(
    array: &dyn Array,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    let strings = array
        .as_any()
        .downcast_ref::<GenericStringArray<Offset>>()
        .unwrap();

    let timestamps = if cast_options.safe {
        let lenient = (0..strings.len()).map(|row| {
            if strings.is_valid(row) {
                string_to_timestamp_nanos(strings.value(row)).ok()
            } else {
                None
            }
        });
        // SAFETY: the iterator is a `map` over a `Range`, whose length is exact.
        unsafe { TimestampNanosecondArray::from_trusted_len_iter(lenient) }
    } else {
        let strict = (0..strings.len())
            .map(|row| {
                if strings.is_valid(row) {
                    string_to_timestamp_nanos(strings.value(row)).map(Some)
                } else {
                    Ok(None)
                }
            })
            .collect::<Result<Vec<Option<i64>>>>()?;
        // SAFETY: a slice iterator over the Vec reports its exact length.
        unsafe { TimestampNanosecondArray::from_trusted_len_iter(strict.iter()) }
    };

    Ok(Arc::new(timestamps) as ArrayRef)
}
/// Cast a numeric array to Boolean.
///
/// Values equal to the type's default (zero) become `false`; every other
/// value becomes `true`.
///
/// # Panics
///
/// Panics if `from` is not a `PrimitiveArray<FROM>`.
fn cast_numeric_to_bool<FROM>(from: &ArrayRef) -> Result<ArrayRef>
where
    FROM: ArrowNumericType,
{
    let numbers = from
        .as_any()
        .downcast_ref::<PrimitiveArray<FROM>>()
        .unwrap();
    let booleans = numeric_to_bool_cast::<FROM>(numbers)?;
    Ok(Arc::new(booleans) as ArrayRef)
}
/// Builds a `BooleanArray` from a numeric array: nulls stay null, values
/// different from `T::default_value()` map to `true`, all others to `false`.
fn numeric_to_bool_cast<T>(from: &PrimitiveArray<T>) -> Result<BooleanArray>
where
    T: ArrowPrimitiveType + ArrowNumericType,
{
    let row_count = from.len();
    let mut builder = BooleanBuilder::new(row_count);
    for row in 0..row_count {
        if from.is_valid(row) {
            builder.append_value(from.value(row) != T::default_value())?;
        } else {
            builder.append_null()?;
        }
    }
    Ok(builder.finish())
}
/// Cast a Boolean array to a numeric type.
///
/// `true` becomes 1 and `false` becomes the type's default value (0).
///
/// # Panics
///
/// Panics if `from` is not a `BooleanArray`.
#[allow(clippy::unnecessary_wraps)]
fn cast_bool_to_numeric<TO>(
    from: &ArrayRef,
    cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    TO: ArrowNumericType,
    TO::Native: num::cast::NumCast,
{
    let booleans = from.as_any().downcast_ref::<BooleanArray>().unwrap();
    let numbers = bool_to_numeric_cast::<TO>(booleans, cast_options);
    Ok(Arc::new(numbers))
}
/// Maps each boolean to a number: null stays null, `true` becomes 1 and
/// `false` becomes `T::default_value()`.
///
/// The cast options are accepted for signature symmetry but are not used.
fn bool_to_numeric_cast<T>(
    from: &BooleanArray,
    _cast_options: &CastOptions,
) -> PrimitiveArray<T>
where
    T: ArrowNumericType,
    T::Native: num::NumCast,
{
    let numbers = (0..from.len()).map(|row| {
        if from.is_valid(row) {
            if from.value(row) {
                // there is no generic literal for "one", so go through NumCast
                num::cast::cast::<i32, T::Native>(1)
            } else {
                Some(T::default_value())
            }
        } else {
            None
        }
    });
    // SAFETY: the iterator is a `map` over a `Range`, whose length is exact.
    unsafe { PrimitiveArray::<T>::from_trusted_len_iter(numbers) }
}
/// Casts a `DictionaryArray` with key type `K` into `to_type`.
///
/// * If `to_type` is itself a dictionary, keys and values are cast separately
///   to the requested index and value types and reassembled into a new
///   dictionary array.
/// * For any other target the dictionary is flattened via
///   `unpack_dictionary`.
///
/// # Errors
///
/// Returns a `ComputeError` if `array` is not a `DictionaryArray<K>`, or if
/// casting the keys introduced additional nulls (keys that do not fit in the
/// target index type). Returns a `CastError` if the target index type is not
/// an integer type. Errors from the underlying key/value casts are
/// propagated.
fn dictionary_cast<K: ArrowDictionaryKeyType>(
    array: &ArrayRef,
    to_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    use DataType::*;

    // Anything that is not dictionary -> dictionary gets flattened instead.
    let (to_index_type, to_value_type) = match to_type {
        Dictionary(index_type, value_type) => (index_type, value_type),
        _ => return unpack_dictionary::<K>(array, to_type, cast_options),
    };

    let dict_array = array
        .as_any()
        .downcast_ref::<DictionaryArray<K>>()
        .ok_or_else(|| {
            ArrowError::ComputeError(
                "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
            )
        })?;

    let source_keys: ArrayRef = Arc::new(dict_array.keys_array());
    let cast_keys = cast_with_options(&source_keys, to_index_type, cast_options)?;
    let cast_values = cast_with_options(dict_array.values(), to_value_type, cast_options)?;

    // A key that does not fit in the target index type comes back as NULL,
    // so extra nulls after the cast mean keys were lost.
    let nulls_before = source_keys.null_count();
    let nulls_after = cast_keys.null_count();
    if nulls_after > nulls_before {
        return Err(ArrowError::ComputeError(format!(
            "Could not convert {} dictionary indexes from {:?} to {:?}",
            nulls_after - nulls_before,
            source_keys.data_type(),
            to_index_type
        )));
    }

    // The keys supply the array's own buffers; the values become child data.
    let key_data = cast_keys.data();
    let data = ArrayData::new(
        to_type.clone(),
        cast_keys.len(),
        Some(nulls_after),
        key_data.null_bitmap().clone().map(|bitmap| bitmap.bits),
        key_data.offset(),
        key_data.buffers().to_vec(),
        vec![cast_values.data().clone()],
    );

    // Wrap the data in the dictionary array matching the new index type.
    let new_array: ArrayRef = match **to_index_type {
        Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
        Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
        Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
        Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
        UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
        UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
        UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
        UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
        _ => {
            return Err(ArrowError::CastError(format!(
                "Unsupported type {:?} for dictionary index",
                to_index_type
            )))
        }
    };
    Ok(new_array)
}
/// Flattens a dictionary with keys of type `K` into a plain array of
/// `to_type`.
///
/// The dictionary values are cast to `to_type` first, and the `take` kernel
/// then expands them using the keys (cast to `UInt32`) as indices.
///
/// # Errors
///
/// Returns a `ComputeError` if `array` is not a `DictionaryArray<K>` or the
/// cast keys are not a `UInt32Array`; errors from the casts and from `take`
/// are propagated.
fn unpack_dictionary<K>(
    array: &ArrayRef,
    to_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    K: ArrowDictionaryKeyType,
{
    let dictionary = match array.as_any().downcast_ref::<DictionaryArray<K>>() {
        Some(dictionary) => dictionary,
        None => {
            return Err(ArrowError::ComputeError(
                "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
            ))
        }
    };

    // Cast the (usually small) set of distinct values once, up front.
    let flat_values = cast_with_options(&dictionary.values(), to_type, cast_options)?;

    // `take` needs its indices as u32.
    let keys: ArrayRef = Arc::new(dictionary.keys_array());
    let indices = cast_with_options(&keys, &DataType::UInt32, cast_options)?;
    let u32_indices = match indices.as_any().downcast_ref::<UInt32Array>() {
        Some(u32_indices) => u32_indices,
        None => {
            return Err(ArrowError::ComputeError(
                "Internal Error: Cannot cast dict indices to UInt32".to_string(),
            ))
        }
    };

    take(flat_values.as_ref(), u32_indices, None)
}
/// Encodes `array` as a dictionary array with key type `K` and values of
/// `dict_value_type`.
///
/// Integer value types are delegated to `pack_numeric_to_dictionary`, `Utf8`
/// to `pack_string_to_dictionary`.
///
/// # Errors
///
/// Returns a `CastError` for any other value type; errors from the packing
/// helpers are propagated.
fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
    array: &ArrayRef,
    dict_value_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    use DataType::*;

    // Every numeric arm makes the same call and differs only in the value type.
    macro_rules! pack_numeric {
        ($value_type:ty) => {
            pack_numeric_to_dictionary::<K, $value_type>(
                array,
                dict_value_type,
                cast_options,
            )
        };
    }

    match dict_value_type {
        Int8 => pack_numeric!(Int8Type),
        Int16 => pack_numeric!(Int16Type),
        Int32 => pack_numeric!(Int32Type),
        Int64 => pack_numeric!(Int64Type),
        UInt8 => pack_numeric!(UInt8Type),
        UInt16 => pack_numeric!(UInt16Type),
        UInt32 => pack_numeric!(UInt32Type),
        UInt64 => pack_numeric!(UInt64Type),
        Utf8 => pack_string_to_dictionary::<K>(array, cast_options),
        _ => Err(ArrowError::CastError(format!(
            "Unsupported output type for dictionary packing: {:?}",
            dict_value_type
        ))),
    }
}
/// Packs a primitive array into a `DictionaryArray` with keys of type `K` and
/// values of type `V`.
///
/// The input is first cast to `dict_value_type`, then every element is fed
/// through a `PrimitiveDictionaryBuilder`.
///
/// # Panics
///
/// Panics if the cast result is not a `PrimitiveArray<V>`.
fn pack_numeric_to_dictionary<K, V>(
    array: &ArrayRef,
    dict_value_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    K: ArrowDictionaryKeyType,
    V: ArrowNumericType,
{
    // Bring the source into the dictionary's value type before encoding.
    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
    let values = cast_values
        .as_any()
        .downcast_ref::<PrimitiveArray<V>>()
        .unwrap();

    let mut builder = PrimitiveDictionaryBuilder::new(
        PrimitiveBuilder::<K>::new(values.len()),
        PrimitiveBuilder::<V>::new(values.len()),
    );

    // Elements are appended one by one through the dictionary builder.
    for slot in values.iter() {
        match slot {
            Some(value) => {
                builder.append(value)?;
            }
            None => builder.append_null()?,
        }
    }
    Ok(Arc::new(builder.finish()))
}
/// Packs an array into a string dictionary array with keys of type `K`.
///
/// The input is first cast to `Utf8`, then every element is fed through a
/// `StringDictionaryBuilder`.
///
/// # Panics
///
/// Panics if the cast result is not a `StringArray`.
fn pack_string_to_dictionary<K>(
    array: &ArrayRef,
    cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    K: ArrowDictionaryKeyType,
{
    let cast_values = cast_with_options(array, &DataType::Utf8, cast_options)?;
    let strings = cast_values.as_any().downcast_ref::<StringArray>().unwrap();

    let row_count = strings.len();
    let mut builder = StringDictionaryBuilder::new(
        PrimitiveBuilder::<K>::new(row_count),
        StringBuilder::new(row_count),
    );

    // Elements are appended one by one through the dictionary builder.
    for row in 0..row_count {
        if strings.is_valid(row) {
            builder.append(strings.value(row))?;
        } else {
            builder.append_null()?;
        }
    }
    Ok(Arc::new(builder.finish()))
}
/// Wraps a primitive array into a (generic) list array in which every row is
/// a single-element list.
///
/// The input is cast to the list's item type (`to.data_type()`), and offsets
/// `[0, 1, ..., len]` are generated so that row `i` covers exactly value `i`.
/// The list's null count and validity bitmap are copied from the cast values.
fn cast_primitive_to_list<OffsetSize: OffsetSizeTrait + NumCast>(
    array: &ArrayRef,
    to: &Field,
    to_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    // The list's child: the input cast to the item type.
    let child = cast_with_options(array, to.data_type(), cast_options)?;
    let row_count = array.len();

    // One offset per row plus the terminating one, e.g. [0, 1, 2] for 2 rows.
    // A generic range cannot be built directly, hence the per-element cast.
    // SAFETY: the iterator is a `map` over a `RangeInclusive`, whose length
    // is exact.
    let offsets = unsafe {
        MutableBuffer::from_trusted_len_iter(
            (0..=row_count).map(|position| OffsetSize::from(position).expect("integer")),
        )
    };

    let child_data = child.data();
    // NOTE(review): the child's bitmap is reused with a list offset of 0;
    // presumably the cast child always starts at offset 0 - confirm for
    // sliced inputs.
    let validity = child_data.null_bitmap().clone().map(|bitmap| bitmap.bits);
    let list_data = ArrayData::new(
        to_type.clone(),
        row_count,
        Some(child.null_count()),
        validity,
        0,
        vec![offsets.into()],
        vec![child_data.clone()],
    );

    Ok(Arc::new(GenericListArray::<OffsetSize>::from(list_data)) as ArrayRef)
}
/// Casts the item type of a (generic) list array while keeping the list
/// structure.
///
/// The first child of `array` is cast to `to.data_type()`; the original
/// offset buffer and array offset are reused for the resulting list.
fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
    array: &Arc<dyn Array>,
    to: &Field,
    to_type: &DataType,
    cast_options: &CastOptions,
) -> Result<ArrayRef> {
    let source = array.data_ref();
    let items = make_array(source.child_data()[0].clone());
    let cast_items = cast_with_options(&items, to.data_type(), cast_options)?;

    // NOTE(review): the list's null count and validity bitmap are taken from
    // the cast child values rather than from the source list's own bitmap -
    // confirm this is intended.
    let item_data = cast_items.data();
    let validity = item_data.null_bitmap().clone().map(|bitmap| bitmap.bits);
    let list_data = ArrayData::new(
        to_type.clone(),
        array.len(),
        Some(cast_items.null_count()),
        validity,
        array.offset(),
        // the offsets are unaffected by the item cast, so reuse them
        source.buffers().to_vec(),
        vec![item_data.clone()],
    );

    Ok(Arc::new(GenericListArray::<OffsetSize>::from(list_data)) as ArrayRef)
}
/// Converts between `Utf8` and `LargeUtf8` by re-encoding the offsets.
///
/// The string bytes and the validity buffer are reused; only the offsets are
/// converted from `OffsetSizeFrom` to `OffsetSizeTo`. The output data type is
/// `LargeUtf8` when `OffsetSizeTo` is 8 bytes wide and `Utf8` otherwise.
///
/// # Errors
///
/// Returns a `ComputeError` if an offset does not fit in `OffsetSizeTo`.
///
/// # Panics
///
/// Panics if `array` is not a `GenericStringArray<OffsetSizeFrom>`.
fn cast_str_container<OffsetSizeFrom, OffsetSizeTo>(array: &dyn Array) -> Result<ArrayRef>
where
    OffsetSizeFrom: StringOffsetSizeTrait + ToPrimitive,
    OffsetSizeTo: StringOffsetSizeTrait + NumCast + ArrowNativeType,
{
    let strings = array
        .as_any()
        .downcast_ref::<GenericStringArray<OffsetSizeFrom>>()
        .unwrap();
    let source_data = array.data();
    let value_bytes = strings.value_data();
    // The first buffer of a string array holds its offsets.
    let source_offsets =
        unsafe { source_data.buffers()[0].typed_data::<OffsetSizeFrom>() };

    let mut converted_offsets = BufferBuilder::<OffsetSizeTo>::new(source_offsets.len());
    for offset in source_offsets {
        let converted = OffsetSizeTo::from(*offset).ok_or_else(|| {
            ArrowError::ComputeError(
                "large-utf8 array too large to cast to utf8-array".into(),
            )
        })?;
        converted_offsets.append(converted);
    }
    let offset_buffer = converted_offsets.finish();

    // The width of the target offset decides the output string type.
    let dtype = match std::mem::size_of::<OffsetSizeTo>() {
        8 => DataType::LargeUtf8,
        _ => DataType::Utf8,
    };

    let builder = ArrayData::builder(dtype)
        .offset(array.offset())
        .len(array.len())
        .add_buffer(offset_buffer)
        .add_buffer(value_bytes);
    let builder = match source_data.null_buffer() {
        Some(nulls) => builder.null_bit_buffer(nulls.clone()),
        None => builder,
    };
    let data = builder.build();

    Ok(Arc::new(GenericStringArray::<OffsetSizeTo>::from(data)))
}
/// Converts between `List` and `LargeList` without touching the item type.
///
/// The child values and validity buffer are reused as-is; only the offsets
/// are re-encoded from `OffsetSizeFrom` to `OffsetSizeTo`.
///
/// # Errors
///
/// Returns a `ComputeError` when a `LargeList` holds more than `i32::MAX`
/// child values and therefore cannot be addressed by a `List`.
///
/// # Panics
///
/// Panics if the offset type parameters do not match the direction implied
/// by the array's data type, or if the array is not a list at all.
fn cast_list_container<OffsetSizeFrom, OffsetSizeTo>(
    array: &dyn Array,
    _cast_options: &CastOptions,
) -> Result<ArrayRef>
where
    OffsetSizeFrom: OffsetSizeTrait + ToPrimitive,
    OffsetSizeTo: OffsetSizeTrait + NumCast,
{
    let data = array.data_ref();
    // the values stored by the list
    let value_data = data.child_data()[0].clone();

    let from_width = std::mem::size_of::<OffsetSizeFrom>();
    let to_width = std::mem::size_of::<OffsetSizeTo>();

    let out_dtype = match array.data_type() {
        DataType::List(value_type) => {
            // List -> LargeList: i32 offsets widen to i64
            assert_eq!(from_width, std::mem::size_of::<i32>());
            assert_eq!(to_width, std::mem::size_of::<i64>());
            DataType::LargeList(value_type.clone())
        }
        DataType::LargeList(value_type) => {
            // LargeList -> List: i64 offsets narrow to i32
            assert_eq!(from_width, std::mem::size_of::<i64>());
            assert_eq!(to_width, std::mem::size_of::<i32>());
            if value_data.len() > i32::MAX as usize {
                return Err(ArrowError::ComputeError(
                    "LargeList too large to cast to List".into(),
                ));
            }
            DataType::List(value_type.clone())
        }
        // callers only dispatch here for list types
        _ => unreachable!(),
    };

    // SAFETY: the first buffer holds the offsets, which are aligned to
    // `OffsetSizeFrom` (i32 or i64). The raw buffer is read instead of the
    // offset-aware accessor because the array offset is applied again below
    // through the builder.
    let (_, source_offsets, _) =
        unsafe { data.buffers()[0].as_slice().align_to::<OffsetSizeFrom>() };
    let converted = source_offsets
        .iter()
        .map(|offset| -> OffsetSizeTo { NumCast::from(*offset).unwrap() });
    // SAFETY: a slice iterator reports its exact length and `map` keeps it.
    let offset_buffer = unsafe { Buffer::from_trusted_len_iter(converted) };

    let builder = ArrayData::builder(out_dtype)
        .offset(array.offset())
        .len(array.len())
        .add_buffer(offset_buffer)
        .add_child_data(value_data);
    let builder = match data.null_buffer() {
        Some(nulls) => builder.null_bit_buffer(nulls.clone()),
        None => builder,
    };

    Ok(make_array(builder.build()))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{buffer::Buffer, util::display::array_value_to_string};
#[test]
fn test_cast_i32_to_f64() {
    let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
    let array = Arc::new(a) as ArrayRef;
    let b = cast(&array, &DataType::Float64).unwrap();
    let c = b.as_any().downcast_ref::<Float64Array>().unwrap();
    // Compare the absolute difference: without `abs()` any result larger
    // than the expected value yields a negative difference and passes.
    assert!((5.0 - c.value(0)).abs() < f64::EPSILON);
    assert!((6.0 - c.value(1)).abs() < f64::EPSILON);
    assert!((7.0 - c.value(2)).abs() < f64::EPSILON);
    assert!((8.0 - c.value(3)).abs() < f64::EPSILON);
    assert!((9.0 - c.value(4)).abs() < f64::EPSILON);
}
#[test]
fn test_cast_i32_to_u8() {
let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]);
let array = Arc::new(a) as ArrayRef;
let b = cast(&array, &DataType::UInt8).unwrap();
let c = b.as_any().downcast_ref::<UInt8Array>().unwrap();
assert_eq!(false, c.is_valid(0));
assert_eq!(6, c.value(1));
assert_eq!(false, c.is_valid(2));
assert_eq!(8, c.value(3));
// overflows return None
assert_eq!(false, c.is_valid(4));
}
#[test]
fn test_cast_i32_to_u8_sliced() {
let a = Int32Array::from(vec![-5, 6, -7, 8, 100000000]);
let array = Arc::new(a) as ArrayRef;
assert_eq!(0, array.offset());
let array = array.slice(2, 3);
assert_eq!(2, array.offset());
let b = cast(&array, &DataType::UInt8).unwrap();
assert_eq!(3, b.len());
assert_eq!(0, b.offset());
let c = b.as_any().downcast_ref::<UInt8Array>().unwrap();
assert_eq!(false, c.is_valid(0));
assert_eq!(8, c.value(1));
// overflows return None
assert_eq!(false, c.is_valid(2));
}
#[test]
fn test_cast_i32_to_i32() {
let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
let array = Arc::new(a) as ArrayRef;
let b = cast(&array, &DataType::Int32).unwrap();
let c = b.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(5, c.value(0));
assert_eq!(6, c.value(1));
assert_eq!(7, c.value(2));
assert_eq!(8, c.value(3));
assert_eq!(9, c.value(4));
}
#[test]
fn test_cast_i32_to_list_i32() {
let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
let array = Arc::new(a) as ArrayRef;
let b = cast(
&array,
&DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
)
.unwrap();
assert_eq!(5, b.len());
let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets());
assert_eq!(1, arr.value_length(0));
assert_eq!(1, arr.value_length(1));
assert_eq!(1, arr.value_length(2));
assert_eq!(1, arr.value_length(3));
assert_eq!(1, arr.value_length(4));
let values = arr.values();
let c = values.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(5, c.value(0));
assert_eq!(6, c.value(1));
assert_eq!(7, c.value(2));
assert_eq!(8, c.value(3));
assert_eq!(9, c.value(4));
}
#[test]
fn test_cast_i32_to_list_i32_nullable() {
let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]);
let array = Arc::new(a) as ArrayRef;
let b = cast(
&array,
&DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
)
.unwrap();
assert_eq!(5, b.len());
assert_eq!(1, b.null_count());
let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(&[0, 1, 2, 3, 4, 5], arr.value_offsets());
assert_eq!(1, arr.value_length(0));
assert_eq!(1, arr.value_length(1));
assert_eq!(1, arr.value_length(2));
assert_eq!(1, arr.value_length(3));
assert_eq!(1, arr.value_length(4));
let values = arr.values();
let c = values.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(1, c.null_count());
assert_eq!(5, c.value(0));
assert_eq!(false, c.is_valid(1));
assert_eq!(7, c.value(2));
assert_eq!(8, c.value(3));
assert_eq!(9, c.value(4));
}
#[test]
fn test_cast_i32_to_list_f64_nullable_sliced() {
    let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]);
    let array = Arc::new(a) as ArrayRef;
    // keep rows 2..6: [7, 8, null, 10]
    let array = array.slice(2, 4);
    let b = cast(
        &array,
        &DataType::List(Box::new(Field::new("item", DataType::Float64, true))),
    )
    .unwrap();
    assert_eq!(4, b.len());
    assert_eq!(1, b.null_count());
    let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
    assert_eq!(&[0, 1, 2, 3, 4], arr.value_offsets());
    assert_eq!(1, arr.value_length(0));
    assert_eq!(1, arr.value_length(1));
    assert_eq!(1, arr.value_length(2));
    assert_eq!(1, arr.value_length(3));
    let values = arr.values();
    let c = values.as_any().downcast_ref::<Float64Array>().unwrap();
    assert_eq!(1, c.null_count());
    // Compare the absolute difference: without `abs()` any result larger
    // than the expected value yields a negative difference and passes.
    assert!((7.0 - c.value(0)).abs() < f64::EPSILON);
    assert!((8.0 - c.value(1)).abs() < f64::EPSILON);
    assert_eq!(false, c.is_valid(2));
    assert!((10.0 - c.value(3)).abs() < f64::EPSILON);
}
#[test]
fn test_cast_utf8_to_i32() {
let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]);
let array = Arc::new(a) as ArrayRef;
let b = cast(&array, &DataType::Int32).unwrap();
let c = b.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(5, c.value(0));
assert_eq!(6, c.value(1));
assert_eq!(false, c.is_valid(2));
assert_eq!(8, c.value(3));
assert_eq!(false, c.is_valid(4));
}
#[test]
fn test_cast_with_options_utf8_to_i32() {
let a = StringArray::from(vec!["5", "6", "seven", "8", "9.1"]);
let array = Arc::new(a) as ArrayRef;
let result =
cast_with_options(&array, &DataType::Int32, &CastOptions { safe: false });
match result {
Ok(_) => panic!("expected error"),
Err(e) => {
assert!(e.to_string().contains(
"Cast error: Cannot cast string 'seven' to value of arrow::datatypes::types::Int32Type type"
))
}
}
}
#[test]
fn test_cast_bool_to_i32() {
let a = BooleanArray::from(vec![Some(true), Some(false), None]);
let array = Arc::new(a) as ArrayRef;
let b = cast(&array, &DataType::Int32).unwrap();
let c = b.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(1, c.value(0));
assert_eq!(0, c.value(1));
assert_eq!(false, c.is_valid(2));
}
#[test]
fn test_cast_bool_to_f64() {
    let a = BooleanArray::from(vec![Some(true), Some(false), None]);
    let array = Arc::new(a) as ArrayRef;
    let b = cast(&array, &DataType::Float64).unwrap();
    let c = b.as_any().downcast_ref::<Float64Array>().unwrap();
    // Compare the absolute difference: without `abs()` any result larger
    // than the expected value yields a negative difference and passes.
    assert!((1.0 - c.value(0)).abs() < f64::EPSILON);
    assert!((0.0 - c.value(1)).abs() < f64::EPSILON);
    assert_eq!(false, c.is_valid(2));
}
#[test]
#[should_panic(
expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported"
)]
fn test_cast_int32_to_timestamp() {
let a = Int32Array::from(vec![Some(2), Some(10), None]);
let array = Arc::new(a) as ArrayRef;
cast(&array, &DataType::Timestamp(TimeUnit::Microsecond, None)).unwrap();
}
#[test]
fn test_cast_list_i32_to_list_u16() {
// Construct a value array
let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 100000000])
.data()
.clone();
let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 8]);
// Construct a list array from the above two
let list_data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
.add_child_data(value_data)
.build();
let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;
let cast_array = cast(
&list_array,
&DataType::List(Box::new(Field::new("item", DataType::UInt16, true))),
)
.unwrap();
// 3 negative values should get lost when casting to unsigned,
// 1 value should overflow
assert_eq!(4, cast_array.null_count());
// offsets should be the same
assert_eq!(
list_array.data().buffers().to_vec(),
cast_array.data().buffers().to_vec()
);
let array = cast_array
.as_ref()
.as_any()
.downcast_ref::<ListArray>()
.unwrap();
assert_eq!(DataType::UInt16, array.value_type());
assert_eq!(4, array.values().null_count());
assert_eq!(3, array.value_length(0));
assert_eq!(3, array.value_length(1));
assert_eq!(2, array.value_length(2));
let values = array.values();
let u16arr = values.as_any().downcast_ref::<UInt16Array>().unwrap();
assert_eq!(8, u16arr.len());
assert_eq!(4, u16arr.null_count());
assert_eq!(0, u16arr.value(0));
assert_eq!(0, u16arr.value(1));
assert_eq!(0, u16arr.value(2));
assert_eq!(false, u16arr.is_valid(3));
assert_eq!(false, u16arr.is_valid(4));
assert_eq!(false, u16arr.is_valid(5));
assert_eq!(2, u16arr.value(6));
assert_eq!(false, u16arr.is_valid(7));
}
#[test]
#[should_panic(
expected = "Casting from Int32 to Timestamp(Microsecond, None) not supported"
)]
fn test_cast_list_i32_to_list_timestamp() {
// Construct a value array
let value_data = Int32Array::from(vec![0, 0, 0, -1, -2, -1, 2, 8, 100000000])
.data()
.clone();
let value_offsets = Buffer::from_slice_ref(&[0, 3, 6, 9]);
// Construct a list array from the above two
let list_data_type =
DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
.add_child_data(value_data)
.build();
let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;
cast(
&list_array,
&DataType::List(Box::new(Field::new(
"item",
DataType::Timestamp(TimeUnit::Microsecond, None),
true,
))),
)
.unwrap();
}
#[test]
fn test_cast_date32_to_date64() {
    // Casting Date32 -> Date64 is expected to scale each value by 86_400_000.
    let array: ArrayRef = Arc::new(Date32Array::from(vec![10000, 17890]));
    let casted = cast(&array, &DataType::Date64).unwrap();
    let dates = casted.as_any().downcast_ref::<Date64Array>().unwrap();

    assert_eq!(864000000000, dates.value(0));
    assert_eq!(1545696000000, dates.value(1));
}
#[test]
fn test_cast_date64_to_date32() {
    // Inputs sit 5 and 1 units past an exact multiple of 86_400_000; the
    // expected outputs show that this remainder is discarded and that the
    // null slot stays null.
    let input = Date64Array::from(vec![Some(864000000005), Some(1545696000001), None]);
    let array: ArrayRef = Arc::new(input);

    let casted = cast(&array, &DataType::Date32).unwrap();
    let days = casted.as_any().downcast_ref::<Date32Array>().unwrap();

    assert_eq!(10000, days.value(0));
    assert_eq!(17890, days.value(1));
    assert!(days.is_null(2));
}
#[test]
fn test_cast_string_to_timestamp() {
    // The same three inputs (valid timestamp, unparseable text, null) are
    // exercised through both the Utf8 and the LargeUtf8 string arrays.
    let utf8: ArrayRef = Arc::new(StringArray::from(vec![
        Some("2020-09-08T12:00:00+00:00"),
        Some("Not a valid date"),
        None,
    ]));
    let large_utf8: ArrayRef = Arc::new(LargeStringArray::from(vec![
        Some("2020-09-08T12:00:00+00:00"),
        Some("Not a valid date"),
        None,
    ]));

    let target = DataType::Timestamp(TimeUnit::Nanosecond, None);
    for array in &[utf8, large_utf8] {
        let casted = cast(array, &target).unwrap();
        let timestamps = casted
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .unwrap();

        assert_eq!(1599566400000000000, timestamps.value(0));
        // Both the unparseable string and the null input come out as null.
        assert!(timestamps.is_null(1));
        assert!(timestamps.is_null(2));
    }
}
#[test]
fn test_cast_date32_to_int32() {
    // The cast is expected to hand back the same numbers it was given.
    let raw = vec![10000, 17890];
    let array: ArrayRef = Arc::new(Date32Array::from(raw.clone()));

    let casted = cast(&array, &DataType::Int32).unwrap();
    let ints = casted.as_any().downcast_ref::<Int32Array>().unwrap();

    for (idx, want) in raw.iter().enumerate() {
        assert_eq!(*want, ints.value(idx));
    }
}
#[test]
fn test_cast_int32_to_date32() {
    // The cast is expected to hand back the same numbers it was given.
    let raw = vec![10000, 17890];
    let array: ArrayRef = Arc::new(Int32Array::from(raw.clone()));

    let casted = cast(&array, &DataType::Date32).unwrap();
    let dates = casted.as_any().downcast_ref::<Date32Array>().unwrap();

    for (idx, want) in raw.iter().enumerate() {
        assert_eq!(*want, dates.value(idx));
    }
}
#[test]
fn test_cast_timestamp_to_date32() {
    // Millisecond timestamps tagged "UTC"; the expected Date32 values equal
    // the inputs divided by 86_400_000 with the remainder dropped.
    let millis = vec![Some(864000000005), Some(1545696000001), None];
    let source =
        TimestampMillisecondArray::from_opt_vec(millis, Some(String::from("UTC")));
    let array: ArrayRef = Arc::new(source);

    let casted = cast(&array, &DataType::Date32).unwrap();
    let days = casted.as_any().downcast_ref::<Date32Array>().unwrap();

    assert_eq!(10000, days.value(0));
    assert_eq!(17890, days.value(1));
    assert!(days.is_null(2));
}
#[test]
fn test_cast_timestamp_to_date64() {
    // Millisecond timestamps without a timezone; the expected Date64 values
    // are numerically identical to the inputs and the null is preserved.
    let millis = vec![Some(864000000005), Some(1545696000001), None];
    let source = TimestampMillisecondArray::from_opt_vec(millis, None);
    let array: ArrayRef = Arc::new(source);

    let casted = cast(&array, &DataType::Date64).unwrap();
    let dates = casted.as_any().downcast_ref::<Date64Array>().unwrap();

    assert_eq!(864000000005, dates.value(0));
    assert_eq!(1545696000001, dates.value(1));
    assert!(dates.is_null(2));
}
#[test]
fn test_cast_timestamp_to_i64() {
    // Millisecond timestamps tagged "UTC" should come back as a plain Int64
    // array holding the same numbers, with the null slot preserved.
    let millis = vec![Some(864000000005), Some(1545696000001), None];
    let source =
        TimestampMillisecondArray::from_opt_vec(millis, Some("UTC".to_string()));
    let array: ArrayRef = Arc::new(source);

    let casted = cast(&array, &DataType::Int64).unwrap();
    let ints = casted.as_any().downcast_ref::<Int64Array>().unwrap();

    // The result must report Int64 as its data type, not just downcast to it.
    assert_eq!(&DataType::Int64, ints.data_type());
    assert_eq!(864000000005, ints.value(0));
    assert_eq!(1545696000001, ints.value(1));
    assert!(ints.is_null(2));
}
#[test]
fn test_cast_between_timestamps() {
    // Millisecond -> second: expected values are the inputs divided by 1000
    // with the sub-second part (005 and 001) discarded.
    let millis = vec![Some(864000003005), Some(1545696002001), None];
    let source = TimestampMillisecondArray::from_opt_vec(millis, None);
    let array: ArrayRef = Arc::new(source);

    let target = DataType::Timestamp(TimeUnit::Second, None);
    let casted = cast(&array, &target).unwrap();
    let seconds = casted
        .as_any()
        .downcast_ref::<TimestampSecondArray>()
        .unwrap();

    assert_eq!(864000003, seconds.value(0));
    assert_eq!(1545696002, seconds.value(1));
    assert!(seconds.is_null(2));
}
#[test]
fn test_cast_to_strings() {
    // One Int32 input is rendered through both string array flavours and must
    // produce the same decimal text in each.
    let numbers: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let expected = vec![Some("1"), Some("2"), Some("3")];

    // Int32 -> Utf8
    let utf8 = cast(&numbers, &DataType::Utf8).unwrap();
    let utf8_values: Vec<_> = utf8
        .as_any()
        .downcast_ref::<StringArray>()
        .unwrap()
        .into_iter()
        .collect();
    assert_eq!(utf8_values, expected);

    // Int32 -> LargeUtf8
    let large_utf8 = cast(&numbers, &DataType::LargeUtf8).unwrap();
    let large_utf8_values: Vec<_> = large_utf8
        .as_any()
        .downcast_ref::<LargeStringArray>()
        .unwrap()
        .into_iter()
        .collect();
    assert_eq!(large_utf8_values, expected);
}
#[test]
fn test_str_to_str_casts() {
    // Two fixtures: one fully populated, one containing a null entry.
    let cases = vec![
        vec![Some("foo"), Some("bar"), Some("ham")],
        vec![Some("foo"), None, Some("bar")],
    ];

    for data in cases {
        // LargeUtf8 -> Utf8: the cast output must match the source element-wise.
        let large: ArrayRef = Arc::new(LargeStringArray::from(data.clone()));
        let narrowed = cast(&large, &DataType::Utf8).unwrap();
        let expect: Vec<_> = large
            .as_any()
            .downcast_ref::<LargeStringArray>()
            .unwrap()
            .into_iter()
            .collect();
        let out: Vec<_> = narrowed
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap()
            .into_iter()
            .collect();
        assert_eq!(expect, out);

        // Utf8 -> LargeUtf8: same check in the opposite direction.
        let small: ArrayRef = Arc::new(StringArray::from(data));
        let widened = cast(&small, &DataType::LargeUtf8).unwrap();
        let expect: Vec<_> = small
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap()
            .into_iter()
            .collect();
        let out: Vec<_> = widened
            .as_any()
            .downcast_ref::<LargeStringArray>()
            .unwrap()
            .into_iter()
            .collect();
        assert_eq!(expect, out);
    }
}
#[test]
fn test_cast_from_f64() {
let f64_values: Vec<f64> = vec![
std::i64::MIN as f64,
std::i32::MIN as f64,
std::i16::MIN as f64,
std::i8::MIN as f64,
0_f64,
std::u8::MAX as f64,
std::u16::MAX as f64,
std::u32::MAX as f64,
std::u64::MAX as f64,
];
let f64_array: ArrayRef = Arc::new(Float64Array::from(f64_values));
let f64_expected = vec![
"-9223372036854776000.0",
"-2147483648.0",
"-32768.0",
"-128.0",
"0.0",
"255.0",
"65535.0",
"4294967295.0",
"18446744073709552000.0",
];
assert_eq!(
f64_expected,
get_cast_values::<Float64Type>(&f64_array, &DataType::Float64)
);
let f32_expected = vec![
"-9223372000000000000.0",
"-2147483600.0",
"-32768.0",
"-128.0",
"0.0",
"255.0",
"65535.0",
"4294967300.0",
"18446744000000000000.0",
];
assert_eq!(
f32_expected,
get_cast_values::<Float32Type>(&f64_array, &DataType::Float32)
);
let i64_expected = vec![
"-9223372036854775808",
"-2147483648",
"-32768",
"-128",
"0",
"255",
"65535",