blob: a21ad5bbbcc302c1d2092b625c72552ea6ed6f9c [file]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! [`ColumnarValue`] represents the result of evaluating an expression.
use arrow::array::{Array, ArrayRef, NullArray};
use arrow::compute::{kernels, CastOptions};
use arrow::datatypes::DataType;
use arrow::util::pretty::pretty_format_columns;
use datafusion_common::format::DEFAULT_CAST_OPTIONS;
use datafusion_common::{internal_err, Result, ScalarValue};
use std::fmt;
use std::sync::Arc;
/// The result of evaluating an expression.
///
/// [`ColumnarValue::Scalar`] represents a single value repeated any number of
/// times. This is an important performance optimization for handling values
/// that do not change across rows.
///
/// [`ColumnarValue::Array`] represents a column of data, stored as an Arrow
/// [`ArrayRef`]
///
/// A slice of `ColumnarValue`s logically represents a table, with each column
/// having the same number of rows. This means that all `Array`s are the same
/// length.
///
/// # Example
///
/// A `ColumnarValue::Array` with an array of 5 elements and a
/// `ColumnarValue::Scalar` with the value 100
///
/// ```text
/// ┌──────────────┐
/// │ ┌──────────┐ │
/// │ │ "A" │ │
/// │ ├──────────┤ │
/// │ │ "B" │ │
/// │ ├──────────┤ │
/// │ │ "C" │ │
/// │ ├──────────┤ │
/// │ │ "D" │ │ ┌──────────────┐
/// │ ├──────────┤ │ │ ┌──────────┐ │
/// │ │ "E" │ │ │ │ 100 │ │
/// │ └──────────┘ │ │ └──────────┘ │
/// └──────────────┘ └──────────────┘
///
/// ColumnarValue:: ColumnarValue::
/// Array Scalar
/// ```
///
/// Logically represents the following table:
///
/// | Column 1| Column 2 |
/// | ------- | -------- |
/// | A | 100 |
/// | B | 100 |
/// | C | 100 |
/// | D | 100 |
/// | E | 100 |
///
/// # Performance Notes
///
/// When implementing functions or operators, it is important to consider the
/// performance implications of handling scalar values.
///
/// Because all functions must handle [`ArrayRef`], it is
/// convenient to convert [`ColumnarValue::Scalar`]s using
/// [`Self::into_array`]. For example, [`ColumnarValue::values_to_arrays`]
/// converts multiple columnar values into arrays of the same length.
///
/// However, it is often much more performant to provide a different,
/// implementation that handles scalar values differently
#[derive(Clone, Debug)]
pub enum ColumnarValue {
/// Array of values
Array(ArrayRef),
/// A single value
Scalar(ScalarValue),
}
impl From<ArrayRef> for ColumnarValue {
fn from(value: ArrayRef) -> Self {
ColumnarValue::Array(value)
}
}
impl From<ScalarValue> for ColumnarValue {
fn from(value: ScalarValue) -> Self {
ColumnarValue::Scalar(value)
}
}
impl ColumnarValue {
pub fn data_type(&self) -> DataType {
match self {
ColumnarValue::Array(array_value) => array_value.data_type().clone(),
ColumnarValue::Scalar(scalar_value) => scalar_value.data_type(),
}
}
/// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
/// number of rows. [`Self::Scalar`] is converted by repeating the same
/// scalar multiple times which is not as efficient as handling the scalar
/// directly.
///
/// See [`Self::values_to_arrays`] to convert multiple columnar values into
/// arrays of the same length.
///
/// # Errors
///
/// Errors if `self` is a Scalar that fails to be converted into an array of size
pub fn into_array(self, num_rows: usize) -> Result<ArrayRef> {
Ok(match self {
ColumnarValue::Array(array) => array,
ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
})
}
/// Convert a columnar value into an Arrow [`ArrayRef`] with the specified
/// number of rows. [`Self::Scalar`] is converted by repeating the same
/// scalar multiple times which is not as efficient as handling the scalar
/// directly.
///
/// See [`Self::values_to_arrays`] to convert multiple columnar values into
/// arrays of the same length.
///
/// # Errors
///
/// Errors if `self` is a Scalar that fails to be converted into an array of size
pub fn to_array(&self, num_rows: usize) -> Result<ArrayRef> {
Ok(match self {
ColumnarValue::Array(array) => Arc::clone(array),
ColumnarValue::Scalar(scalar) => scalar.to_array_of_size(num_rows)?,
})
}
/// Null columnar values are implemented as a null array in order to pass batch
/// num_rows
pub fn create_null_array(num_rows: usize) -> Self {
ColumnarValue::Array(Arc::new(NullArray::new(num_rows)))
}
/// Converts [`ColumnarValue`]s to [`ArrayRef`]s with the same length.
///
/// # Performance Note
///
/// This function expands any [`ScalarValue`] to an array. This expansion
/// permits using a single function in terms of arrays, but it can be
/// inefficient compared to handling the scalar value directly.
///
/// Thus, it is recommended to provide specialized implementations for
/// scalar values if performance is a concern.
///
/// # Errors
///
/// If there are multiple array arguments that have different lengths
pub fn values_to_arrays(args: &[ColumnarValue]) -> Result<Vec<ArrayRef>> {
if args.is_empty() {
return Ok(vec![]);
}
let mut array_len = None;
for arg in args {
array_len = match (arg, array_len) {
(ColumnarValue::Array(a), None) => Some(a.len()),
(ColumnarValue::Array(a), Some(array_len)) => {
if array_len == a.len() {
Some(array_len)
} else {
return internal_err!(
"Arguments has mixed length. Expected length: {array_len}, found length: {}", a.len()
);
}
}
(ColumnarValue::Scalar(_), array_len) => array_len,
}
}
// If array_len is none, it means there are only scalars, so make a 1 element array
let inferred_length = array_len.unwrap_or(1);
let args = args
.iter()
.map(|arg| arg.to_array(inferred_length))
.collect::<Result<Vec<_>>>()?;
Ok(args)
}
/// Cast's this [ColumnarValue] to the specified `DataType`
pub fn cast_to(
&self,
cast_type: &DataType,
cast_options: Option<&CastOptions<'static>>,
) -> Result<ColumnarValue> {
let cast_options = cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
match self {
ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
kernels::cast::cast_with_options(array, cast_type, &cast_options)?,
)),
ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
scalar.cast_to_with_options(cast_type, &cast_options)?,
)),
}
}
}
// Implement Display trait for ColumnarValue
impl fmt::Display for ColumnarValue {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let formatted = match self {
ColumnarValue::Array(array) => {
pretty_format_columns("ColumnarValue(ArrayRef)", &[Arc::clone(array)])
}
ColumnarValue::Scalar(_) => {
if let Ok(array) = self.to_array(1) {
pretty_format_columns("ColumnarValue(ScalarValue)", &[array])
} else {
return write!(f, "Error formatting columnar value");
}
}
};
if let Ok(formatted) = formatted {
write!(f, "{formatted}")
} else {
write!(f, "Error formatting columnar value")
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use arrow::array::Int32Array;
#[test]
fn values_to_arrays() {
// (input, expected)
let cases = vec![
// empty
TestCase {
input: vec![],
expected: vec![],
},
// one array of length 3
TestCase {
input: vec![ColumnarValue::Array(make_array(1, 3))],
expected: vec![make_array(1, 3)],
},
// two arrays length 3
TestCase {
input: vec![
ColumnarValue::Array(make_array(1, 3)),
ColumnarValue::Array(make_array(2, 3)),
],
expected: vec![make_array(1, 3), make_array(2, 3)],
},
// array and scalar
TestCase {
input: vec![
ColumnarValue::Array(make_array(1, 3)),
ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
],
expected: vec![
make_array(1, 3),
make_array(100, 3), // scalar is expanded
],
},
// scalar and array
TestCase {
input: vec![
ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
ColumnarValue::Array(make_array(1, 3)),
],
expected: vec![
make_array(100, 3), // scalar is expanded
make_array(1, 3),
],
},
// multiple scalars and array
TestCase {
input: vec![
ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
ColumnarValue::Array(make_array(1, 3)),
ColumnarValue::Scalar(ScalarValue::Int32(Some(200))),
],
expected: vec![
make_array(100, 3), // scalar is expanded
make_array(1, 3),
make_array(200, 3), // scalar is expanded
],
},
];
for case in cases {
case.run();
}
}
#[test]
#[should_panic(
expected = "Arguments has mixed length. Expected length: 3, found length: 4"
)]
fn values_to_arrays_mixed_length() {
ColumnarValue::values_to_arrays(&[
ColumnarValue::Array(make_array(1, 3)),
ColumnarValue::Array(make_array(2, 4)),
])
.unwrap();
}
#[test]
#[should_panic(
expected = "Arguments has mixed length. Expected length: 3, found length: 7"
)]
fn values_to_arrays_mixed_length_and_scalar() {
ColumnarValue::values_to_arrays(&[
ColumnarValue::Array(make_array(1, 3)),
ColumnarValue::Scalar(ScalarValue::Int32(Some(100))),
ColumnarValue::Array(make_array(2, 7)),
])
.unwrap();
}
struct TestCase {
input: Vec<ColumnarValue>,
expected: Vec<ArrayRef>,
}
impl TestCase {
fn run(self) {
let Self { input, expected } = self;
assert_eq!(
ColumnarValue::values_to_arrays(&input).unwrap(),
expected,
"\ninput: {input:?}\nexpected: {expected:?}"
);
}
}
/// Makes an array of length `len` with all elements set to `val`
fn make_array(val: i32, len: usize) -> ArrayRef {
Arc::new(Int32Array::from(vec![val; len]))
}
#[test]
fn test_display_scalar() {
let column = ColumnarValue::from(ScalarValue::from("foo"));
assert_eq!(
column.to_string(),
concat!(
"+----------------------------+\n",
"| ColumnarValue(ScalarValue) |\n",
"+----------------------------+\n",
"| foo |\n",
"+----------------------------+"
)
);
}
#[test]
fn test_display_array() {
let array: ArrayRef = Arc::new(Int32Array::from_iter_values(vec![1, 2, 3]));
let column = ColumnarValue::from(array);
assert_eq!(
column.to_string(),
concat!(
"+-------------------------+\n",
"| ColumnarValue(ArrayRef) |\n",
"+-------------------------+\n",
"| 1 |\n",
"| 2 |\n",
"| 3 |\n",
"+-------------------------+"
)
);
}
}