blob: 59ee527e5f8c4c0d3f1c97822370f9ebe81c69f9 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::any::Any;
use std::convert::{From, TryFrom};
use std::fmt;
use std::iter::IntoIterator;
use std::mem;
use super::{make_array, Array, ArrayData, ArrayRef};
use crate::datatypes::DataType;
use crate::error::{ArrowError, Result};
use crate::{
buffer::{buffer_bin_or, Buffer},
datatypes::Field,
};
/// A nested array type where each child (called *field*) is represented by a separate
/// array.
pub struct StructArray {
data: ArrayData,
pub(crate) boxed_fields: Vec<ArrayRef>,
}
impl StructArray {
/// Returns the field at `pos`.
pub fn column(&self, pos: usize) -> &ArrayRef {
&self.boxed_fields[pos]
}
/// Return the number of fields in this struct array
pub fn num_columns(&self) -> usize {
self.boxed_fields.len()
}
/// Returns the fields of the struct array
pub fn columns(&self) -> Vec<&ArrayRef> {
self.boxed_fields.iter().collect()
}
/// Returns child array refs of the struct array
pub fn columns_ref(&self) -> Vec<ArrayRef> {
self.boxed_fields.clone()
}
/// Return field names in this struct array
pub fn column_names(&self) -> Vec<&str> {
match self.data.data_type() {
DataType::Struct(fields) => fields
.iter()
.map(|f| f.name().as_str())
.collect::<Vec<&str>>(),
_ => unreachable!("Struct array's data type is not struct!"),
}
}
/// Return child array whose field name equals to column_name
///
/// Note: A schema can currently have duplicate field names, in which case
/// the first field will always be selected.
/// This issue will be addressed in [ARROW-11178](https://issues.apache.org/jira/browse/ARROW-11178)
pub fn column_by_name(&self, column_name: &str) -> Option<&ArrayRef> {
self.column_names()
.iter()
.position(|c| c == &column_name)
.map(|pos| self.column(pos))
}
}
impl From<ArrayData> for StructArray {
fn from(data: ArrayData) -> Self {
let mut boxed_fields = vec![];
for cd in data.child_data() {
let child_data = if data.offset() != 0 || data.len() != cd.len() {
cd.slice(data.offset(), data.len())
} else {
cd.clone()
};
boxed_fields.push(make_array(child_data));
}
Self { data, boxed_fields }
}
}
impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
type Error = ArrowError;
/// builds a StructArray from a vector of names and arrays.
/// This errors if the values have a different length.
/// An entry is set to Null when all values are null.
fn try_from(values: Vec<(&str, ArrayRef)>) -> Result<Self> {
let values_len = values.len();
// these will be populated
let mut fields = Vec::with_capacity(values_len);
let mut child_data = Vec::with_capacity(values_len);
// len: the size of the arrays.
let mut len: Option<usize> = None;
// null: the null mask of the arrays.
let mut null: Option<Buffer> = None;
for (field_name, array) in values {
let child_datum = array.data();
let child_datum_len = child_datum.len();
if let Some(len) = len {
if len != child_datum_len {
return Err(ArrowError::InvalidArgumentError(
format!("Array of field \"{}\" has length {}, but previous elements have length {}.
All arrays in every entry in a struct array must have the same length.", field_name, child_datum_len, len)
));
}
} else {
len = Some(child_datum_len)
}
child_data.push(child_datum.clone());
fields.push(Field::new(
field_name,
array.data_type().clone(),
child_datum.null_buffer().is_some(),
));
if let Some(child_null_buffer) = child_datum.null_buffer() {
let child_datum_offset = child_datum.offset();
null = Some(if let Some(null_buffer) = &null {
buffer_bin_or(
null_buffer,
0,
child_null_buffer,
child_datum_offset,
child_datum_len,
)
} else {
child_null_buffer.bit_slice(child_datum_offset, child_datum_len)
});
} else if null.is_some() {
// when one of the fields has no nulls, them there is no null in the array
null = None;
}
}
let len = len.unwrap();
let mut builder = ArrayData::builder(DataType::Struct(fields))
.len(len)
.child_data(child_data);
if let Some(null_buffer) = null {
builder = builder.null_bit_buffer(null_buffer);
}
Ok(StructArray::from(builder.build()))
}
}
impl Array for StructArray {
fn as_any(&self) -> &Any {
self
}
fn data(&self) -> &ArrayData {
&self.data
}
/// Returns the length (i.e., number of elements) of this array
fn len(&self) -> usize {
self.data_ref().len()
}
/// Returns the total number of bytes of memory occupied by the buffers owned by this [StructArray].
fn get_buffer_memory_size(&self) -> usize {
self.data.get_buffer_memory_size()
}
/// Returns the total number of bytes of memory occupied physically by this [StructArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size() + mem::size_of_val(self)
}
}
impl From<Vec<(Field, ArrayRef)>> for StructArray {
fn from(v: Vec<(Field, ArrayRef)>) -> Self {
let (field_types, field_values): (Vec<_>, Vec<_>) = v.into_iter().unzip();
// Check the length of the child arrays
let length = field_values[0].len();
for i in 1..field_values.len() {
assert_eq!(
length,
field_values[i].len(),
"all child arrays of a StructArray must have the same length"
);
assert_eq!(
field_types[i].data_type(),
field_values[i].data().data_type(),
"the field data types must match the array data in a StructArray"
)
}
let data = ArrayData::builder(DataType::Struct(field_types))
.child_data(field_values.into_iter().map(|a| a.data().clone()).collect())
.len(length)
.build();
Self::from(data)
}
}
impl fmt::Debug for StructArray {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "StructArray\n[\n")?;
for (child_index, name) in self.column_names().iter().enumerate() {
let column = self.column(child_index);
writeln!(
f,
"-- child {}: \"{}\" ({:?})",
child_index,
name,
column.data_type()
)?;
fmt::Debug::fmt(column, f)?;
writeln!(f)?;
}
write!(f, "]")
}
}
impl From<(Vec<(Field, ArrayRef)>, Buffer)> for StructArray {
fn from(pair: (Vec<(Field, ArrayRef)>, Buffer)) -> Self {
let (field_types, field_values): (Vec<_>, Vec<_>) = pair.0.into_iter().unzip();
// Check the length of the child arrays
let length = field_values[0].len();
for i in 1..field_values.len() {
assert_eq!(
length,
field_values[i].len(),
"all child arrays of a StructArray must have the same length"
);
assert_eq!(
field_types[i].data_type(),
field_values[i].data().data_type(),
"the field data types must match the array data in a StructArray"
)
}
let data = ArrayData::builder(DataType::Struct(field_types))
.null_bit_buffer(pair.1)
.child_data(field_values.into_iter().map(|a| a.data().clone()).collect())
.len(length)
.build();
Self::from(data)
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Arc;
use crate::{
array::BooleanArray, array::Float32Array, array::Float64Array, array::Int32Array,
array::StringArray, bitmap::Bitmap,
};
use crate::{
array::Int64Array,
datatypes::{DataType, Field},
};
use crate::{buffer::Buffer, datatypes::ToByteSlice};
#[test]
fn test_struct_array_builder() {
let array = BooleanArray::from(vec![false, false, true, true]);
let boolean_data = array.data();
let array = Int64Array::from(vec![42, 28, 19, 31]);
let int_data = array.data();
let fields = vec![
Field::new("a", DataType::Boolean, false),
Field::new("b", DataType::Int64, false),
];
let struct_array_data = ArrayData::builder(DataType::Struct(fields))
.len(4)
.add_child_data(boolean_data.clone())
.add_child_data(int_data.clone())
.build();
let struct_array = StructArray::from(struct_array_data);
assert_eq!(boolean_data, struct_array.column(0).data());
assert_eq!(int_data, struct_array.column(1).data());
}
#[test]
fn test_struct_array_from() {
let boolean = Arc::new(BooleanArray::from(vec![false, false, true, true]));
let int = Arc::new(Int32Array::from(vec![42, 28, 19, 31]));
let struct_array = StructArray::from(vec![
(
Field::new("b", DataType::Boolean, false),
boolean.clone() as ArrayRef,
),
(
Field::new("c", DataType::Int32, false),
int.clone() as ArrayRef,
),
]);
assert_eq!(struct_array.column(0).as_ref(), boolean.as_ref());
assert_eq!(struct_array.column(1).as_ref(), int.as_ref());
assert_eq!(4, struct_array.len());
assert_eq!(0, struct_array.null_count());
assert_eq!(0, struct_array.offset());
}
/// validates that the in-memory representation follows [the spec](https://arrow.apache.org/docs/format/Columnar.html#struct-layout)
#[test]
fn test_struct_array_from_vec() {
let strings: ArrayRef = Arc::new(StringArray::from(vec![
Some("joe"),
None,
None,
Some("mark"),
]));
let ints: ArrayRef =
Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));
let arr =
StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())])
.unwrap();
let struct_data = arr.data();
assert_eq!(4, struct_data.len());
assert_eq!(1, struct_data.null_count());
assert_eq!(
// 00001011
&Some(Bitmap::from(Buffer::from(&[11_u8]))),
struct_data.null_bitmap()
);
let expected_string_data = ArrayData::builder(DataType::Utf8)
.len(4)
.null_bit_buffer(Buffer::from(&[9_u8]))
.add_buffer(Buffer::from(&[0, 3, 3, 3, 7].to_byte_slice()))
.add_buffer(Buffer::from(b"joemark"))
.build();
let expected_int_data = ArrayData::builder(DataType::Int32)
.len(4)
.null_bit_buffer(Buffer::from(&[11_u8]))
.add_buffer(Buffer::from(&[1, 2, 0, 4].to_byte_slice()))
.build();
assert_eq!(&expected_string_data, arr.column(0).data());
// TODO: implement equality for ArrayData
assert_eq!(expected_int_data.len(), arr.column(1).data().len());
assert_eq!(
expected_int_data.null_count(),
arr.column(1).data().null_count()
);
assert_eq!(
expected_int_data.null_bitmap(),
arr.column(1).data().null_bitmap()
);
let expected_value_buf = expected_int_data.buffers()[0].clone();
let actual_value_buf = arr.column(1).data().buffers()[0].clone();
for i in 0..expected_int_data.len() {
if !expected_int_data.is_null(i) {
assert_eq!(
expected_value_buf.as_slice()[i * 4..(i + 1) * 4],
actual_value_buf.as_slice()[i * 4..(i + 1) * 4]
);
}
}
}
#[test]
fn test_struct_array_from_vec_error() {
let strings: ArrayRef = Arc::new(StringArray::from(vec![
Some("joe"),
None,
None,
// 3 elements, not 4
]));
let ints: ArrayRef =
Arc::new(Int32Array::from(vec![Some(1), Some(2), None, Some(4)]));
let arr =
StructArray::try_from(vec![("f1", strings.clone()), ("f2", ints.clone())]);
match arr {
Err(ArrowError::InvalidArgumentError(e)) => {
assert!(e.starts_with("Array of field \"f2\" has length 4, but previous elements have length 3."));
}
_ => panic!("This test got an unexpected error type"),
};
}
#[test]
#[should_panic(
expected = "the field data types must match the array data in a StructArray"
)]
fn test_struct_array_from_mismatched_types() {
StructArray::from(vec![
(
Field::new("b", DataType::Int16, false),
Arc::new(BooleanArray::from(vec![false, false, true, true]))
as Arc<Array>,
),
(
Field::new("c", DataType::Utf8, false),
Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
),
]);
}
#[test]
fn test_struct_array_slice() {
let boolean_data = ArrayData::builder(DataType::Boolean)
.len(5)
.add_buffer(Buffer::from([0b00010000]))
.null_bit_buffer(Buffer::from([0b00010001]))
.build();
let int_data = ArrayData::builder(DataType::Int32)
.len(5)
.add_buffer(Buffer::from([0, 28, 42, 0, 0].to_byte_slice()))
.null_bit_buffer(Buffer::from([0b00000110]))
.build();
let mut field_types = vec![];
field_types.push(Field::new("a", DataType::Boolean, false));
field_types.push(Field::new("b", DataType::Int32, false));
let struct_array_data = ArrayData::builder(DataType::Struct(field_types))
.len(5)
.add_child_data(boolean_data.clone())
.add_child_data(int_data.clone())
.null_bit_buffer(Buffer::from([0b00010111]))
.build();
let struct_array = StructArray::from(struct_array_data);
assert_eq!(5, struct_array.len());
assert_eq!(1, struct_array.null_count());
assert!(struct_array.is_valid(0));
assert!(struct_array.is_valid(1));
assert!(struct_array.is_valid(2));
assert!(struct_array.is_null(3));
assert!(struct_array.is_valid(4));
assert_eq!(&boolean_data, struct_array.column(0).data());
assert_eq!(&int_data, struct_array.column(1).data());
let c0 = struct_array.column(0);
let c0 = c0.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(5, c0.len());
assert_eq!(3, c0.null_count());
assert!(c0.is_valid(0));
assert_eq!(false, c0.value(0));
assert!(c0.is_null(1));
assert!(c0.is_null(2));
assert!(c0.is_null(3));
assert!(c0.is_valid(4));
assert_eq!(true, c0.value(4));
let c1 = struct_array.column(1);
let c1 = c1.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(5, c1.len());
assert_eq!(3, c1.null_count());
assert!(c1.is_null(0));
assert!(c1.is_valid(1));
assert_eq!(28, c1.value(1));
assert!(c1.is_valid(2));
assert_eq!(42, c1.value(2));
assert!(c1.is_null(3));
assert!(c1.is_null(4));
let sliced_array = struct_array.slice(2, 3);
let sliced_array = sliced_array.as_any().downcast_ref::<StructArray>().unwrap();
assert_eq!(3, sliced_array.len());
assert_eq!(2, sliced_array.offset());
assert_eq!(1, sliced_array.null_count());
assert!(sliced_array.is_valid(0));
assert!(sliced_array.is_null(1));
assert!(sliced_array.is_valid(2));
let sliced_c0 = sliced_array.column(0);
let sliced_c0 = sliced_c0.as_any().downcast_ref::<BooleanArray>().unwrap();
assert_eq!(3, sliced_c0.len());
assert_eq!(2, sliced_c0.offset());
assert!(sliced_c0.is_null(0));
assert!(sliced_c0.is_null(1));
assert!(sliced_c0.is_valid(2));
assert_eq!(true, sliced_c0.value(2));
let sliced_c1 = sliced_array.column(1);
let sliced_c1 = sliced_c1.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(3, sliced_c1.len());
assert_eq!(2, sliced_c1.offset());
assert!(sliced_c1.is_valid(0));
assert_eq!(42, sliced_c1.value(0));
assert!(sliced_c1.is_null(1));
assert!(sliced_c1.is_null(2));
}
#[test]
#[should_panic(
expected = "all child arrays of a StructArray must have the same length"
)]
fn test_invalid_struct_child_array_lengths() {
StructArray::from(vec![
(
Field::new("b", DataType::Float32, false),
Arc::new(Float32Array::from(vec![1.1])) as Arc<Array>,
),
(
Field::new("c", DataType::Float64, false),
Arc::new(Float64Array::from(vec![2.2, 3.3])),
),
]);
}
}