| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow arrays.
| |
| use crate::datatypes::{ |
| validate_decimal256_precision, validate_decimal_precision, DataType, IntervalUnit, |
| UnionMode, |
| }; |
| use crate::error::{ArrowError, Result}; |
| use crate::util::bit_iterator::BitSliceIterator; |
| use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; |
| use crate::{ |
| buffer::{Buffer, MutableBuffer}, |
| util::bit_util, |
| }; |
| use half::f16; |
| use num::BigInt; |
| use std::convert::TryInto; |
| use std::mem; |
| use std::ops::Range; |
| use std::sync::Arc; |
| |
| use super::equal::equal; |
| |
| #[inline] |
| pub(crate) fn contains_nulls( |
| null_bit_buffer: Option<&Buffer>, |
| offset: usize, |
| len: usize, |
| ) -> bool { |
| match null_bit_buffer { |
| Some(buffer) => match BitSliceIterator::new(buffer, offset, len).next() { |
| Some((start, end)) => start != 0 || end != len, |
| None => len != 0, // No non-null values |
| }, |
| None => false, // No null buffer |
| } |
| } |
| |
| #[inline] |
| pub(crate) fn count_nulls( |
| null_bit_buffer: Option<&Buffer>, |
| offset: usize, |
| len: usize, |
| ) -> usize { |
| if let Some(buf) = null_bit_buffer { |
| len - buf.count_set_bits_offset(offset, len) |
| } else { |
| 0 |
| } |
| } |
| |
| /// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots). |
| #[inline] |
| pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] { |
| let empty_buffer = MutableBuffer::new(0); |
| match data_type { |
| DataType::Null => [empty_buffer, MutableBuffer::new(0)], |
| DataType::Boolean => { |
| let bytes = bit_util::ceil(capacity, 8); |
| let buffer = MutableBuffer::new(bytes); |
| [buffer, empty_buffer] |
| } |
| DataType::UInt8 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u8>()), |
| empty_buffer, |
| ], |
| DataType::UInt16 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u16>()), |
| empty_buffer, |
| ], |
| DataType::UInt32 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u32>()), |
| empty_buffer, |
| ], |
| DataType::UInt64 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u64>()), |
| empty_buffer, |
| ], |
| DataType::Int8 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i8>()), |
| empty_buffer, |
| ], |
| DataType::Int16 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i16>()), |
| empty_buffer, |
| ], |
| DataType::Int32 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i32>()), |
| empty_buffer, |
| ], |
| DataType::Int64 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i64>()), |
| empty_buffer, |
| ], |
| DataType::Float16 => [ |
| MutableBuffer::new(capacity * mem::size_of::<f16>()), |
| empty_buffer, |
| ], |
| DataType::Float32 => [ |
| MutableBuffer::new(capacity * mem::size_of::<f32>()), |
| empty_buffer, |
| ], |
| DataType::Float64 => [ |
| MutableBuffer::new(capacity * mem::size_of::<f64>()), |
| empty_buffer, |
| ], |
| DataType::Date32 | DataType::Time32(_) => [ |
| MutableBuffer::new(capacity * mem::size_of::<i32>()), |
| empty_buffer, |
| ], |
| DataType::Date64 |
| | DataType::Time64(_) |
| | DataType::Duration(_) |
| | DataType::Timestamp(_, _) => [ |
| MutableBuffer::new(capacity * mem::size_of::<i64>()), |
| empty_buffer, |
| ], |
| DataType::Interval(IntervalUnit::YearMonth) => [ |
| MutableBuffer::new(capacity * mem::size_of::<i32>()), |
| empty_buffer, |
| ], |
| DataType::Interval(IntervalUnit::DayTime) => [ |
| MutableBuffer::new(capacity * mem::size_of::<i64>()), |
| empty_buffer, |
| ], |
| DataType::Interval(IntervalUnit::MonthDayNano) => [ |
| MutableBuffer::new(capacity * mem::size_of::<i128>()), |
| empty_buffer, |
| ], |
| DataType::Utf8 | DataType::Binary => { |
| let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
| // safety: `unsafe` code assumes that this buffer is initialized with one element |
| buffer.push(0i32); |
| [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
| } |
| DataType::LargeUtf8 | DataType::LargeBinary => { |
| let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
| // safety: `unsafe` code assumes that this buffer is initialized with one element |
| buffer.push(0i64); |
| [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())] |
| } |
| DataType::List(_) | DataType::Map(_, _) => { |
| // offset buffer always starts with a zero |
| let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>()); |
| buffer.push(0i32); |
| [buffer, empty_buffer] |
| } |
| DataType::LargeList(_) => { |
| // offset buffer always starts with a zero |
| let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>()); |
| buffer.push(0i64); |
| [buffer, empty_buffer] |
| } |
| DataType::FixedSizeBinary(size) => { |
| [MutableBuffer::new(capacity * *size as usize), empty_buffer] |
| } |
| DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { |
| DataType::UInt8 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u8>()), |
| empty_buffer, |
| ], |
| DataType::UInt16 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u16>()), |
| empty_buffer, |
| ], |
| DataType::UInt32 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u32>()), |
| empty_buffer, |
| ], |
| DataType::UInt64 => [ |
| MutableBuffer::new(capacity * mem::size_of::<u64>()), |
| empty_buffer, |
| ], |
| DataType::Int8 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i8>()), |
| empty_buffer, |
| ], |
| DataType::Int16 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i16>()), |
| empty_buffer, |
| ], |
| DataType::Int32 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i32>()), |
| empty_buffer, |
| ], |
| DataType::Int64 => [ |
| MutableBuffer::new(capacity * mem::size_of::<i64>()), |
| empty_buffer, |
| ], |
| _ => unreachable!(), |
| }, |
| DataType::FixedSizeList(_, _) | DataType::Struct(_) => { |
| [empty_buffer, MutableBuffer::new(0)] |
| } |
| DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => [ |
| MutableBuffer::new(capacity * mem::size_of::<u8>()), |
| empty_buffer, |
| ], |
| DataType::Union(_, _, mode) => { |
| let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>()); |
| match mode { |
| UnionMode::Sparse => [type_ids, empty_buffer], |
| UnionMode::Dense => { |
| let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>()); |
| [type_ids, offsets] |
| } |
| } |
| } |
| } |
| } |
| |
| /// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`. |
| #[inline] |
| pub(crate) fn into_buffers( |
| data_type: &DataType, |
| buffer1: MutableBuffer, |
| buffer2: MutableBuffer, |
| ) -> Vec<Buffer> { |
| match data_type { |
| DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => vec![], |
| DataType::Utf8 |
| | DataType::Binary |
| | DataType::LargeUtf8 |
| | DataType::LargeBinary => vec![buffer1.into(), buffer2.into()], |
| DataType::Union(_, _, mode) => { |
| match mode { |
| // Based on Union's DataTypeLayout |
| UnionMode::Sparse => vec![buffer1.into()], |
| UnionMode::Dense => vec![buffer1.into(), buffer2.into()], |
| } |
| } |
| _ => vec![buffer1.into()], |
| } |
| } |
| |
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow arrays. Specific operations for different array types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type for this array data
    data_type: DataType,

    /// The number of elements in this array data
    len: usize,

    /// The number of null elements in this array data
    /// (0 whenever `null_bitmap` is `None`)
    null_count: usize,

    /// The offset into this array data, in number of items
    /// (not bytes; byte offsets depend on the value width)
    offset: usize,

    /// The buffers for this array data. Note that depending on the array types, this
    /// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
    /// at different positions.
    buffers: Vec<Buffer>,

    /// The child(ren) of this array. Only non-empty for nested types, currently
    /// `ListArray` and `StructArray`.
    child_data: Vec<ArrayData>,

    /// The null bitmap. A `None` value for this indicates all values are non-null in
    /// this array.
    null_bitmap: Option<Bitmap>,
}
| |
/// A thread-safe, shared reference to [`ArrayData`]
pub type ArrayDataRef = Arc<ArrayData>;
| |
| impl ArrayData { |
| /// Create a new ArrayData instance; |
| /// |
| /// If `null_count` is not specified, the number of nulls in |
| /// null_bit_buffer is calculated. |
| /// |
| /// If the number of nulls is 0 then the null_bit_buffer |
| /// is set to `None`. |
| /// |
| /// # Safety |
| /// |
| /// The input values *must* form a valid Arrow array for |
| /// `data_type`, or undefined behavior can results. |
| /// |
| /// Note: This is a low level API and most users of the arrow |
| /// crate should create arrays using the methods in the `array` |
| /// module. |
| #[allow(clippy::let_and_return)] |
| pub unsafe fn new_unchecked( |
| data_type: DataType, |
| len: usize, |
| null_count: Option<usize>, |
| null_bit_buffer: Option<Buffer>, |
| offset: usize, |
| buffers: Vec<Buffer>, |
| child_data: Vec<ArrayData>, |
| ) -> Self { |
| let null_count = match null_count { |
| None => count_nulls(null_bit_buffer.as_ref(), offset, len), |
| Some(null_count) => null_count, |
| }; |
| let null_bitmap = null_bit_buffer.filter(|_| null_count > 0).map(Bitmap::from); |
| let new_self = Self { |
| data_type, |
| len, |
| null_count, |
| offset, |
| buffers, |
| child_data, |
| null_bitmap, |
| }; |
| |
| // Provide a force_validate mode |
| #[cfg(feature = "force_validate")] |
| new_self.validate_full().unwrap(); |
| new_self |
| } |
| |
| /// Create a new ArrayData, validating that the provided buffers |
| /// form a valid Arrow array of the specified data type. |
| /// |
| /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer |
| /// is set to `None`. |
| /// |
| /// Note: This is a low level API and most users of the arrow |
| /// crate should create arrays using the methods in the `array` |
| /// module. |
| pub fn try_new( |
| data_type: DataType, |
| len: usize, |
| null_bit_buffer: Option<Buffer>, |
| offset: usize, |
| buffers: Vec<Buffer>, |
| child_data: Vec<ArrayData>, |
| ) -> Result<Self> { |
| // we must check the length of `null_bit_buffer` first |
| // because we use this buffer to calculate `null_count` |
| // in `Self::new_unchecked`. |
| if let Some(null_bit_buffer) = null_bit_buffer.as_ref() { |
| let needed_len = bit_util::ceil(len + offset, 8); |
| if null_bit_buffer.len() < needed_len { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "null_bit_buffer size too small. got {} needed {}", |
| null_bit_buffer.len(), |
| needed_len |
| ))); |
| } |
| } |
| // Safety justification: `validate_full` is called below |
| let new_self = unsafe { |
| Self::new_unchecked( |
| data_type, |
| len, |
| None, |
| null_bit_buffer, |
| offset, |
| buffers, |
| child_data, |
| ) |
| }; |
| |
| // As the data is not trusted, do a full validation of its contents |
| new_self.validate_full()?; |
| Ok(new_self) |
| } |
| |
    /// Returns a builder to construct a `ArrayData` instance.
    /// This is the recommended way to incrementally assemble an `ArrayData`.
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
| |
    /// Returns a reference to the data type of this array data
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
| |
| /// Updates the [DataType] of this ArrayData/ |
| /// |
| /// panic's if the new DataType is not compatible with the |
| /// existing type. |
| /// |
| /// Note: currently only changing a [DataType::Decimal128]s or |
| /// [DataType::Decimal256]s precision and scale are supported |
| #[inline] |
| pub(crate) fn with_data_type(mut self, new_data_type: DataType) -> Self { |
| if matches!(self.data_type, DataType::Decimal128(_, _)) { |
| assert!( |
| matches!(new_data_type, DataType::Decimal128(_, _)), |
| "only 128-bit DecimalType is supported for new datatype" |
| ); |
| } else if matches!(self.data_type, DataType::Decimal256(_, _)) { |
| assert!( |
| matches!(new_data_type, DataType::Decimal256(_, _)), |
| "only 256-bit DecimalType is supported for new datatype" |
| ); |
| } else { |
| panic!("only DecimalType is supported.") |
| } |
| |
| self.data_type = new_data_type; |
| self |
| } |
| |
| /// Returns a slice of buffers for this array data |
| pub fn buffers(&self) -> &[Buffer] { |
| &self.buffers[..] |
| } |
| |
| /// Returns a slice of children data arrays |
| pub fn child_data(&self) -> &[ArrayData] { |
| &self.child_data[..] |
| } |
| |
| /// Returns whether the element at index `i` is null |
| pub fn is_null(&self, i: usize) -> bool { |
| if let Some(ref b) = self.null_bitmap { |
| return !b.is_set(self.offset + i); |
| } |
| false |
| } |
| |
    /// Returns a reference to the null bitmap of this array data,
    /// or `None` when the array contains no nulls
    #[inline]
    pub const fn null_bitmap(&self) -> Option<&Bitmap> {
        self.null_bitmap.as_ref()
    }
| |
| /// Returns a reference to the null buffer of this array data. |
| pub fn null_buffer(&self) -> Option<&Buffer> { |
| self.null_bitmap().as_ref().map(|b| b.buffer_ref()) |
| } |
| |
| /// Returns whether the element at index `i` is not null |
| pub fn is_valid(&self, i: usize) -> bool { |
| if let Some(ref b) = self.null_bitmap { |
| return b.is_set(self.offset + i); |
| } |
| true |
| } |
| |
    /// Returns the length (i.e., number of elements) of this array
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
| |
    /// Returns whether array data is empty (i.e. its `len` is 0)
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
| |
    /// Returns the offset of this array, in number of items (not bytes)
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
| |
    /// Returns the total number of nulls in this array
    #[inline]
    pub const fn null_count(&self) -> usize {
        self.null_count
    }
| |
| /// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData]. |
| pub fn get_buffer_memory_size(&self) -> usize { |
| let mut size = 0; |
| for buffer in &self.buffers { |
| size += buffer.capacity(); |
| } |
| if let Some(bitmap) = &self.null_bitmap { |
| size += bitmap.get_buffer_memory_size() |
| } |
| for child in &self.child_data { |
| size += child.get_buffer_memory_size(); |
| } |
| size |
| } |
| |
| /// Returns the total number of bytes of memory occupied physically by this [ArrayData]. |
| pub fn get_array_memory_size(&self) -> usize { |
| let mut size = mem::size_of_val(self); |
| |
| // Calculate rest of the fields top down which contain actual data |
| for buffer in &self.buffers { |
| size += mem::size_of::<Buffer>(); |
| size += buffer.capacity(); |
| } |
| if let Some(bitmap) = &self.null_bitmap { |
| // this includes the size of the bitmap struct itself, since it is stored directly in |
| // this struct we already counted those bytes in the size_of_val(self) above |
| size += bitmap.get_array_memory_size(); |
| size -= mem::size_of::<Bitmap>(); |
| } |
| for child in &self.child_data { |
| size += child.get_array_memory_size(); |
| } |
| |
| size |
| } |
| |
    /// Creates a zero-copy slice of itself. This creates a new [ArrayData]
    /// with a different offset, len and a shifted null bitmap.
    ///
    /// Buffers are shared (cloned `Buffer` handles), not copied; only the
    /// offset/len/null_count metadata changes.
    ///
    /// # Panics
    ///
    /// Panics if `offset + length > self.len()`.
    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
        assert!((offset + length) <= self.len());

        // Struct arrays are special-cased: their values live entirely in the
        // children, so the slice must be pushed down into each child.
        if let DataType::Struct(_) = self.data_type() {
            // Slice into children
            let new_offset = self.offset + offset;
            let new_data = ArrayData {
                data_type: self.data_type().clone(),
                len: length,
                // Recompute the null count for the new window
                null_count: count_nulls(self.null_buffer(), new_offset, length),
                offset: new_offset,
                buffers: self.buffers.clone(),
                // Slice child data, to propagate offsets down to them
                child_data: self
                    .child_data()
                    .iter()
                    .map(|data| data.slice(offset, length))
                    .collect(),
                null_bitmap: self.null_bitmap().cloned(),
            };

            new_data
        } else {
            let mut new_data = self.clone();

            new_data.len = length;
            // Offsets accumulate: the requested offset is relative to the
            // existing one
            new_data.offset = offset + self.offset;

            new_data.null_count =
                count_nulls(new_data.null_buffer(), new_data.offset, new_data.len);

            new_data
        }
    }
| |
| /// Returns the `buffer` as a slice of type `T` starting at self.offset |
| /// # Panics |
| /// This function panics if: |
| /// * the buffer is not byte-aligned with type T, or |
| /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable) |
| #[inline] |
| pub(crate) fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] { |
| let values = unsafe { self.buffers[buffer].as_slice().align_to::<T>() }; |
| if !values.0.is_empty() || !values.2.is_empty() { |
| panic!("The buffer is not byte-aligned with its interpretation") |
| }; |
| assert_ne!(self.data_type, DataType::Boolean); |
| &values.1[self.offset..] |
| } |
| |
| /// Returns a new empty [ArrayData] valid for `data_type`. |
| pub fn new_empty(data_type: &DataType) -> Self { |
| let buffers = new_buffers(data_type, 0); |
| let [buffer1, buffer2] = buffers; |
| let buffers = into_buffers(data_type, buffer1, buffer2); |
| |
| let child_data = match data_type { |
| DataType::Null |
| | DataType::Boolean |
| | DataType::UInt8 |
| | DataType::UInt16 |
| | DataType::UInt32 |
| | DataType::UInt64 |
| | DataType::Int8 |
| | DataType::Int16 |
| | DataType::Int32 |
| | DataType::Int64 |
| | DataType::Float16 |
| | DataType::Float32 |
| | DataType::Float64 |
| | DataType::Date32 |
| | DataType::Date64 |
| | DataType::Time32(_) |
| | DataType::Time64(_) |
| | DataType::Duration(_) |
| | DataType::Timestamp(_, _) |
| | DataType::Utf8 |
| | DataType::Binary |
| | DataType::LargeUtf8 |
| | DataType::LargeBinary |
| | DataType::Interval(_) |
| | DataType::FixedSizeBinary(_) |
| | DataType::Decimal128(_, _) |
| | DataType::Decimal256(_, _) => vec![], |
| DataType::List(field) => { |
| vec![Self::new_empty(field.data_type())] |
| } |
| DataType::FixedSizeList(field, _) => { |
| vec![Self::new_empty(field.data_type())] |
| } |
| DataType::LargeList(field) => { |
| vec![Self::new_empty(field.data_type())] |
| } |
| DataType::Struct(fields) => fields |
| .iter() |
| .map(|field| Self::new_empty(field.data_type())) |
| .collect(), |
| DataType::Map(field, _) => { |
| vec![Self::new_empty(field.data_type())] |
| } |
| DataType::Union(fields, _, _) => fields |
| .iter() |
| .map(|field| Self::new_empty(field.data_type())) |
| .collect(), |
| DataType::Dictionary(_, data_type) => { |
| vec![Self::new_empty(data_type)] |
| } |
| }; |
| |
| // Data was constructed correctly above |
| unsafe { |
| Self::new_unchecked( |
| data_type.clone(), |
| 0, |
| Some(0), |
| None, |
| 0, |
| buffers, |
| child_data, |
| ) |
| } |
| } |
| |
    /// "cheap" validation of an `ArrayData`. Ensures buffers are
    /// sufficiently sized to store `len` + `offset` total elements of
    /// `data_type` and performs other inexpensive consistency checks.
    ///
    /// This check is "cheap" in the sense that it does not validate the
    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
    /// are within the bounds of the values buffer).
    ///
    /// See [ArrayData::validate_full] to validate fully the offset content
    /// and the validity of utf8 data
    pub fn validate(&self) -> Result<()> {
        // Need at least this much space in each buffer
        let len_plus_offset = self.len + self.offset;

        // Check that the data layout conforms to the spec
        let layout = layout(&self.data_type);

        if !layout.can_contain_null_mask && self.null_bitmap.is_some() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Arrays of type {:?} cannot contain a null bitmask",
                self.data_type,
            )));
        }

        // The number of buffers must match the layout mandated by the spec
        if self.buffers.len() != layout.buffers.len() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Expected {} buffers in array of type {:?}, got {}",
                layout.buffers.len(),
                self.data_type,
                self.buffers.len(),
            )));
        }

        // Check each buffer against its spec-mandated minimum size
        for (i, (buffer, spec)) in
            self.buffers.iter().zip(layout.buffers.iter()).enumerate()
        {
            match spec {
                BufferSpec::FixedWidth { byte_width } => {
                    let min_buffer_size = len_plus_offset
                        .checked_mul(*byte_width)
                        .expect("integer overflow computing min buffer size");

                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }
                }
                BufferSpec::VariableWidth => {
                    // not cheap to validate (need to look at the
                    // data). Partially checked in validate_offsets
                    // called below. Can check with `validate_full`
                }
                BufferSpec::BitMap => {
                    // bit-packed: one bit per element
                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
                    if buffer.len() < min_buffer_size {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
                            min_buffer_size, i, self.data_type, buffer.len()
                        )));
                    }
                }
                BufferSpec::AlwaysNull => {
                    // Nothing to validate
                }
            }
        }

        // More nulls than elements is always inconsistent
        if self.null_count > self.len {
            return Err(ArrowError::InvalidArgumentError(format!(
                "null_count {} for an array exceeds length of {} elements",
                self.null_count, self.len
            )));
        }

        // check null bit buffer size
        if let Some(null_bit_map) = self.null_bitmap.as_ref() {
            let null_bit_buffer = null_bit_map.buffer_ref();
            let needed_len = bit_util::ceil(len_plus_offset, 8);
            if null_bit_buffer.len() < needed_len {
                return Err(ArrowError::InvalidArgumentError(format!(
                    "null_bit_buffer size too small. got {} needed {}",
                    null_bit_buffer.len(),
                    needed_len
                )));
            }
        } else if self.null_count > 0 {
            // A positive null count requires a bitmap to back it
            return Err(ArrowError::InvalidArgumentError(format!(
                "Array of type {} has {} nulls but no null bitmap",
                self.data_type, self.null_count
            )));
        }

        self.validate_child_data()?;

        // Additional Type specific checks
        match &self.data_type {
            DataType::Utf8 | DataType::Binary => {
                self.validate_offsets::<i32>(self.buffers[1].len())?;
            }
            DataType::LargeUtf8 | DataType::LargeBinary => {
                self.validate_offsets::<i64>(self.buffers[1].len())?;
            }
            DataType::Dictionary(key_type, _value_type) => {
                // At the moment, constructing a DictionaryArray will also check this
                if !DataType::is_dictionary_key_type(key_type) {
                    return Err(ArrowError::InvalidArgumentError(format!(
                        "Dictionary key type must be integer, but was {}",
                        key_type
                    )));
                }
            }
            _ => {}
        };

        Ok(())
    }
| |
| /// Returns a reference to the data in `buffer` as a typed slice |
| /// (typically `&[i32]` or `&[i64]`) after validating. The |
| /// returned slice is guaranteed to have at least `self.len + 1` |
| /// entries. |
| /// |
| /// For an empty array, the `buffer` can also be empty. |
| fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T]> { |
| // An empty list-like array can have 0 offsets |
| if self.len == 0 && self.buffers[0].is_empty() { |
| return Ok(&[]); |
| } |
| |
| self.typed_buffer(0, self.len + 1) |
| } |
| |
| /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating |
| fn typed_buffer<T: ArrowNativeType + num::Num>( |
| &self, |
| idx: usize, |
| len: usize, |
| ) -> Result<&[T]> { |
| let buffer = &self.buffers[idx]; |
| |
| let required_len = (len + self.offset) * std::mem::size_of::<T>(); |
| |
| if buffer.len() < required_len { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "Buffer {} of {} isn't large enough. Expected {} bytes got {}", |
| idx, |
| self.data_type, |
| required_len, |
| buffer.len() |
| ))); |
| } |
| |
| Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len]) |
| } |
| |
    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
    /// offsets (of type T) into some other buffer of `values_length` bytes long
    ///
    /// Only the first and last offsets are checked: each must be convertible
    /// to `usize`, be within `values_length`, and be ordered (first <= last).
    /// Intermediate offsets are checked by `validate_full`.
    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
        &self,
        values_length: usize,
    ) -> Result<()> {
        // Justification: buffer size was validated above
        let offsets = self.typed_offsets::<T>()?;
        if offsets.is_empty() {
            // An empty array has no offsets to check
            return Ok(());
        }

        let first_offset = offsets[0].to_usize().ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!(
                "Error converting offset[0] ({}) to usize for {}",
                offsets[0], self.data_type
            ))
        })?;

        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
            ArrowError::InvalidArgumentError(format!(
                "Error converting offset[{}] ({}) to usize for {}",
                self.len, offsets[self.len], self.data_type
            ))
        })?;

        if first_offset > values_length {
            return Err(ArrowError::InvalidArgumentError(format!(
                "First offset {} of {} is larger than values length {}",
                first_offset, self.data_type, values_length,
            )));
        }

        if last_offset > values_length {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Last offset {} of {} is larger than values length {}",
                last_offset, self.data_type, values_length,
            )));
        }

        if first_offset > last_offset {
            return Err(ArrowError::InvalidArgumentError(format!(
                "First offset {} in {} is smaller than last offset {}",
                first_offset, self.data_type, last_offset,
            )));
        }

        Ok(())
    }
| |
| /// Validates the layout of `child_data` ArrayData structures |
| fn validate_child_data(&self) -> Result<()> { |
| match &self.data_type { |
| DataType::List(field) | DataType::Map(field, _) => { |
| let values_data = self.get_single_valid_child_data(field.data_type())?; |
| self.validate_offsets::<i32>(values_data.len)?; |
| Ok(()) |
| } |
| DataType::LargeList(field) => { |
| let values_data = self.get_single_valid_child_data(field.data_type())?; |
| self.validate_offsets::<i64>(values_data.len)?; |
| Ok(()) |
| } |
| DataType::FixedSizeList(field, list_size) => { |
| let values_data = self.get_single_valid_child_data(field.data_type())?; |
| |
| let list_size: usize = (*list_size).try_into().map_err(|_| { |
| ArrowError::InvalidArgumentError(format!( |
| "{} has a negative list_size {}", |
| self.data_type, list_size |
| )) |
| })?; |
| |
| let expected_values_len = self.len |
| .checked_mul(list_size) |
| .expect("integer overflow computing expected number of expected values in FixedListSize"); |
| |
| if values_data.len < expected_values_len { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}", |
| values_data.len, list_size, list_size, self.data_type |
| ))); |
| } |
| |
| Ok(()) |
| } |
| DataType::Struct(fields) => { |
| self.validate_num_child_data(fields.len())?; |
| for (i, field) in fields.iter().enumerate() { |
| let field_data = self.get_valid_child_data(i, field.data_type())?; |
| |
| // Ensure child field has sufficient size |
| if field_data.len < self.len { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})", |
| self.data_type, i, field.name(), field_data.len, self.len |
| ))); |
| } |
| } |
| Ok(()) |
| } |
| DataType::Union(fields, _, mode) => { |
| self.validate_num_child_data(fields.len())?; |
| |
| for (i, field) in fields.iter().enumerate() { |
| let field_data = self.get_valid_child_data(i, field.data_type())?; |
| |
| if mode == &UnionMode::Sparse |
| && field_data.len < (self.len + self.offset) |
| { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "Sparse union child array #{} has length smaller than expected for union array ({} < {})", |
| i, field_data.len, self.len + self.offset |
| ))); |
| } |
| } |
| Ok(()) |
| } |
| DataType::Dictionary(_key_type, value_type) => { |
| self.get_single_valid_child_data(value_type)?; |
| Ok(()) |
| } |
| _ => { |
| // other types do not have child data |
| if !self.child_data.is_empty() { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "Expected no child arrays for type {} but got {}", |
| self.data_type, |
| self.child_data.len() |
| ))); |
| } |
| Ok(()) |
| } |
| } |
| } |
| |
| /// Ensures that this array data has a single child_data with the |
| /// expected type, and calls `validate()` on it. Returns a |
| /// reference to that child_data |
| fn get_single_valid_child_data( |
| &self, |
| expected_type: &DataType, |
| ) -> Result<&ArrayData> { |
| self.validate_num_child_data(1)?; |
| self.get_valid_child_data(0, expected_type) |
| } |
| |
| /// Returns `Err` if self.child_data does not have exactly `expected_len` elements |
| fn validate_num_child_data(&self, expected_len: usize) -> Result<()> { |
| if self.child_data().len() != expected_len { |
| Err(ArrowError::InvalidArgumentError(format!( |
| "Value data for {} should contain {} child data array(s), had {}", |
| self.data_type(), |
| expected_len, |
| self.child_data.len() |
| ))) |
| } else { |
| Ok(()) |
| } |
| } |
| |
    /// Ensures that `child_data[i]` has the expected type, calls
    /// `validate()` on it, and returns a reference to that child_data
    ///
    /// Returns an error if the child is missing or its [DataType]
    /// does not match `expected_type`.
    fn get_valid_child_data(
        &self,
        i: usize,
        expected_type: &DataType,
    ) -> Result<&ArrayData> {
        let values_data = self.child_data
            .get(i)
            .ok_or_else(|| {
                ArrowError::InvalidArgumentError(format!(
                    "{} did not have enough child arrays. Expected at least {} but had only {}",
                    self.data_type, i+1, self.child_data.len()
                ))
            })?;

        if expected_type != &values_data.data_type {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Child type mismatch for {}. Expected {} but child data had {}",
                self.data_type, expected_type, values_data.data_type
            )));
        }

        // Only "cheap" validation here; deep checks happen in `validate_full`
        values_data.validate()?;
        Ok(values_data)
    }
| |
    /// "expensive" validation that ensures:
    ///
    /// 1. Null count is correct
    /// 2. All offsets are valid
    /// 3. All String data is valid UTF-8
    /// 4. All dictionary offsets are valid
    ///
    /// Does not (yet) check
    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
    ///
    /// Note calls `validate()` internally
    pub fn validate_full(&self) -> Result<()> {
        // Check all buffer sizes prior to looking at them more deeply in this function
        self.validate()?;

        let null_bitmap_buffer = self
            .null_bitmap
            .as_ref()
            .map(|null_bitmap| null_bitmap.buffer_ref());

        // Recompute the null count from the bitmap and compare with the stored value
        let actual_null_count = count_nulls(null_bitmap_buffer, self.offset, self.len);
        if actual_null_count != self.null_count {
            return Err(ArrowError::InvalidArgumentError(format!(
                "null_count value ({}) doesn't match actual number of nulls in array ({})",
                self.null_count, actual_null_count
            )));
        }

        self.validate_values()?;

        // validate all children recursively
        self.child_data
            .iter()
            .enumerate()
            .try_for_each(|(i, child_data)| {
                // Prefix child errors with the parent type and child index for context
                child_data.validate_full().map_err(|e| {
                    ArrowError::InvalidArgumentError(format!(
                        "{} child #{} invalid: {}",
                        self.data_type, i, e
                    ))
                })
            })?;

        Ok(())
    }
| |
    /// Validates the values of this array against the invariants of
    /// `self.data_type`: decimal precision, string/binary/list offsets,
    /// and dictionary key bounds. Types with no per-value invariants
    /// pass trivially.
    pub fn validate_values(&self) -> Result<()> {
        match &self.data_type {
            DataType::Decimal128(p, _) => {
                // Each value is a native i128; check it fits in `p` decimal digits
                let values_buffer: &[i128] = self.typed_buffer(0, self.len)?;
                for value in values_buffer {
                    validate_decimal_precision(*value, *p)?;
                }
                Ok(())
            }
            DataType::Decimal256(p, _) => {
                // Values are stored as 32-byte little-endian signed integers
                let values = self.buffers()[0].as_slice();
                for pos in 0..self.len() {
                    let offset = pos * 32;
                    let raw_bytes = &values[offset..offset + 32];
                    let integer = BigInt::from_signed_bytes_le(raw_bytes);
                    validate_decimal256_precision(&integer, *p)?;
                }
                Ok(())
            }
            DataType::Utf8 => self.validate_utf8::<i32>(),
            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
            // Binary need not be valid UTF-8; only offset bounds are checked
            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
            DataType::LargeBinary => {
                self.validate_offsets_full::<i64>(self.buffers[1].len())
            }
            // List/Map offsets index into the child array, not a values buffer
            DataType::List(_) | DataType::Map(_, _) => {
                let child = &self.child_data[0];
                self.validate_offsets_full::<i32>(child.len)
            }
            DataType::LargeList(_) => {
                let child = &self.child_data[0];
                self.validate_offsets_full::<i64>(child.len)
            }
            DataType::Union(_, _, _) => {
                // Validate Union Array as part of implementing new Union semantics
                // See comments in `ArrayData::validate()`
                // https://github.com/apache/arrow-rs/issues/85
                //
                // TODO file follow on ticket for full union validation
                Ok(())
            }
            DataType::Dictionary(key_type, _value_type) => {
                // Every non-null key must be a valid index into the dictionary values
                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
                let max_value = dictionary_length - 1;
                match key_type.as_ref() {
                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
                    DataType::Int8 => self.check_bounds::<i8>(max_value),
                    DataType::Int16 => self.check_bounds::<i16>(max_value),
                    DataType::Int32 => self.check_bounds::<i32>(max_value),
                    DataType::Int64 => self.check_bounds::<i64>(max_value),
                    // non-integer key types are rejected earlier — presumably in
                    // `validate()` (see test_non_int_dictionary); confirm
                    _ => unreachable!(),
                }
            }
            _ => {
                // No extra validation check required for other types
                Ok(())
            }
        }
    }
| |
    /// Calls the `validate(item_index, range)` function for each of
    /// the ranges specified in the arrow offsets buffer of type
    /// `T`. Also validates that each offset is smaller than
    /// `offset_limit`
    ///
    /// For an empty array, the offsets buffer can either be empty
    /// or contain a single `0`.
    ///
    /// For example, the offsets buffer contained `[1, 2, 4]`, this
    /// function would call `validate([1,2])`, and `validate([2,4])`
    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<()>
    where
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
        V: Fn(usize, Range<usize>) -> Result<()>,
    {
        self.typed_offsets::<T>()?
            .iter()
            .enumerate()
            .map(|(i, x)| {
                // check if the offset can be converted to usize
                let r = x.to_usize().ok_or_else(|| {
                    ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: Could not convert offset {} to usize at position {}",
                        x, i))}
                );
                // check if the offset exceeds the limit
                match r {
                    Ok(n) if n <= offset_limit => Ok((i, n)),
                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: offset at position {} out of bounds: {} > {}",
                        i, x, offset_limit))
                    ),
                    Err(e) => Err(e),
                }
            })
            .scan(0_usize, |start, end| {
                // check offsets are monotonically increasing
                match end {
                    Ok((i, end)) if *start <= end => {
                        // emit the half-open range [previous offset, this offset)
                        let range = Some(Ok((i, *start..end)));
                        *start = end;
                        range
                    }
                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
                        i - 1, start, end))
                    )),
                    Err(err) => Some(Err(err)),
                }
            })
            .skip(1) // the first element is meaningless
            .try_for_each(|res: Result<(usize, Range<usize>)>| {
                let (item_index, range) = res?;
                // there is one more offset than there are items, so shift the index down
                validate(item_index-1, range)
            })
    }
| |
    /// Ensures that all strings formed by the offsets in `buffers[0]`
    /// into `buffers[1]` are valid utf8 sequences
    fn validate_utf8<T>(&self) -> Result<()>
    where
        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
    {
        let values_buffer = &self.buffers[1].as_slice();
        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
            // Fast path: the whole values buffer is valid UTF-8, so each
            // string is valid as long as its offsets fall on char boundaries
            // Validate Offsets are correct
            self.validate_each_offset::<T, _>(
                values_buffer.len(),
                |string_index, range| {
                    if !values_str.is_char_boundary(range.start)
                        || !values_str.is_char_boundary(range.end)
                    {
                        return Err(ArrowError::InvalidArgumentError(format!(
                            "incomplete utf-8 byte sequence from index {}",
                            string_index
                        )));
                    }
                    Ok(())
                },
            )
        } else {
            // Slow path: whole-buffer validation failed, so re-check each
            // string individually to report which offset is at fault
            // find specific offset that failed utf8 validation
            self.validate_each_offset::<T, _>(
                values_buffer.len(),
                |string_index, range| {
                    std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
                        ArrowError::InvalidArgumentError(format!(
                            "Invalid UTF8 sequence at string index {} ({:?}): {}",
                            string_index, range, e
                        ))
                    })?;
                    Ok(())
                },
            )
        }
    }
| |
| /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are |
| /// between `0` and `offset_limit` |
| fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<()> |
| where |
| T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display, |
| { |
| self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| { |
| // No validation applied to each value, but the iteration |
| // itself applies bounds checking to each range |
| Ok(()) |
| }) |
| } |
| |
| /// Validates that each value in self.buffers (typed as T) |
| /// is within the range [0, max_value], inclusive |
| fn check_bounds<T>(&self, max_value: i64) -> Result<()> |
| where |
| T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display, |
| { |
| let required_len = self.len + self.offset; |
| let buffer = &self.buffers[0]; |
| |
| // This should have been checked as part of `validate()` prior |
| // to calling `validate_full()` but double check to be sure |
| assert!(buffer.len() / std::mem::size_of::<T>() >= required_len); |
| |
| // Justification: buffer size was validated above |
| let indexes: &[T] = |
| &buffer.typed_data::<T>()[self.offset..self.offset + self.len]; |
| |
| indexes.iter().enumerate().try_for_each(|(i, &dict_index)| { |
| // Do not check the value is null (value can be arbitrary) |
| if self.is_null(i) { |
| return Ok(()); |
| } |
| let dict_index: i64 = dict_index.try_into().map_err(|_| { |
| ArrowError::InvalidArgumentError(format!( |
| "Value at position {} out of bounds: {} (can not convert to i64)", |
| i, dict_index |
| )) |
| })?; |
| |
| if dict_index < 0 || dict_index > max_value { |
| return Err(ArrowError::InvalidArgumentError(format!( |
| "Value at position {} out of bounds: {} (should be in [0, {}])", |
| i, dict_index, max_value |
| ))); |
| } |
| Ok(()) |
| }) |
| } |
| |
| /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons |
| /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may |
| /// return false when the arrays are logically equal |
| pub fn ptr_eq(&self, other: &Self) -> bool { |
| if self.offset != other.offset |
| || self.len != other.len |
| || self.null_count != other.null_count |
| || self.data_type != other.data_type |
| || self.buffers.len() != other.buffers.len() |
| || self.child_data.len() != other.child_data.len() |
| { |
| return false; |
| } |
| |
| match (&self.null_bitmap, &other.null_bitmap) { |
| (Some(a), Some(b)) if a.bits.as_ptr() != b.bits.as_ptr() => return false, |
| (Some(_), None) | (None, Some(_)) => return false, |
| _ => {} |
| }; |
| |
| if !self |
| .buffers |
| .iter() |
| .zip(other.buffers.iter()) |
| .all(|(a, b)| a.as_ptr() == b.as_ptr()) |
| { |
| return false; |
| } |
| |
| self.child_data |
| .iter() |
| .zip(other.child_data.iter()) |
| .all(|(a, b)| a.ptr_eq(b)) |
| } |
| |
| /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`] |
| pub fn into_builder(self) -> ArrayDataBuilder { |
| self.into() |
| } |
| } |
| |
| /// Return the expected [`DataTypeLayout`] Arrays of this data |
| /// type are expected to have |
| pub(crate) fn layout(data_type: &DataType) -> DataTypeLayout { |
| // based on C/C++ implementation in |
| // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) |
| use std::mem::size_of; |
| match data_type { |
| DataType::Null => DataTypeLayout { |
| buffers: vec![], |
| can_contain_null_mask: false, |
| }, |
| DataType::Boolean => DataTypeLayout { |
| buffers: vec![BufferSpec::BitMap], |
| can_contain_null_mask: true, |
| }, |
| DataType::Int8 => DataTypeLayout::new_fixed_width(size_of::<i8>()), |
| DataType::Int16 => DataTypeLayout::new_fixed_width(size_of::<i16>()), |
| DataType::Int32 => DataTypeLayout::new_fixed_width(size_of::<i32>()), |
| DataType::Int64 => DataTypeLayout::new_fixed_width(size_of::<i64>()), |
| DataType::UInt8 => DataTypeLayout::new_fixed_width(size_of::<u8>()), |
| DataType::UInt16 => DataTypeLayout::new_fixed_width(size_of::<u16>()), |
| DataType::UInt32 => DataTypeLayout::new_fixed_width(size_of::<u32>()), |
| DataType::UInt64 => DataTypeLayout::new_fixed_width(size_of::<u64>()), |
| DataType::Float16 => DataTypeLayout::new_fixed_width(size_of::<f16>()), |
| DataType::Float32 => DataTypeLayout::new_fixed_width(size_of::<f32>()), |
| DataType::Float64 => DataTypeLayout::new_fixed_width(size_of::<f64>()), |
| DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width(size_of::<i64>()), |
| DataType::Date32 => DataTypeLayout::new_fixed_width(size_of::<i32>()), |
| DataType::Date64 => DataTypeLayout::new_fixed_width(size_of::<i64>()), |
| DataType::Time32(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()), |
| DataType::Time64(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()), |
| DataType::Interval(IntervalUnit::YearMonth) => { |
| DataTypeLayout::new_fixed_width(size_of::<i32>()) |
| } |
| DataType::Interval(IntervalUnit::DayTime) => { |
| DataTypeLayout::new_fixed_width(size_of::<i64>()) |
| } |
| DataType::Interval(IntervalUnit::MonthDayNano) => { |
| DataTypeLayout::new_fixed_width(size_of::<i128>()) |
| } |
| DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::<i64>()), |
| DataType::Binary => DataTypeLayout::new_binary(size_of::<i32>()), |
| DataType::FixedSizeBinary(bytes_per_value) => { |
| let bytes_per_value: usize = (*bytes_per_value) |
| .try_into() |
| .expect("negative size for fixed size binary"); |
| DataTypeLayout::new_fixed_width(bytes_per_value) |
| } |
| DataType::LargeBinary => DataTypeLayout::new_binary(size_of::<i64>()), |
| DataType::Utf8 => DataTypeLayout::new_binary(size_of::<i32>()), |
| DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::<i64>()), |
| DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()), |
| DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data |
| DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::<i32>()), |
| DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, |
| DataType::Union(_, _, mode) => { |
| let type_ids = BufferSpec::FixedWidth { |
| byte_width: size_of::<i8>(), |
| }; |
| |
| DataTypeLayout { |
| buffers: match mode { |
| UnionMode::Sparse => { |
| vec![type_ids] |
| } |
| UnionMode::Dense => { |
| vec![ |
| type_ids, |
| BufferSpec::FixedWidth { |
| byte_width: size_of::<i32>(), |
| }, |
| ] |
| } |
| }, |
| can_contain_null_mask: false, |
| } |
| } |
| DataType::Dictionary(key_type, _value_type) => layout(key_type), |
| DataType::Decimal128(_, _) => { |
| // Decimals are always some fixed width; The rust implementation |
| // always uses 16 bytes / size of i128 |
| DataTypeLayout::new_fixed_width(size_of::<i128>()) |
| } |
| DataType::Decimal256(_, _) => { |
| // Decimals are always some fixed width. |
| DataTypeLayout::new_fixed_width(32) |
| } |
| DataType::Map(_, _) => { |
| // same as ListType |
| DataTypeLayout::new_fixed_width(size_of::<i32>()) |
| } |
| } |
| } |
| |
/// Layout specification for a data type
#[derive(Debug, PartialEq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub(crate) struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    /// (the null/validity bitmap is handled separately and is not listed here)
    pub buffers: Vec<BufferSpec>,

    /// Can contain a null bitmask
    pub can_contain_null_mask: bool,
}
| |
| impl DataTypeLayout { |
| /// Describes a basic numeric array where each element has a fixed width |
| pub fn new_fixed_width(byte_width: usize) -> Self { |
| Self { |
| buffers: vec![BufferSpec::FixedWidth { byte_width }], |
| can_contain_null_mask: true, |
| } |
| } |
| |
| /// Describes arrays which have no data of their own |
| /// (e.g. FixedSizeList). Note such arrays may still have a Null |
| /// Bitmap |
| pub fn new_empty() -> Self { |
| Self { |
| buffers: vec![], |
| can_contain_null_mask: true, |
| } |
| } |
| |
| /// Describes a basic numeric array where each element has a fixed |
| /// with offset buffer of `offset_byte_width` bytes, followed by a |
| /// variable width data buffer |
| pub fn new_binary(offset_byte_width: usize) -> Self { |
| Self { |
| buffers: vec![ |
| // offsets |
| BufferSpec::FixedWidth { |
| byte_width: offset_byte_width, |
| }, |
| // values |
| BufferSpec::VariableWidth, |
| ], |
| can_contain_null_mask: true, |
| } |
| } |
| } |
| |
/// Layout specification for a single data type buffer
/// (one entry in [`DataTypeLayout::buffers`])
#[derive(Debug, PartialEq)]
pub(crate) enum BufferSpec {
    /// each element has a fixed width (`byte_width` bytes per element)
    FixedWidth { byte_width: usize },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Unused currently in Rust implementation,
    /// (used in C++ for Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
| |
impl PartialEq for ArrayData {
    // Logical equality: delegates to the comparison in `super::equal`,
    // unlike the cheaper pointer-based `ArrayData::ptr_eq`
    fn eq(&self, other: &Self) -> bool {
        equal(self, other)
    }
}
| |
/// Builder for `ArrayData` type
#[derive(Debug)]
pub struct ArrayDataBuilder {
    // data type of the array being built
    data_type: DataType,
    // length, in elements
    len: usize,
    // number of nulls, if already known
    null_count: Option<usize>,
    // optional null/validity bitmap buffer
    null_bit_buffer: Option<Buffer>,
    // element offset into the buffers
    offset: usize,
    // data buffers (layout depends on `data_type`)
    buffers: Vec<Buffer>,
    // child arrays for nested types
    child_data: Vec<ArrayData>,
}
| |
| impl ArrayDataBuilder { |
| #[inline] |
| pub const fn new(data_type: DataType) -> Self { |
| Self { |
| data_type, |
| len: 0, |
| null_count: None, |
| null_bit_buffer: None, |
| offset: 0, |
| buffers: vec![], |
| child_data: vec![], |
| } |
| } |
| |
| pub fn data_type(self, data_type: DataType) -> Self { |
| Self { data_type, ..self } |
| } |
| |
| #[inline] |
| #[allow(clippy::len_without_is_empty)] |
| pub const fn len(mut self, n: usize) -> Self { |
| self.len = n; |
| self |
| } |
| |
| pub fn null_count(mut self, null_count: usize) -> Self { |
| self.null_count = Some(null_count); |
| self |
| } |
| |
| pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self { |
| self.null_bit_buffer = buf; |
| self |
| } |
| |
| #[inline] |
| pub const fn offset(mut self, n: usize) -> Self { |
| self.offset = n; |
| self |
| } |
| |
| pub fn buffers(mut self, v: Vec<Buffer>) -> Self { |
| self.buffers = v; |
| self |
| } |
| |
| pub fn add_buffer(mut self, b: Buffer) -> Self { |
| self.buffers.push(b); |
| self |
| } |
| |
| pub fn child_data(mut self, v: Vec<ArrayData>) -> Self { |
| self.child_data = v; |
| self |
| } |
| |
| pub fn add_child_data(mut self, r: ArrayData) -> Self { |
| self.child_data.push(r); |
| self |
| } |
| |
| /// Creates an array data, without any validation |
| /// |
| /// # Safety |
| /// |
| /// The same caveats as [`ArrayData::new_unchecked`] |
| /// apply. |
| pub unsafe fn build_unchecked(self) -> ArrayData { |
| ArrayData::new_unchecked( |
| self.data_type, |
| self.len, |
| self.null_count, |
| self.null_bit_buffer, |
| self.offset, |
| self.buffers, |
| self.child_data, |
| ) |
| } |
| |
| /// Creates an array data, validating all inputs |
| pub fn build(self) -> Result<ArrayData> { |
| ArrayData::try_new( |
| self.data_type, |
| self.len, |
| self.null_bit_buffer, |
| self.offset, |
| self.buffers, |
| self.child_data, |
| ) |
| } |
| } |
| |
| impl From<ArrayData> for ArrayDataBuilder { |
| fn from(d: ArrayData) -> Self { |
| // TODO: Store Bitmap on ArrayData (#1799) |
| let null_bit_buffer = d.null_buffer().cloned(); |
| Self { |
| null_bit_buffer, |
| data_type: d.data_type, |
| len: d.len, |
| null_count: Some(d.null_count), |
| offset: d.offset, |
| buffers: d.buffers, |
| child_data: d.child_data, |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| use std::ptr::NonNull; |
| |
| use crate::array::{ |
| make_array, Array, BooleanBuilder, Decimal128Builder, FixedSizeListBuilder, |
| Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, |
| UInt8Builder, |
| }; |
| use crate::buffer::Buffer; |
| use crate::datatypes::Field; |
| use crate::util::bit_util; |
| |
    #[test]
    fn test_builder() {
        // Builds Int32 data with offset + null bitmap, checking all accessors
        // Buffer needs to be at least 25 long
        let v = (0..25).collect::<Vec<i32>>();
        let b1 = Buffer::from_slice_ref(&v);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(b1)
            .null_bit_buffer(Some(Buffer::from(vec![
                0b01011111, 0b10110101, 0b01100011, 0b00011110,
            ])))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());
        assert_eq!(
            Buffer::from_slice_ref(&v).as_slice(),
            arr_data.buffers()[0].as_slice()
        );
    }
| |
    #[test]
    fn test_builder_with_child_data() {
        // Struct arrays carry their column values as child data
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]);

        let arr_data = ArrayData::builder(data_type)
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }
| |
    #[test]
    fn test_null_count() {
        // 3 of 16 validity bits set => 13 nulls
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(13, arr_data.null_count());

        // Test with offset: bit 0 falls outside the [2, 14) window
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert_eq!(10, arr_data.null_count());
    }
| |
    #[test]
    fn test_null_buffer_ref() {
        // The builder should expose the exact bitmap bytes it was given
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        assert!(arr_data.null_buffer().is_some());
        assert_eq!(&bit_v, arr_data.null_buffer().unwrap().as_slice());
    }
| |
    #[test]
    fn test_slice() {
        // Slicing adjusts offset/len and recomputes the null count
        let mut bit_v: [u8; 2] = [0; 2];
        bit_util::set_bit(&mut bit_v, 0);
        bit_util::set_bit(&mut bit_v, 3);
        bit_util::set_bit(&mut bit_v, 10);
        let data = ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(bit_v)))
            .build()
            .unwrap();
        let new_data = data.slice(1, 15);
        assert_eq!(data.len() - 1, new_data.len());
        assert_eq!(1, new_data.offset());
        assert_eq!(data.null_count(), new_data.null_count());

        // slice of a slice (removes one null)
        let new_data = new_data.slice(1, 14);
        assert_eq!(data.len() - 2, new_data.len());
        assert_eq!(2, new_data.offset());
        assert_eq!(data.null_count() - 1, new_data.null_count());
    }
| |
    #[test]
    fn test_equality() {
        // Compares logical equality (PartialEq) against pointer-based ptr_eq:
        // ptr_eq holds only for clones that share buffers (and equal metadata)
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();

        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();
        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        // slicing changes offset/len, so ptr_eq with the original fails
        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[0_i32, 2_i32, 2_i32, 5_i32]);
        let string_data = ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }
| |
    #[test]
    fn test_count_nulls() {
        // count_nulls counts the zero bits within the requested window
        let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111]));
        let count = count_nulls(null_buffer.as_ref(), 0, 16);
        assert_eq!(count, 7);

        let count = count_nulls(null_buffer.as_ref(), 4, 8);
        assert_eq!(count, 3);
    }
| |
    #[test]
    #[should_panic(
        expected = "Need at least 80 bytes in buffers[0] in array of type Int64, but got 8"
    )]
    fn test_buffer_too_small() {
        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
        // should fail as the declared size (10*8 = 80) is larger than the underlying buffer (8)
        ArrayData::try_new(DataType::Int64, 10, None, 0, vec![buffer], vec![]).unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Need at least 16 bytes in buffers[0] in array of type Int64, but got 8"
    )]
    fn test_buffer_too_small_offset() {
        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
        // should fail -- 8 bytes suffices for len 1, but offset 1 raises the requirement to 16
        ArrayData::try_new(DataType::Int64, 1, None, 1, vec![buffer], vec![]).unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Expected 1 buffers in array of type Int64, got 2")]
    fn test_bad_number_of_buffers() {
        // primitive Int64 arrays have exactly one data buffer
        let buffer1 = Buffer::from_slice_ref(&[0i32, 2i32]);
        let buffer2 = Buffer::from_slice_ref(&[0i32, 2i32]);
        ArrayData::try_new(DataType::Int64, 1, None, 0, vec![buffer1, buffer2], vec![])
            .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "integer overflow computing min buffer size")]
    fn test_fixed_width_overflow() {
        // usize::MAX elements * 8 bytes overflows the min-size computation
        let buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
        ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![])
            .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "null_bit_buffer size too small. got 1 needed 2")]
    fn test_bitmap_too_small() {
        // 9 elements need 2 bytes of validity bits; only 1 supplied
        let buffer = make_i32_buffer(9);
        let null_bit_buffer = Buffer::from(vec![0b11111111]);

        ArrayData::try_new(
            DataType::Int32,
            9,
            Some(null_bit_buffer),
            0,
            vec![buffer],
            vec![],
        )
        .unwrap();
    }
| |
    // Test creating a dictionary with a non integer type
    #[test]
    #[should_panic(expected = "Dictionary key type must be integer, but was Utf8")]
    fn test_non_int_dictionary() {
        let i32_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
        // declares Utf8 as the key type, which must be rejected
        let data_type =
            DataType::Dictionary(Box::new(DataType::Utf8), Box::new(DataType::Int32));
        let child_data = ArrayData::try_new(
            DataType::Int32,
            1,
            None,
            0,
            vec![i32_buffer.clone()],
            vec![],
        )
        .unwrap();
        ArrayData::try_new(
            data_type,
            1,
            None,
            0,
            vec![i32_buffer.clone(), i32_buffer],
            vec![child_data],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Expected LargeUtf8 but child data had Utf8")]
    fn test_mismatched_dictionary_types() {
        // test w/ dictionary created with a child array data that has type different than declared
        let string_array: StringArray =
            vec![Some("foo"), Some("bar")].into_iter().collect();
        let i32_buffer = Buffer::from_slice_ref(&[0i32, 1i32]);
        // Dict says LargeUtf8 but array is Utf8; child type validation must fail
        let data_type = DataType::Dictionary(
            Box::new(DataType::Int32),
            Box::new(DataType::LargeUtf8),
        );
        let child_data = string_array.into_data();
        ArrayData::try_new(data_type, 1, None, 0, vec![i32_buffer], vec![child_data])
            .unwrap();
    }
| |
    #[test]
    fn test_empty_utf8_array_with_empty_offsets_buffer() {
        // empty arrays may omit the offsets buffer entirely
        let data_buffer = Buffer::from(&[]);
        let offsets_buffer = Buffer::from(&[]);
        ArrayData::try_new(
            DataType::Utf8,
            0,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    fn test_empty_utf8_array_with_single_zero_offset() {
        // ... or contain a single `0` offset
        let data_buffer = Buffer::from(&[]);
        let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
        ArrayData::try_new(
            DataType::Utf8,
            0,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")]
    fn test_empty_utf8_array_with_invalid_offset() {
        // even for an empty array, a present first offset must lie within the values buffer
        let data_buffer = Buffer::from(&[]);
        let offsets_buffer = Buffer::from_slice_ref(&[1i32]);
        ArrayData::try_new(
            DataType::Utf8,
            0,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    fn test_empty_utf8_array_with_non_zero_offset() {
        // offset=3 with len=0: validation only considers the trailing offsets,
        // so the earlier entries are ignored
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]);
        ArrayData::try_new(
            DataType::Utf8,
            0,
            None,
            3,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 8 bytes got 4"
    )]
    fn test_empty_large_utf8_array_with_wrong_type_offsets() {
        // LargeUtf8 requires i64 offsets; a single i32 offset (4 bytes) is too small
        let data_buffer = Buffer::from(&[]);
        let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
        ArrayData::try_new(
            DataType::LargeUtf8,
            0,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Buffer 0 of Utf8 isn't large enough. Expected 12 bytes got 8"
    )]
    fn test_validate_offsets_i32() {
        // len 2 requires 3 i32 offsets (12 bytes); only 2 are provided
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Buffer 0 of LargeUtf8 isn't large enough. Expected 24 bytes got 16"
    )]
    fn test_validate_offsets_i64() {
        // len 2 requires 3 i64 offsets (24 bytes); only 2 are provided
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[0i64, 2i64]);
        ArrayData::try_new(
            DataType::LargeUtf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Error converting offset[0] (-2) to usize for Utf8")]
    fn test_validate_offsets_negative_first_i32() {
        // negative offsets cannot be converted to usize
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[-2i32, 1i32, 3i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Error converting offset[2] (-3) to usize for Utf8")]
    fn test_validate_offsets_negative_last_i32() {
        // a negative offset in the last slot is also rejected
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, -3i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "First offset 4 in Utf8 is smaller than last offset 3")]
    fn test_validate_offsets_range_too_small() {
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        // start offset (4) is larger than the offsets that follow it
        let offsets_buffer = Buffer::from_slice_ref(&[4i32, 2i32, 3i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Last offset 10 of Utf8 is larger than values length 6")]
    fn test_validate_offsets_range_too_large() {
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        // last offset 10 is past the end of the 6-byte values buffer
        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2i32, 10i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "First offset 10 of Utf8 is larger than values length 6")]
    fn test_validate_offsets_first_too_large() {
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        // first offset 10 is past the end of the 6-byte values buffer
        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 10i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    fn test_validate_offsets_first_too_large_skipped() {
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        // 10 is off the end of the buffer, but the array offset starts at 1 so
        // offsets[0] is never examined and validation succeeds
        let offsets_buffer = Buffer::from_slice_ref(&[10i32, 2i32, 3i32, 4i32]);
        let data = ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            1,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
        let array: StringArray = data.into();
        let expected: StringArray = vec![Some("c"), Some("d")].into_iter().collect();
        assert_eq!(array, expected);
    }
| |
    #[test]
    #[should_panic(expected = "Last offset 8 of Utf8 is larger than values length 6")]
    fn test_validate_offsets_last_too_large() {
        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
        // offsets 7 and 8 are past the end of the 6 byte values buffer
        let offsets_buffer = Buffer::from_slice_ref(&[5i32, 7i32, 8i32]);
        ArrayData::try_new(
            DataType::Utf8,
            2,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Values length 4 is less than the length (2) multiplied by the value size (2) for FixedSizeList"
    )]
    fn test_validate_fixed_size_list() {
        // child has 4 elements,
        let child_array = vec![Some(1), Some(2), Some(3), None]
            .into_iter()
            .collect::<Int32Array>();

        // but claim we have 3 elements for a fixed size of 2
        // (that would require 3 * 2 = 6 child values, more than the 4 available)
        let field = Field::new("field", DataType::Int32, true);
        ArrayData::try_new(
            DataType::FixedSizeList(Box::new(field), 2),
            3,
            None,
            0,
            vec![],
            vec![child_array.into_data()],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(expected = "Child type mismatch for Struct")]
    fn test_validate_struct_child_type() {
        let field1 = vec![Some(1), Some(2), Some(3), None]
            .into_iter()
            .collect::<Int32Array>();

        // validate that the type of struct fields matches child fields
        // (the field is declared Int64 but the child data is Int32)
        ArrayData::try_new(
            DataType::Struct(vec![Field::new("field1", DataType::Int64, true)]),
            3,
            None,
            0,
            vec![],
            vec![field1.into_data()],
        )
        .unwrap();
    }
| |
| #[test] |
| #[should_panic( |
| expected = "child array #0 for field field1 has length smaller than expected for struct array (4 < 6)" |
| )] |
| fn test_validate_struct_child_length() { |
| // field length only has 4 items, but array claims to have 6 |
| let field1 = vec![Some(1), Some(2), Some(3), None] |
| .into_iter() |
| .collect::<Int32Array>(); |
| |
| ArrayData::try_new( |
| DataType::Struct(vec![Field::new("field1", DataType::Int32, true)]), |
| 6, |
| None, |
| 0, |
| vec![], |
| vec![field1.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| /// Test that the array of type `data_type` that has invalid utf8 data errors |
| fn check_utf8_validation<T: ArrowNativeType>(data_type: DataType) { |
| // 0x80 is a utf8 continuation sequence and is not a valid utf8 sequence itself |
| let data_buffer = Buffer::from_slice_ref(&[b'a', b'a', 0x80, 0x00]); |
| let offsets: Vec<T> = [0, 2, 3] |
| .iter() |
| .map(|&v| T::from_usize(v).unwrap()) |
| .collect(); |
| |
| let offsets_buffer = Buffer::from_slice_ref(&offsets); |
| ArrayData::try_new( |
| data_type, |
| 2, |
| None, |
| 0, |
| vec![offsets_buffer, data_buffer], |
| vec![], |
| ) |
| .unwrap(); |
| } |
| |
    #[test]
    #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")]
    fn test_validate_utf8_content() {
        // invalid utf8 bytes must be rejected for Utf8 (i32 offsets)
        check_utf8_validation::<i32>(DataType::Utf8);
    }
| |
    #[test]
    #[should_panic(expected = "Invalid UTF8 sequence at string index 1 (2..3)")]
    fn test_validate_large_utf8_content() {
        // invalid utf8 bytes must be rejected for LargeUtf8 (i64 offsets)
        check_utf8_validation::<i64>(DataType::LargeUtf8);
    }
| |
| /// Tests that offsets are at valid codepoint boundaries |
| fn check_utf8_char_boundary<T: ArrowNativeType>(data_type: DataType) { |
| let data_buffer = Buffer::from("🙀".as_bytes()); |
| let offsets: Vec<T> = [0, 1, data_buffer.len()] |
| .iter() |
| .map(|&v| T::from_usize(v).unwrap()) |
| .collect(); |
| |
| let offsets_buffer = Buffer::from_slice_ref(&offsets); |
| ArrayData::try_new( |
| data_type, |
| 2, |
| None, |
| 0, |
| vec![offsets_buffer, data_buffer], |
| vec![], |
| ) |
| .unwrap(); |
| } |
| |
    #[test]
    #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")]
    fn test_validate_utf8_char_boundary() {
        // offsets inside a codepoint must be rejected for Utf8 (i32 offsets)
        check_utf8_char_boundary::<i32>(DataType::Utf8);
    }
| |
    #[test]
    #[should_panic(expected = "incomplete utf-8 byte sequence from index 0")]
    fn test_validate_large_utf8_char_boundary() {
        // offsets inside a codepoint must be rejected for LargeUtf8 (i64 offsets)
        check_utf8_char_boundary::<i64>(DataType::LargeUtf8);
    }
| |
| /// Test that the array of type `data_type` that has invalid indexes (out of bounds) |
| fn check_index_out_of_bounds_validation<T: ArrowNativeType>(data_type: DataType) { |
| let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']); |
| // First two offsets are fine, then 5 is out of bounds |
| let offsets: Vec<T> = [0, 1, 2, 5, 2] |
| .iter() |
| .map(|&v| T::from_usize(v).unwrap()) |
| .collect(); |
| |
| let offsets_buffer = Buffer::from_slice_ref(&offsets); |
| ArrayData::try_new( |
| data_type, |
| 4, |
| None, |
| 0, |
| vec![offsets_buffer, data_buffer], |
| vec![], |
| ) |
| .unwrap(); |
| } |
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
    )]
    fn test_validate_utf8_out_of_bounds() {
        // out of bounds offsets must be rejected for Utf8 (i32 offsets)
        check_index_out_of_bounds_validation::<i32>(DataType::Utf8);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
    )]
    fn test_validate_large_utf8_out_of_bounds() {
        // out of bounds offsets must be rejected for LargeUtf8 (i64 offsets)
        check_index_out_of_bounds_validation::<i64>(DataType::LargeUtf8);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
    )]
    fn test_validate_binary_out_of_bounds() {
        // out of bounds offsets must be rejected for Binary (i32 offsets)
        check_index_out_of_bounds_validation::<i32>(DataType::Binary);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4"
    )]
    fn test_validate_large_binary_out_of_bounds() {
        // out of bounds offsets must be rejected for LargeBinary (i64 offsets)
        check_index_out_of_bounds_validation::<i64>(DataType::LargeBinary);
    }
| |
    // Checks that an array of `data_type` whose offsets go backwards
    // (are not monotonically non-decreasing) fails validation
    fn check_index_backwards_validation<T: ArrowNativeType>(data_type: DataType) {
        let data_buffer = Buffer::from_slice_ref(&[b'a', b'b', b'c', b'd']);
        // First four offsets are fine, then the final offset (1) goes backwards
        let offsets: Vec<T> = [0, 1, 2, 2, 1]
            .iter()
            .map(|&v| T::from_usize(v).unwrap())
            .collect();

        let offsets_buffer = Buffer::from_slice_ref(&offsets);
        ArrayData::try_new(
            data_type,
            4,
            None,
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap();
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
    )]
    fn test_validate_utf8_index_backwards() {
        // backwards offsets must be rejected for Utf8 (i32 offsets)
        check_index_backwards_validation::<i32>(DataType::Utf8);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
    )]
    fn test_validate_large_utf8_index_backwards() {
        // backwards offsets must be rejected for LargeUtf8 (i64 offsets)
        check_index_backwards_validation::<i64>(DataType::LargeUtf8);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
    )]
    fn test_validate_binary_index_backwards() {
        // backwards offsets must be rejected for Binary (i32 offsets)
        check_index_backwards_validation::<i32>(DataType::Binary);
    }
| |
    #[test]
    #[should_panic(
        expected = "Offset invariant failure: non-monotonic offset at slot 3: 2 > 1"
    )]
    fn test_validate_large_binary_index_backwards() {
        // backwards offsets must be rejected for LargeBinary (i64 offsets)
        check_index_backwards_validation::<i64>(DataType::LargeBinary);
    }
| |
| #[test] |
| #[should_panic( |
| expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])" |
| )] |
| fn test_validate_dictionary_index_too_large() { |
| let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); |
| |
| // 3 is not a valid index into the values (only 0 and 1) |
| let keys: Int32Array = [Some(1), Some(3)].into_iter().collect(); |
| |
| let data_type = DataType::Dictionary( |
| Box::new(keys.data_type().clone()), |
| Box::new(values.data_type().clone()), |
| ); |
| |
| ArrayData::try_new( |
| data_type, |
| 2, |
| None, |
| 0, |
| vec![keys.data().buffers[0].clone()], |
| vec![values.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| #[test] |
| #[should_panic( |
| expected = "Value at position 1 out of bounds: -1 (should be in [0, 1]" |
| )] |
| fn test_validate_dictionary_index_negative() { |
| let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); |
| |
| // -1 is not a valid index at all! |
| let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); |
| |
| let data_type = DataType::Dictionary( |
| Box::new(keys.data_type().clone()), |
| Box::new(values.data_type().clone()), |
| ); |
| |
| ArrayData::try_new( |
| data_type, |
| 2, |
| None, |
| 0, |
| vec![keys.data().buffers[0].clone()], |
| vec![values.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| #[test] |
| fn test_validate_dictionary_index_negative_but_not_referenced() { |
| let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect(); |
| |
| // -1 is not a valid index at all, but the array is length 1 |
| // so the -1 should not be looked at |
| let keys: Int32Array = [Some(1), Some(-1)].into_iter().collect(); |
| |
| let data_type = DataType::Dictionary( |
| Box::new(keys.data_type().clone()), |
| Box::new(values.data_type().clone()), |
| ); |
| |
| // Expect this not to panic |
| ArrayData::try_new( |
| data_type, |
| 1, |
| None, |
| 0, |
| vec![keys.data().buffers[0].clone()], |
| vec![values.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
    #[test]
    #[should_panic(
        expected = "Value at position 0 out of bounds: 18446744073709551615 (can not convert to i64)"
    )]
    fn test_validate_dictionary_index_giant_negative() {
        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();

        // u64::MAX cannot even be converted to a signed i64 index,
        // let alone be within the dictionary's bounds
        let keys: UInt64Array = [Some(u64::MAX), Some(1)].into_iter().collect();

        let data_type = DataType::Dictionary(
            Box::new(keys.data_type().clone()),
            Box::new(values.data_type().clone()),
        );

        ArrayData::try_new(
            data_type,
            2,
            None,
            0,
            vec![keys.data().buffers[0].clone()],
            vec![values.into_data()],
        )
        .unwrap();
    }
| |
    /// Test that the list of type `data_type` generates correct offset out of bounds errors
    fn check_list_offsets<T: ArrowNativeType>(data_type: DataType) {
        let values: Int32Array =
            [Some(1), Some(2), Some(3), Some(4)].into_iter().collect();

        // 5 is an invalid offset into a child array of only four values
        let offsets: Vec<T> = [0, 2, 5, 4]
            .iter()
            .map(|&v| T::from_usize(v).unwrap())
            .collect();
        let offsets_buffer = Buffer::from_slice_ref(&offsets);

        ArrayData::try_new(
            data_type,
            3,
            None,
            0,
            vec![offsets_buffer],
            vec![values.into_data()],
        )
        .unwrap();
    }
| |
| #[test] |
| #[should_panic( |
| expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" |
| )] |
| fn test_validate_list_offsets() { |
| let field_type = Field::new("f", DataType::Int32, true); |
| check_list_offsets::<i32>(DataType::List(Box::new(field_type))); |
| } |
| |
| #[test] |
| #[should_panic( |
| expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" |
| )] |
| fn test_validate_large_list_offsets() { |
| let field_type = Field::new("f", DataType::Int32, true); |
| check_list_offsets::<i64>(DataType::LargeList(Box::new(field_type))); |
| } |
| |
| /// Test that the list of type `data_type` generates correct errors for negative offsets |
| #[test] |
| #[should_panic( |
| expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" |
| )] |
| fn test_validate_list_negative_offsets() { |
| let values: Int32Array = |
| [Some(1), Some(2), Some(3), Some(4)].into_iter().collect(); |
| let field_type = Field::new("f", values.data_type().clone(), true); |
| let data_type = DataType::List(Box::new(field_type)); |
| |
| // -1 is an invalid offset any way you look at it |
| let offsets: Vec<i32> = vec![0, 2, -1, 4]; |
| let offsets_buffer = Buffer::from_slice_ref(&offsets); |
| |
| ArrayData::try_new( |
| data_type, |
| 3, |
| None, |
| 0, |
| vec![offsets_buffer], |
| vec![values.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
    #[test]
    #[should_panic(
        expected = "Value at position 1 out of bounds: -1 (should be in [0, 1])"
    )]
    /// test that children are validated recursively (aka bugs in child data of struct also are flagged)
    fn test_validate_recursive() {
        // Form invalid dictionary array
        let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
        // -1 is not a valid index
        let keys: Int32Array = [Some(1), Some(-1), Some(1)].into_iter().collect();

        let dict_data_type = DataType::Dictionary(
            Box::new(keys.data_type().clone()),
            Box::new(values.data_type().clone()),
        );

        // purposely create an invalid child data
        // SAFETY: new_unchecked skips validation, which is exactly what lets the
        // invalid keys through here; the buffers themselves are well-formed
        let dict_data = unsafe {
            ArrayData::new_unchecked(
                dict_data_type,
                2,
                None,
                None,
                0,
                vec![keys.data().buffers[0].clone()],
                vec![values.into_data()],
            )
        };

        // Now, try and create a struct with this invalid child data (and expect an error)
        let data_type =
            DataType::Struct(vec![Field::new("d", dict_data.data_type().clone(), true)]);

        ArrayData::try_new(data_type, 1, None, 0, vec![], vec![dict_data]).unwrap();
    }
| |
| /// returns a buffer initialized with some constant value for tests |
| fn make_i32_buffer(n: usize) -> Buffer { |
| Buffer::from_slice_ref(&vec![42i32; n]) |
| } |
| |
| /// returns a buffer initialized with some constant value for tests |
| fn make_f32_buffer(n: usize) -> Buffer { |
| Buffer::from_slice_ref(&vec![42f32; n]) |
| } |
| |
| #[test] |
| #[should_panic(expected = "Expected Int64 but child data had Int32")] |
| fn test_validate_union_different_types() { |
| let field1 = vec![Some(1), Some(2)].into_iter().collect::<Int32Array>(); |
| |
| let field2 = vec![Some(1), Some(2)].into_iter().collect::<Int32Array>(); |
| |
| let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); |
| |
| ArrayData::try_new( |
| DataType::Union( |
| vec![ |
| Field::new("field1", DataType::Int32, true), |
| Field::new("field2", DataType::Int64, true), // data is int32 |
| ], |
| vec![0, 1], |
| UnionMode::Sparse, |
| ), |
| 2, |
| None, |
| 0, |
| vec![type_ids], |
| vec![field1.into_data(), field2.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| // sparse with wrong sized children |
| #[test] |
| #[should_panic( |
| expected = "Sparse union child array #1 has length smaller than expected for union array (1 < 2)" |
| )] |
| fn test_validate_union_sparse_different_child_len() { |
| let field1 = vec![Some(1), Some(2)].into_iter().collect::<Int32Array>(); |
| |
| // field 2 only has 1 item but array should have 2 |
| let field2 = vec![Some(1)].into_iter().collect::<Int64Array>(); |
| |
| let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); |
| |
| ArrayData::try_new( |
| DataType::Union( |
| vec![ |
| Field::new("field1", DataType::Int32, true), |
| Field::new("field2", DataType::Int64, true), |
| ], |
| vec![0, 1], |
| UnionMode::Sparse, |
| ), |
| 2, |
| None, |
| 0, |
| vec![type_ids], |
| vec![field1.into_data(), field2.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| #[test] |
| #[should_panic(expected = "Expected 2 buffers in array of type Union")] |
| fn test_validate_union_dense_without_offsets() { |
| let field1 = vec![Some(1), Some(2)].into_iter().collect::<Int32Array>(); |
| |
| let field2 = vec![Some(1)].into_iter().collect::<Int64Array>(); |
| |
| let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); |
| |
| ArrayData::try_new( |
| DataType::Union( |
| vec![ |
| Field::new("field1", DataType::Int32, true), |
| Field::new("field2", DataType::Int64, true), |
| ], |
| vec![0, 1], |
| UnionMode::Dense, |
| ), |
| 2, |
| None, |
| 0, |
| vec![type_ids], // need offsets buffer here too |
| vec![field1.into_data(), field2.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| #[test] |
| #[should_panic( |
| expected = "Need at least 8 bytes in buffers[1] in array of type Union" |
| )] |
| fn test_validate_union_dense_with_bad_len() { |
| let field1 = vec![Some(1), Some(2)].into_iter().collect::<Int32Array>(); |
| |
| let field2 = vec![Some(1)].into_iter().collect::<Int64Array>(); |
| |
| let type_ids = Buffer::from_slice_ref(&[0i8, 1i8]); |
| let offsets = Buffer::from_slice_ref(&[0i32]); // should have 2 offsets, but only have 1 |
| |
| ArrayData::try_new( |
| DataType::Union( |
| vec![ |
| Field::new("field1", DataType::Int32, true), |
| Field::new("field2", DataType::Int64, true), |
| ], |
| vec![0, 1], |
| UnionMode::Dense, |
| ), |
| 2, |
| None, |
| 0, |
| vec![type_ids, offsets], |
| vec![field1.into_data(), field2.into_data()], |
| ) |
| .unwrap(); |
| } |
| |
| #[test] |
| fn test_try_new_sliced_struct() { |
| let mut builder = StructBuilder::new( |
| vec![ |
| Field::new("a", DataType::Int32, true), |
| Field::new("b", DataType::Boolean, true), |
| ], |
| vec![ |
| Box::new(Int32Builder::new(5)), |
| Box::new(BooleanBuilder::new(5)), |
| ], |
| ); |
| |
| // struct[0] = { a: 10, b: true } |
| builder |
| .field_builder::<Int32Builder>(0) |
| .unwrap() |
| .append_option(Some(10)); |
| builder |
| .field_builder::<BooleanBuilder>(1) |
| .unwrap() |
| .append_option(Some(true)); |
| builder.append(true); |
| |
| // struct[1] = null |
| builder |
| .field_builder::<Int32Builder>(0) |
| .unwrap() |
| .append_option(None); |
| builder |
| .field_builder::<BooleanBuilder>(1) |
| .unwrap() |
| .append_option(None); |
| builder.append(false); |
| |
| // struct[2] = { a: null, b: false } |
| builder |
| .field_builder::<Int32Builder>(0) |
| .unwrap() |
| .append_option(None); |
| builder |
| .field_builder::<BooleanBuilder>(1) |
| .unwrap() |
| .append_option(Some(false)); |
| builder.append(true); |
| |
| // struct[3] = { a: 21, b: null } |
| builder |
| .field_builder::<Int32Builder>(0) |
| .unwrap() |
| .append_option(Some(21)); |
| builder |
| .field_builder::<BooleanBuilder>(1) |
| .unwrap() |
| .append_option(None); |
| builder.append(true); |
| |
| // struct[4] = { a: 18, b: false } |
| builder |
| .field_builder::<Int32Builder>(0) |
| .unwrap() |
| .append_option(Some(18)); |
| builder |
| .field_builder::<BooleanBuilder>(1) |
| .unwrap() |
| .append_option(Some(false)); |
| builder.append(true); |
| |
| let struct_array = builder.finish(); |
| let struct_array_slice = struct_array.slice(1, 3); |
| let struct_array_data = struct_array_slice.data(); |
| |
| let cloned_data = ArrayData::try_new( |
| struct_array_slice.data_type().clone(), |
| struct_array_slice.len(), |
| struct_array_data.null_buffer().cloned(), |
| struct_array_slice.offset(), |
| struct_array_data.buffers().to_vec(), |
| struct_array_data.child_data().to_vec(), |
| ) |
| .unwrap(); |
| let cloned = crate::array::make_array(cloned_data); |
| |
| assert_eq!(&struct_array_slice, &cloned); |
| } |
| |
| #[test] |
| fn test_into_buffers() { |
| let data_types = vec![ |
| DataType::Union(vec![], vec![], UnionMode::Dense), |
| DataType::Union(vec![], vec![], UnionMode::Sparse), |
| ]; |
| |
| for data_type in data_types { |
| let buffers = new_buffers(&data_type, 0); |
| let [buffer1, buffer2] = buffers; |
| let buffers = into_buffers(&data_type, buffer1, buffer2); |
| |
| let layout = layout(&data_type); |
| assert_eq!(buffers.len(), layout.buffers.len()); |
| } |
| } |
| |
    #[test]
    fn test_string_data_from_foreign() {
        let mut strings = "foobarfoobar".to_owned();
        let mut offsets = vec![0_i32, 0, 3, 6, 12];
        let mut bitmap = vec![0b1110_u8];

        // SAFETY: the pointer is derived from a live String and is therefore
        // non-null; the Arc owner keeps the allocation alive as long as the buffer
        let strings_buffer = unsafe {
            Buffer::from_custom_allocation(
                NonNull::new_unchecked(strings.as_mut_ptr()),
                strings.len(),
                Arc::new(strings),
            )
        };
        // SAFETY: as above; length is given in bytes (elements * size_of::<i32>)
        let offsets_buffer = unsafe {
            Buffer::from_custom_allocation(
                NonNull::new_unchecked(offsets.as_mut_ptr() as *mut u8),
                offsets.len() * std::mem::size_of::<i32>(),
                Arc::new(offsets),
            )
        };
        // SAFETY: as above; the Vec<u8> bitmap is kept alive by the Arc owner
        let null_buffer = unsafe {
            Buffer::from_custom_allocation(
                NonNull::new_unchecked(bitmap.as_mut_ptr()),
                bitmap.len(),
                Arc::new(bitmap),
            )
        };

        // Buffers backed by foreign allocations must pass validation like any other
        let data = ArrayData::try_new(
            DataType::Utf8,
            4,
            Some(null_buffer),
            0,
            vec![offsets_buffer, strings_buffer],
            vec![],
        )
        .unwrap();

        let array = make_array(data);
        let array = array.as_any().downcast_ref::<StringArray>().unwrap();

        // bitmap 0b1110 marks slot 0 null; offsets [0,0,3,6,12] slice the string
        let expected =
            StringArray::from(vec![None, Some("foo"), Some("bar"), Some("foobar")]);

        assert_eq!(array, &expected);
    }
| |
| #[test] |
| #[cfg(not(feature = "force_validate"))] |
| fn test_decimal_full_validation() { |
| let values_builder = UInt8Builder::new(10); |
| let byte_width = 16; |
| let mut fixed_size_builder = |
| FixedSizeListBuilder::new(values_builder, byte_width); |
| let value_as_bytes = Decimal128Builder::from_i128_to_fixed_size_bytes( |
| 123456, |
| fixed_size_builder.value_length() as usize, |
| ) |
| .unwrap(); |
| fixed_size_builder |
| .values() |
| .append_slice(value_as_bytes.as_slice()); |
| fixed_size_builder.append(true); |
| let fixed_size_array = fixed_size_builder.finish(); |
| |
| // Build ArrayData for Decimal |
| let builder = ArrayData::builder(DataType::Decimal128(5, 3)) |
| .len(fixed_size_array.len()) |
| .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); |
| let array_data = unsafe { builder.build_unchecked() }; |
| let validation_result = array_data.validate_full(); |
| let error = validation_result.unwrap_err(); |
| assert_eq!( |
| "Invalid argument error: 123456 is too large to store in a Decimal128 of precision 5. Max is 99999", |
| error.to_string() |
| ); |
| } |
| |
| #[test] |
| fn test_decimal_validation() { |
| let mut builder = Decimal128Builder::new(4, 10, 4); |
| builder.append_value(10000).unwrap(); |
| builder.append_value(20000).unwrap(); |
| let array = builder.finish(); |
| |
| array.data().validate_full().unwrap(); |
| } |
| |
    #[test]
    #[cfg(not(feature = "force_validate"))]
    fn test_sliced_array_child() {
        let values = Int32Array::from_iter_values([1, 2, 3]);
        // slicing leaves only 2 logical values, so offset 3 below is out of bounds
        let values_sliced = values.slice(1, 2);
        let offsets = Buffer::from_iter([1_i32, 3_i32]);

        let list_field = Field::new("element", DataType::Int32, false);
        let data_type = DataType::List(Box::new(list_field));

        // SAFETY: new_unchecked deliberately skips validation so that
        // validate_values below can be exercised on invalid data
        let data = unsafe {
            ArrayData::new_unchecked(
                data_type,
                1,
                None,
                None,
                0,
                vec![offsets],
                vec![values_sliced.into_data()],
            )
        };

        let err = data.validate_values().unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Offset invariant failure: offset at position 1 out of bounds: 3 > 2");
    }
| |
    #[test]
    fn test_contains_nulls() {
        // Validity bits: 0,0,0,1,1,0 — unset bits (positions 0-2 and 5) are nulls
        let buffer: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();

        // windows overlapping an unset bit contain nulls
        assert!(contains_nulls(Some(&buffer), 0, 6));
        assert!(contains_nulls(Some(&buffer), 0, 3));
        // windows covering only set bits, or of zero length, do not
        assert!(!contains_nulls(Some(&buffer), 3, 2));
        assert!(!contains_nulls(Some(&buffer), 0, 0));
    }
| } |