| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! Contains declarations to bind to the [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html). |
| //! |
//! Generally, this module is divided into two main interfaces:
| //! One interface maps C ABI to native Rust types, i.e. convert c-pointers, c_char, to native rust. |
| //! This is handled by [FFI_ArrowSchema] and [FFI_ArrowArray]. |
| //! |
//! The second interface maps native Rust types to the Rust-specific implementation of Arrow such as `format` to `DataType`,
| //! `Buffer`, etc. This is handled by `from_ffi` and `to_ffi`. |
| //! |
| //! |
| //! Export to FFI |
| //! |
| //! ```rust |
| //! # use std::sync::Arc; |
| //! # use arrow_array::{Int32Array, Array, make_array}; |
| //! # use arrow_data::ArrayData; |
| //! # use arrow_array::ffi::{to_ffi, from_ffi}; |
| //! # use arrow_schema::ArrowError; |
| //! # fn main() -> Result<(), ArrowError> { |
| //! // create an array natively |
| //! |
| //! let array = Int32Array::from(vec![Some(1), None, Some(3)]); |
| //! let data = array.into_data(); |
| //! |
| //! // Export it |
| //! let (out_array, out_schema) = to_ffi(&data)?; |
| //! |
| //! // import it |
| //! let data = unsafe { from_ffi(out_array, &out_schema) }?; |
| //! let array = Int32Array::from(data); |
| //! |
| //! // verify |
| //! assert_eq!(array, Int32Array::from(vec![Some(1), None, Some(3)])); |
| //! # |
| //! # Ok(()) |
| //! # } |
| //! ``` |
| //! |
| //! Import from FFI |
| //! |
| //! ``` |
| //! # use std::ptr::addr_of_mut; |
| //! # use arrow_array::ffi::{from_ffi, FFI_ArrowArray}; |
| //! # use arrow_array::{ArrayRef, make_array}; |
| //! # use arrow_schema::{ArrowError, ffi::FFI_ArrowSchema}; |
| //! # |
| //! /// A foreign data container that can export to C Data interface |
| //! struct ForeignArray {}; |
| //! |
| //! impl ForeignArray { |
| //! /// Export from foreign array representation to C Data interface |
| //! /// e.g. <https://github.com/apache/arrow/blob/fc1f9ebbc4c3ae77d5cfc2f9322f4373d3d19b8a/python/pyarrow/array.pxi#L1552> |
| //! fn export_to_c(&self, array: *mut FFI_ArrowArray, schema: *mut FFI_ArrowSchema) { |
| //! // ... |
| //! } |
| //! } |
| //! |
| //! /// Import an [`ArrayRef`] from a [`ForeignArray`] |
| //! fn import_array(foreign: &ForeignArray) -> Result<ArrayRef, ArrowError> { |
| //! let mut schema = FFI_ArrowSchema::empty(); |
| //! let mut array = FFI_ArrowArray::empty(); |
| //! foreign.export_to_c(addr_of_mut!(array), addr_of_mut!(schema)); |
| //! Ok(make_array(unsafe { from_ffi(array, &schema) }?)) |
| //! } |
| //! ``` |
| |
| /* |
| # Design: |
| |
| Main assumptions: |
* A memory region is deallocated according to its own release mechanism.
| * Rust shares memory regions between arrays. |
| * A memory region should be deallocated when no-one is using it. |
| |
| The design of this module is as follows: |
| |
| `ArrowArray` contains two `Arc`s, one per ABI-compatible `struct`, each containing data |
| according to the C Data Interface. These Arcs are used for ref counting of the structs |
| within Rust and lifetime management. |
| |
Each ABI-compatible `struct` knows how to `drop` itself, calling `release`.
| |
| To import an array, unsafely create an `ArrowArray` from two pointers using [ArrowArray::try_from_raw]. |
| To export an array, create an `ArrowArray` using [ArrowArray::try_new]. |
| */ |
| |
| use std::{mem::size_of, ptr::NonNull, sync::Arc}; |
| |
| use arrow_buffer::{Buffer, MutableBuffer, bit_util}; |
| pub use arrow_data::ffi::FFI_ArrowArray; |
| use arrow_data::{ArrayData, layout}; |
| pub use arrow_schema::ffi::FFI_ArrowSchema; |
| use arrow_schema::{ArrowError, DataType, UnionMode}; |
| |
| use crate::array::ArrayRef; |
| |
| type Result<T> = std::result::Result<T, ArrowError>; |
| |
| /// Exports an array to raw pointers of the C Data Interface provided by the consumer. |
| /// # Safety |
| /// Assumes that these pointers represent valid C Data Interfaces, both in memory |
| /// representation and lifetime via the `release` mechanism. |
| /// |
| /// This function copies the content of two FFI structs [arrow_data::ffi::FFI_ArrowArray] and |
| /// [arrow_schema::ffi::FFI_ArrowSchema] in the array to the location pointed by the raw pointers. |
| /// Usually the raw pointers are provided by the array data consumer. |
| #[deprecated( |
| since = "52.0.0", |
| note = "Use FFI_ArrowArray::new and FFI_ArrowSchema::try_from" |
| )] |
| pub unsafe fn export_array_into_raw( |
| src: ArrayRef, |
| out_array: *mut FFI_ArrowArray, |
| out_schema: *mut FFI_ArrowSchema, |
| ) -> Result<()> { |
| let data = src.to_data(); |
| let array = FFI_ArrowArray::new(&data); |
| let schema = FFI_ArrowSchema::try_from(data.data_type())?; |
| |
| unsafe { std::ptr::write_unaligned(out_array, array) }; |
| unsafe { std::ptr::write_unaligned(out_schema, schema) }; |
| |
| Ok(()) |
| } |
| |
| /// returns the number of bits that buffer `i` (in the C data interface) is expected to have. |
| /// This is set by the Arrow specification |
| fn bit_width(data_type: &DataType, i: usize) -> Result<usize> { |
| if let Some(primitive) = data_type.primitive_width() { |
| return match i { |
| 0 => Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." |
| ))), |
| 1 => Ok(primitive * 8), |
| i => Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))), |
| }; |
| } |
| |
| Ok(match (data_type, i) { |
| (DataType::Boolean, 1) => 1, |
| (DataType::Boolean, _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| (DataType::FixedSizeBinary(num_bytes), 1) => *num_bytes as usize * u8::BITS as usize, |
| (DataType::FixedSizeList(f, num_elems), 1) => { |
| let child_bit_width = bit_width(f.data_type(), 1)?; |
| child_bit_width * (*num_elems as usize) |
| } |
| (DataType::FixedSizeBinary(_), _) | (DataType::FixedSizeList(_, _), _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| // Variable-size list and map have one i32 buffer. |
| // Variable-sized binaries: have two buffers. |
| // "small": first buffer is i32, second is in bytes |
| (DataType::Utf8, 1) |
| | (DataType::Binary, 1) |
| | (DataType::List(_), 1) |
| | (DataType::Map(_, _), 1) => i32::BITS as _, |
| (DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _, |
| // List views have two i32 buffers, offsets and sizes |
| (DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS as _, |
| // Large list views have two i64 buffers, offsets and sizes |
| (DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) => i64::BITS as _, |
| (DataType::List(_), _) | (DataType::Map(_, _), _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| (DataType::Utf8, _) | (DataType::Binary, _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| // Variable-sized binaries: have two buffers. |
| // LargeUtf8: first buffer is i64, second is in bytes |
| (DataType::LargeUtf8, 1) | (DataType::LargeBinary, 1) | (DataType::LargeList(_), 1) => { |
| i64::BITS as _ |
| } |
| (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2) => { |
| u8::BITS as _ |
| } |
| (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) | (DataType::LargeList(_), _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| // Variable-sized views: have 3 or more buffers. |
| // Buffer 1 are the u128 views |
| // Buffers 2...N-1 are u8 byte buffers |
| (DataType::Utf8View, 1) | (DataType::BinaryView, 1) => u128::BITS as _, |
| (DataType::Utf8View, _) | (DataType::BinaryView, _) => u8::BITS as _, |
| // type ids. UnionArray doesn't have null bitmap so buffer index begins with 0. |
| (DataType::Union(_, _), 0) => i8::BITS as _, |
| // Only DenseUnion has 2nd buffer |
| (DataType::Union(_, UnionMode::Dense), 1) => i32::BITS as _, |
| (DataType::Union(_, UnionMode::Sparse), _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 1 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| (DataType::Union(_, UnionMode::Dense), _) => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" expects 2 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| (_, 0) => { |
| // We don't call this `bit_width` to compute buffer length for null buffer. If any types that don't have null buffer like |
| // UnionArray, they should be handled above. |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." |
| ))); |
| } |
| _ => { |
| return Err(ArrowError::CDataInterface(format!( |
| "The datatype \"{data_type}\" is still not supported in Rust implementation" |
| ))); |
| } |
| }) |
| } |
| |
| /// returns a new buffer corresponding to the index `i` of the FFI array. It may not exist (null pointer). |
| /// `bits` is the number of bits that the native type of this buffer has. |
| /// The size of the buffer will be `ceil(self.length * bits, 8)`. |
| /// # Panic |
| /// This function panics if `i` is larger or equal to `n_buffers`. |
| /// # Safety |
| /// This function assumes that `ceil(self.length * bits, 8)` is the size of the buffer |
| unsafe fn create_buffer( |
| owner: Arc<FFI_ArrowArray>, |
| array: &FFI_ArrowArray, |
| index: usize, |
| len: usize, |
| ) -> Option<Buffer> { |
| if array.num_buffers() == 0 { |
| return None; |
| } |
| NonNull::new(array.buffer(index) as _) |
| .map(|ptr| unsafe { Buffer::from_custom_allocation(ptr, len, owner) }) |
| } |
| |
| /// Export to the C Data Interface |
| pub fn to_ffi(data: &ArrayData) -> Result<(FFI_ArrowArray, FFI_ArrowSchema)> { |
| let array = FFI_ArrowArray::new(data); |
| let schema = FFI_ArrowSchema::try_from(data.data_type())?; |
| Ok((array, schema)) |
| } |
| |
| /// Import [ArrayData] from the C Data Interface |
| /// |
| /// # Safety |
| /// |
| /// This struct assumes that the incoming data agrees with the C data interface. |
| pub unsafe fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result<ArrayData> { |
| let dt = DataType::try_from(schema)?; |
| let array = Arc::new(array); |
| let tmp = ImportedArrowArray { |
| array: &array, |
| data_type: dt, |
| owner: &array, |
| }; |
| tmp.consume() |
| } |
| |
| /// Import [ArrayData] from the C Data Interface |
| /// |
| /// # Safety |
| /// |
| /// This struct assumes that the incoming data agrees with the C data interface. |
| pub unsafe fn from_ffi_and_data_type( |
| array: FFI_ArrowArray, |
| data_type: DataType, |
| ) -> Result<ArrayData> { |
| let array = Arc::new(array); |
| let tmp = ImportedArrowArray { |
| array: &array, |
| data_type, |
| owner: &array, |
| }; |
| tmp.consume() |
| } |
| |
| #[derive(Debug)] |
| struct ImportedArrowArray<'a> { |
| array: &'a FFI_ArrowArray, |
| data_type: DataType, |
| owner: &'a Arc<FFI_ArrowArray>, |
| } |
| |
| impl ImportedArrowArray<'_> { |
| fn consume(self) -> Result<ArrayData> { |
| let len = self.array.len(); |
| let offset = self.array.offset(); |
| let null_count = match &self.data_type { |
| DataType::Null => Some(0), |
| _ => self.array.null_count_opt(), |
| }; |
| |
| let data_layout = layout(&self.data_type); |
| let buffers = self.buffers(data_layout.can_contain_null_mask, data_layout.variadic)?; |
| |
| let null_bit_buffer = if data_layout.can_contain_null_mask { |
| self.null_bit_buffer() |
| } else { |
| None |
| }; |
| |
| let mut child_data = self.consume_children()?; |
| |
| if let Some(d) = self.dictionary()? { |
| // For dictionary type there should only be a single child, so we don't need to worry if |
| // there are other children added above. |
| assert!(child_data.is_empty()); |
| child_data.push(d.consume()?); |
| } |
| |
| // Should FFI be checking validity? |
| Ok(unsafe { |
| ArrayData::new_unchecked( |
| self.data_type, |
| len, |
| null_count, |
| null_bit_buffer, |
| offset, |
| buffers, |
| child_data, |
| ) |
| }) |
| } |
| |
| fn consume_children(&self) -> Result<Vec<ArrayData>> { |
| match &self.data_type { |
| DataType::List(field) |
| | DataType::FixedSizeList(field, _) |
| | DataType::LargeList(field) |
| | DataType::ListView(field) |
| | DataType::LargeListView(field) |
| | DataType::Map(field, _) => Ok([self.consume_child(0, field.data_type())?].to_vec()), |
| DataType::Struct(fields) => { |
| assert!(fields.len() == self.array.num_children()); |
| fields |
| .iter() |
| .enumerate() |
| .map(|(i, field)| self.consume_child(i, field.data_type())) |
| .collect::<Result<Vec<_>>>() |
| } |
| DataType::Union(union_fields, _) => { |
| assert!(union_fields.len() == self.array.num_children()); |
| union_fields |
| .iter() |
| .enumerate() |
| .map(|(i, (_, field))| self.consume_child(i, field.data_type())) |
| .collect::<Result<Vec<_>>>() |
| } |
| DataType::RunEndEncoded(run_ends_field, values_field) => Ok([ |
| self.consume_child(0, run_ends_field.data_type())?, |
| self.consume_child(1, values_field.data_type())?, |
| ] |
| .to_vec()), |
| _ => Ok(Vec::new()), |
| } |
| } |
| |
| fn consume_child(&self, index: usize, child_type: &DataType) -> Result<ArrayData> { |
| ImportedArrowArray { |
| array: self.array.child(index), |
| data_type: child_type.clone(), |
| owner: self.owner, |
| } |
| .consume() |
| } |
| |
| /// returns all buffers, as organized by Rust (i.e. null buffer is skipped if it's present |
| /// in the spec of the type) |
| fn buffers(&self, can_contain_null_mask: bool, variadic: bool) -> Result<Vec<Buffer>> { |
| // + 1: skip null buffer |
| let buffer_begin = can_contain_null_mask as usize; |
| let buffer_end = self.array.num_buffers() - usize::from(variadic); |
| |
| let variadic_buffer_lens = if variadic { |
| // Each views array has 1 (optional) null buffer, 1 views buffer, 1 lengths buffer. |
| // Rest are variadic. |
| let num_variadic_buffers = |
| self.array.num_buffers() - (2 + usize::from(can_contain_null_mask)); |
| if num_variadic_buffers == 0 { |
| &[] |
| } else { |
| let lengths = self.array.buffer(self.array.num_buffers() - 1); |
| // SAFETY: is lengths is non-null, then it must be valid for up to num_variadic_buffers. |
| unsafe { std::slice::from_raw_parts(lengths.cast::<i64>(), num_variadic_buffers) } |
| } |
| } else { |
| &[] |
| }; |
| |
| (buffer_begin..buffer_end) |
| .map(|index| { |
| let len = self.buffer_len(index, variadic_buffer_lens, &self.data_type)?; |
| match unsafe { create_buffer(self.owner.clone(), self.array, index, len) } { |
| Some(buf) => { |
| // External libraries may use a dangling pointer for a buffer with length 0. |
| // We respect the array length specified in the C Data Interface. Actually, |
| // if the length is incorrect, we cannot create a correct buffer even if |
| // the pointer is valid. |
| if buf.is_empty() { |
| Ok(MutableBuffer::new(0).into()) |
| } else { |
| Ok(buf) |
| } |
| } |
| None if len == 0 => { |
| // Null data buffer, which Rust doesn't allow. So create |
| // an empty buffer. |
| Ok(MutableBuffer::new(0).into()) |
| } |
| None => Err(ArrowError::CDataInterface(format!( |
| "The external buffer at position {index} is null." |
| ))), |
| } |
| }) |
| .collect() |
| } |
| |
| /// Returns the length, in bytes, of the buffer `i` (indexed according to the C data interface) |
| /// Rust implementation uses fixed-sized buffers, which require knowledge of their `len`. |
| /// for variable-sized buffers, such as the second buffer of a stringArray, we need |
| /// to fetch offset buffer's len to build the second buffer. |
| fn buffer_len( |
| &self, |
| i: usize, |
| variadic_buffer_lengths: &[i64], |
| dt: &DataType, |
| ) -> Result<usize> { |
| // Special handling for dictionary type as we only care about the key type in the case. |
| let data_type = match dt { |
| DataType::Dictionary(key_data_type, _) => key_data_type.as_ref(), |
| dt => dt, |
| }; |
| |
| // `ffi::ArrowArray` records array offset, we need to add it back to the |
| // buffer length to get the actual buffer length. |
| let length = self.array.len() + self.array.offset(); |
| |
| // Inner type is not important for buffer length. |
| Ok(match (&data_type, i) { |
| (DataType::Utf8, 1) |
| | (DataType::LargeUtf8, 1) |
| | (DataType::Binary, 1) |
| | (DataType::LargeBinary, 1) |
| | (DataType::List(_), 1) |
| | (DataType::LargeList(_), 1) |
| | (DataType::Map(_, _), 1) => { |
| // the len of the offset buffer (buffer 1) equals length + 1 |
| let bits = bit_width(data_type, i)?; |
| debug_assert_eq!(bits % 8, 0); |
| (length + 1) * (bits / 8) |
| } |
| (DataType::ListView(_), 1) |
| | (DataType::ListView(_), 2) |
| | (DataType::LargeListView(_), 1) |
| | (DataType::LargeListView(_), 2) => { |
| let bits = bit_width(data_type, i)?; |
| debug_assert_eq!(bits % 8, 0); |
| length * (bits / 8) |
| } |
| (DataType::Utf8, 2) | (DataType::Binary, 2) => { |
| if self.array.is_empty() { |
| return Ok(0); |
| } |
| |
| // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) |
| let len = self.buffer_len(1, variadic_buffer_lengths, dt)?; |
| // first buffer is the null buffer => add(1) |
| // we assume that pointer is aligned for `i32`, as Utf8 uses `i32` offsets. |
| #[allow(clippy::cast_ptr_alignment)] |
| let offset_buffer = self.array.buffer(1) as *const i32; |
| // get last offset |
| (unsafe { *offset_buffer.add(len / size_of::<i32>() - 1) }) as usize |
| } |
| (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) => { |
| if self.array.is_empty() { |
| return Ok(0); |
| } |
| |
| // the len of the data buffer (buffer 2) equals the last value of the offset buffer (buffer 1) |
| let len = self.buffer_len(1, variadic_buffer_lengths, dt)?; |
| // first buffer is the null buffer => add(1) |
| // we assume that pointer is aligned for `i64`, as Large uses `i64` offsets. |
| #[allow(clippy::cast_ptr_alignment)] |
| let offset_buffer = self.array.buffer(1) as *const i64; |
| // get last offset |
| (unsafe { *offset_buffer.add(len / size_of::<i64>() - 1) }) as usize |
| } |
| // View types: these have variadic buffers. |
| // Buffer 1 is the views buffer, which stores 1 u128 per length of the array. |
| // Buffers 2..N-1 are the buffers holding the byte data. Their lengths are variable. |
| // Buffer N is of length (N - 2) and stores i64 containing the lengths of buffers 2..N-1 |
| (DataType::Utf8View, 1) | (DataType::BinaryView, 1) => { |
| std::mem::size_of::<u128>() * length |
| } |
| (DataType::Utf8View, i) | (DataType::BinaryView, i) => { |
| variadic_buffer_lengths[i - 2] as usize |
| } |
| // buffer len of primitive types |
| _ => { |
| let bits = bit_width(data_type, i)?; |
| bit_util::ceil(length * bits, 8) |
| } |
| }) |
| } |
| |
| /// returns the null bit buffer. |
| /// Rust implementation uses a buffer that is not part of the array of buffers. |
| /// The C Data interface's null buffer is part of the array of buffers. |
| fn null_bit_buffer(&self) -> Option<Buffer> { |
| // similar to `self.buffer_len(0)`, but without `Result`. |
| // `ffi::ArrowArray` records array offset, we need to add it back to the |
| // buffer length to get the actual buffer length. |
| let length = self.array.len() + self.array.offset(); |
| let buffer_len = bit_util::ceil(length, 8); |
| |
| unsafe { create_buffer(self.owner.clone(), self.array, 0, buffer_len) } |
| } |
| |
| fn dictionary(&self) -> Result<Option<ImportedArrowArray<'_>>> { |
| match (self.array.dictionary(), &self.data_type) { |
| (Some(array), DataType::Dictionary(_, value_type)) => Ok(Some(ImportedArrowArray { |
| array, |
| data_type: value_type.as_ref().clone(), |
| owner: self.owner, |
| })), |
| (Some(_), _) => Err(ArrowError::CDataInterface( |
| "Got dictionary in FFI_ArrowArray for non-dictionary data type".to_string(), |
| )), |
| (None, DataType::Dictionary(_, _)) => Err(ArrowError::CDataInterface( |
| "Missing dictionary in FFI_ArrowArray for dictionary data type".to_string(), |
| )), |
| (_, _) => Ok(None), |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests_to_then_from_ffi { |
| use std::collections::HashMap; |
| use std::mem::ManuallyDrop; |
| |
| use arrow_buffer::{ArrowNativeType, NullBuffer}; |
| use arrow_schema::Field; |
| |
| use crate::builder::UnionBuilder; |
| use crate::cast::AsArray; |
| use crate::types::{Float64Type, Int8Type, Int32Type}; |
| use crate::*; |
| |
| use super::*; |
| |
| #[test] |
| fn test_round_trip() { |
| // create an array natively |
| let array = Int32Array::from(vec![1, 2, 3]); |
| |
| // export it |
| let (array, schema) = to_ffi(&array.into_data()).unwrap(); |
| |
| // (simulate consumer) import it |
| let array = Int32Array::from(unsafe { from_ffi(array, &schema) }.unwrap()); |
| |
| // verify |
| assert_eq!(array, Int32Array::from(vec![1, 2, 3])); |
| } |
| |
| #[test] |
| fn test_import() { |
| // Model receiving const pointers from an external system |
| |
| // Create an array natively |
| let data = Int32Array::from(vec![1, 2, 3]).into_data(); |
| let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); |
| let array = FFI_ArrowArray::new(&data); |
| |
| // Use ManuallyDrop to avoid Box:Drop recursing |
| let schema = Box::new(ManuallyDrop::new(schema)); |
| let array = Box::new(ManuallyDrop::new(array)); |
| |
| let schema_ptr = &**schema as *const _; |
| let array_ptr = &**array as *const _; |
| |
| // We can read them back to memory |
| // SAFETY: |
| // Pointers are aligned and valid |
| let data = |
| unsafe { from_ffi(std::ptr::read(array_ptr), &std::ptr::read(schema_ptr)).unwrap() }; |
| |
| let array = Int32Array::from(data); |
| assert_eq!(array, Int32Array::from(vec![1, 2, 3])); |
| } |
| |
| #[test] |
| fn test_round_trip_with_offset() -> Result<()> { |
| // create an array natively |
| let array = Int32Array::from(vec![Some(1), Some(2), None, Some(3), None]); |
| |
| let array = array.slice(1, 2); |
| |
| // export it |
| let (array, schema) = to_ffi(&array.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| let array = array.as_any().downcast_ref::<Int32Array>().unwrap(); |
| |
| assert_eq!(array, &Int32Array::from(vec![Some(2), None])); |
| |
| // (drop/release) |
| Ok(()) |
| } |
| |
| #[test] |
| #[cfg(not(feature = "force_validate"))] |
| fn test_decimal_round_trip() -> Result<()> { |
| // create an array natively |
| let original_array = [Some(12345_i128), Some(-12345_i128), None] |
| .into_iter() |
| .collect::<Decimal128Array>() |
| .with_precision_and_scale(6, 2) |
| .unwrap(); |
| |
| // export it |
| let (array, schema) = to_ffi(&original_array.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| |
| // perform some operation |
| let array = array.as_any().downcast_ref::<Decimal128Array>().unwrap(); |
| |
| // verify |
| assert_eq!(array, &original_array); |
| |
| // (drop/release) |
| Ok(()) |
| } |
| // case with nulls is tested in the docs, through the example on this module. |
| |
| #[test] |
| fn test_null_count_handling() { |
| let int32_data = ArrayData::builder(DataType::Int32) |
| .len(10) |
| .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])) |
| .null_bit_buffer(Some(Buffer::from([0b01011111, 0b00000001]))) |
| .build() |
| .unwrap(); |
| let mut ffi_array = FFI_ArrowArray::new(&int32_data); |
| assert_eq!(3, ffi_array.null_count()); |
| assert_eq!(Some(3), ffi_array.null_count_opt()); |
| // Simulating uninitialized state |
| unsafe { |
| ffi_array.set_null_count(-1); |
| } |
| assert_eq!(None, ffi_array.null_count_opt()); |
| let int32_data = unsafe { from_ffi_and_data_type(ffi_array, DataType::Int32) }.unwrap(); |
| assert_eq!(3, int32_data.null_count()); |
| |
| let null_data = &ArrayData::new_null(&DataType::Null, 10); |
| let mut ffi_array = FFI_ArrowArray::new(null_data); |
| assert_eq!(10, ffi_array.null_count()); |
| assert_eq!(Some(10), ffi_array.null_count_opt()); |
| // Simulating uninitialized state |
| unsafe { |
| ffi_array.set_null_count(-1); |
| } |
| assert_eq!(None, ffi_array.null_count_opt()); |
| let null_data = unsafe { from_ffi_and_data_type(ffi_array, DataType::Null) }.unwrap(); |
| assert_eq!(0, null_data.null_count()); |
| } |
| |
| fn test_generic_string<Offset: OffsetSizeTrait>() -> Result<()> { |
| // create an array natively |
| let array = GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]); |
| |
| // export it |
| let (array, schema) = to_ffi(&array.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| |
| // perform some operation |
| let array = array |
| .as_any() |
| .downcast_ref::<GenericStringArray<Offset>>() |
| .unwrap(); |
| |
| // verify |
| let expected = GenericStringArray::<Offset>::from(vec![Some("a"), None, Some("aaa")]); |
| assert_eq!(array, &expected); |
| |
| // (drop/release) |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_string() -> Result<()> { |
| test_generic_string::<i32>() |
| } |
| |
| #[test] |
| fn test_large_string() -> Result<()> { |
| test_generic_string::<i64>() |
| } |
| |
| fn test_generic_list<Offset: OffsetSizeTrait>() -> Result<()> { |
| // Construct a value array |
| let value_data = ArrayData::builder(DataType::Int32) |
| .len(8) |
| .add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7])) |
| .build() |
| .unwrap(); |
| |
| // Construct a buffer for value offsets, for the nested array: |
| // [[0, 1, 2], [3, 4, 5], [6, 7]] |
| let value_offsets = [0_usize, 3, 6, 8] |
| .iter() |
| .map(|i| Offset::from_usize(*i).unwrap()) |
| .collect::<Buffer>(); |
| |
| // Construct a list array from the above two |
| let list_data_type = GenericListArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new( |
| Field::new_list_field(DataType::Int32, false), |
| )); |
| |
| let list_data = ArrayData::builder(list_data_type) |
| .len(3) |
| .add_buffer(value_offsets) |
| .add_child_data(value_data) |
| .build() |
| .unwrap(); |
| |
| // create an array natively |
| let array = GenericListArray::<Offset>::from(list_data.clone()); |
| |
| // export it |
| let (array, schema) = to_ffi(&array.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| |
| // downcast |
| let array = array |
| .as_any() |
| .downcast_ref::<GenericListArray<Offset>>() |
| .unwrap(); |
| |
| // verify |
| let expected = GenericListArray::<Offset>::from(list_data); |
| assert_eq!(&array.value(0), &expected.value(0)); |
| assert_eq!(&array.value(1), &expected.value(1)); |
| assert_eq!(&array.value(2), &expected.value(2)); |
| |
| // (drop/release) |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_list() -> Result<()> { |
| test_generic_list::<i32>() |
| } |
| |
| #[test] |
| fn test_large_list() -> Result<()> { |
| test_generic_list::<i64>() |
| } |
| |
| fn test_generic_list_view<Offset: OffsetSizeTrait + ArrowNativeType>() -> Result<()> { |
| // Construct a value array |
| let value_data = ArrayData::builder(DataType::Int16) |
| .len(8) |
| .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7])) |
| .build() |
| .unwrap(); |
| |
| // Construct a buffer for value offsets, for the nested array: |
| // [[0, 1, 2], [3, 4, 5], [6, 7]] |
| let value_offsets = [0_usize, 3, 6] |
| .iter() |
| .map(|i| Offset::from_usize(*i).unwrap()) |
| .collect::<Buffer>(); |
| |
| let sizes_buffer = [3_usize, 3, 2] |
| .iter() |
| .map(|i| Offset::from_usize(*i).unwrap()) |
| .collect::<Buffer>(); |
| |
| // Construct a list array from the above two |
| let list_view_dt = GenericListViewArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new( |
| Field::new_list_field(DataType::Int16, false), |
| )); |
| |
| let list_data = ArrayData::builder(list_view_dt) |
| .len(3) |
| .add_buffer(value_offsets) |
| .add_buffer(sizes_buffer) |
| .add_child_data(value_data) |
| .build() |
| .unwrap(); |
| |
| let original = GenericListViewArray::<Offset>::from(list_data.clone()); |
| |
| // export it |
| let (array, schema) = to_ffi(&original.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| |
| // downcast |
| let array = array |
| .as_any() |
| .downcast_ref::<GenericListViewArray<Offset>>() |
| .unwrap(); |
| |
| assert_eq!(&array.value(0), &original.value(0)); |
| assert_eq!(&array.value(1), &original.value(1)); |
| assert_eq!(&array.value(2), &original.value(2)); |
| |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_list_view() -> Result<()> { |
| test_generic_list_view::<i32>() |
| } |
| |
| #[test] |
| fn test_large_list_view() -> Result<()> { |
| test_generic_list_view::<i64>() |
| } |
| |
| fn test_generic_binary<Offset: OffsetSizeTrait>() -> Result<()> { |
| // create an array natively |
| let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")]; |
| let array = GenericBinaryArray::<Offset>::from(array); |
| |
| // export it |
| let (array, schema) = to_ffi(&array.to_data())?; |
| |
| // (simulate consumer) import it |
| let data = unsafe { from_ffi(array, &schema) }?; |
| let array = make_array(data); |
| let array = array |
| .as_any() |
| .downcast_ref::<GenericBinaryArray<Offset>>() |
| .unwrap(); |
| |
| // verify |
| let expected: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")]; |
| let expected = GenericBinaryArray::<Offset>::from(expected); |
| assert_eq!(array, &expected); |
| |
| // (drop/release) |
| Ok(()) |
| } |
| |
| #[test] |
| fn test_binary() -> Result<()> { |
| test_generic_binary::<i32>() |
| } |
| |
| #[test] |
| fn test_large_binary() -> Result<()> { |
| test_generic_binary::<i64>() |
| } |
| |
/// Round-trips a nullable `BooleanArray` through the C Data Interface.
#[test]
fn test_bool() -> Result<()> {
    let original = BooleanArray::from(vec![None, Some(true), Some(false)]);

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported.as_any().downcast_ref::<BooleanArray>().unwrap();

    assert_eq!(imported, &original);

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Round-trips a nullable `Time32MillisecondArray` through the C Data Interface.
#[test]
fn test_time32() -> Result<()> {
    let original = Time32MillisecondArray::from(vec![None, Some(1), Some(2)]);

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<Time32MillisecondArray>()
        .unwrap();

    assert_eq!(imported, &original);

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Round-trips a nullable `TimestampMillisecondArray` through the C Data Interface.
#[test]
fn test_timestamp() -> Result<()> {
    let original = TimestampMillisecondArray::from(vec![None, Some(1), Some(2)]);

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<TimestampMillisecondArray>()
        .unwrap();

    assert_eq!(imported, &original);

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Round-trips a `FixedSizeBinaryArray` of width 3 (with null slots) through the
/// C Data Interface.
#[test]
fn test_fixed_size_binary_array() -> Result<()> {
    let slots = vec![
        None,
        Some(vec![10, 10, 10]),
        None,
        Some(vec![20, 20, 20]),
        Some(vec![30, 30, 30]),
        None,
    ];
    let original = FixedSizeBinaryArray::try_from_sparse_iter_with_size(slots.into_iter(), 3)?;

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<FixedSizeBinaryArray>()
        .unwrap();

    assert_eq!(imported, &original);

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Round-trips a three-slot `FixedSizeList<Int32, 3>` (only slot 2 valid) through
/// the C Data Interface and compares it with an independently built expectation.
#[test]
fn test_fixed_size_list_array() -> Result<()> {
    // Validity bitmap 0000 0100: only list slot 2 is valid.
    let mut validity_bits: [u8; 1] = [0; 1];
    bit_util::set_bit(&mut validity_bits, 2);

    let v: Vec<i32> = (0..9).collect();
    let value_data = ArrayData::builder(DataType::Int32)
        .len(9)
        .add_buffer(Buffer::from_slice_ref(&v))
        .build()?;

    let list_data_type =
        DataType::FixedSizeList(Arc::new(Field::new("f", DataType::Int32, false)), 3);
    let list_data = ArrayData::builder(list_data_type.clone())
        .len(3)
        .null_bit_buffer(Some(Buffer::from(validity_bits)))
        .add_child_data(value_data)
        .build()?;

    // export it
    let (array, schema) = to_ffi(&list_data)?;

    // (simulate consumer) import it
    // SAFETY: `array` and `schema` were produced together by `to_ffi` above.
    let data = unsafe { from_ffi(array, &schema) }?;
    let array = make_array(data);
    let array = array.as_any().downcast_ref::<FixedSizeListArray>().unwrap();

    // Build the expectation from the same inputs. The previous version also set
    // validity bit 5, which is outside the 3-slot array, and copied `v` into a
    // second vector for no reason.
    let expected_value_data = ArrayData::builder(DataType::Int32)
        .len(9)
        .add_buffer(Buffer::from_slice_ref(&v))
        .build()?;

    let expected_list_data = ArrayData::builder(list_data_type)
        .len(3)
        .null_bit_buffer(Some(Buffer::from(validity_bits)))
        .add_child_data(expected_value_data)
        .build()?;
    let expected_array = FixedSizeListArray::from(expected_list_data);

    // verify
    assert_eq!(array, &expected_array);

    // (drop/release)
    Ok(())
}
| |
/// Round-trips an `Int8`-keyed string dictionary array through the C Data Interface.
#[test]
fn test_dictionary() -> Result<()> {
    // Fixture factory: every call collects the same strings into a dictionary array.
    let build_fixture = || -> DictionaryArray<Int8Type> {
        vec!["a", "aaa", "aaa"].into_iter().collect()
    };

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&build_fixture().to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let actual = imported
        .as_any()
        .downcast_ref::<DictionaryArray<Int8Type>>()
        .unwrap();

    assert_eq!(actual, &build_fixture());

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Exports an `Int32Array` into caller-provided FFI structs via the deprecated
/// `export_array_into_raw`, then imports it back and checks the values.
#[test]
#[allow(deprecated)]
fn test_export_array_into_raw() -> Result<()> {
    let exported = make_array(Int32Array::from(vec![1, 2, 3]).into_data());

    // Stand-ins for the two structs a consumer would hand us by raw pointer.
    let mut out_array = FFI_ArrowArray::empty();
    let mut out_schema = FFI_ArrowSchema::empty();

    // SAFETY: both pointers come from live, exclusively owned locals declared above.
    unsafe {
        export_array_into_raw(
            exported,
            std::ptr::addr_of_mut!(out_array),
            std::ptr::addr_of_mut!(out_schema),
        )?;
    }

    // consumer side
    // SAFETY: presumably `export_array_into_raw` filled both structs consistently.
    let imported = make_array(unsafe { from_ffi(out_array, &out_schema) }?);
    let imported = imported.as_any().downcast_ref::<Int32Array>().unwrap();

    assert_eq!(imported, &Int32Array::from(vec![1, 2, 3]));
    Ok(())
}
| |
/// Round-trips a nullable `DurationSecondArray` through the C Data Interface.
#[test]
fn test_duration() -> Result<()> {
    let original = DurationSecondArray::from(vec![None, Some(1), Some(2)]);

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<DurationSecondArray>()
        .unwrap();

    assert_eq!(imported, &original);

    // the imported array is dropped (released) here
    Ok(())
}
| |
/// Round-trips a string-keyed `MapArray` with three entries through the
/// C Data Interface.
#[test]
fn test_map_array() -> Result<()> {
    let keys = vec!["a", "b", "c", "d", "e", "f", "g", "h"];
    let values = UInt32Array::from(vec![0u32, 10, 20, 30, 40, 50, 60, 70]);

    // Entry boundaries for the nested layout [[a, b, c], [d, e, f], [g, h]].
    let entry_offsets = [0, 3, 6, 8];

    let original = MapArray::new_from_strings(keys.into_iter(), &values, &entry_offsets).unwrap();

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported.as_any().downcast_ref::<MapArray>().unwrap();

    assert_eq!(imported, &original);

    Ok(())
}
| |
/// Round-trips a single-column `StructArray` whose field carries non-ASCII
/// metadata, checking both the data type and the contents after import.
#[test]
fn test_struct_array() -> Result<()> {
    let field_metadata = HashMap::from([("Hello".to_string(), "World! 😊".to_string())]);
    let field = Field::new("a", DataType::Int32, false).with_metadata(field_metadata);
    let column: Arc<dyn Array> = Arc::new(Int32Array::from(vec![2, 4, 6]));
    let original = StructArray::from(vec![(Arc::new(field), column)]);

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported.as_any().downcast_ref::<StructArray>().unwrap();

    // The data type comparison also covers the field metadata.
    assert_eq!(imported.data_type(), original.data_type());
    assert_eq!(imported, &original);

    Ok(())
}
| |
/// Round-trips a sparse union (`Int32` child "a", `Float64` child "c") through the
/// C Data Interface and checks type ids, the absence of offsets, and every slot.
#[test]
fn test_union_sparse_array() -> Result<()> {
    let mut builder = UnionBuilder::new_sparse();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append_null::<Int32Type>("a").unwrap();
    builder.append::<Float64Type>("c", 3.0).unwrap();
    builder.append::<Int32Type>("a", 4).unwrap();
    let original = builder.build().unwrap();

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported.as_any().downcast_ref::<UnionArray>().unwrap();

    // Type ids, checked both in bulk and per row.
    let expected_type_ids = vec![0_i8, 0, 1, 0];
    assert_eq!(*imported.type_ids(), expected_type_ids);
    for (row, expected_id) in expected_type_ids.iter().enumerate() {
        assert_eq!(expected_id, &imported.type_id(row));
    }

    // A sparse union has a single buffer, i.e. no offsets.
    assert!(imported.offsets().is_none());

    for row in 0..imported.len() {
        let slot = imported.value(row);
        match row {
            // Rows 0 and 3 are the non-null Int32 values 1 and 4.
            0 | 3 => {
                let ints = slot.as_primitive::<Int32Type>();
                assert!(!ints.is_null(0));
                assert_eq!(ints.len(), 1);
                let expected = if row == 0 { 1_i32 } else { 4_i32 };
                assert_eq!(expected, ints.value(0));
            }
            1 => assert!(slot.is_null(0)),
            2 => {
                let floats = slot.as_primitive::<Float64Type>();
                assert!(!floats.is_null(0));
                assert_eq!(floats.len(), 1);
                assert_eq!(floats.value(0), 3_f64);
            }
            _ => unreachable!(),
        }
    }

    Ok(())
}
| |
/// Round-trips a dense union (`Int32` child "a", `Float64` child "c") through the
/// C Data Interface and checks type ids, the presence of offsets, and every slot.
#[test]
fn test_union_dense_array() -> Result<()> {
    let mut builder = UnionBuilder::new_dense();
    builder.append::<Int32Type>("a", 1).unwrap();
    builder.append_null::<Int32Type>("a").unwrap();
    builder.append::<Float64Type>("c", 3.0).unwrap();
    builder.append::<Int32Type>("a", 4).unwrap();
    let original = builder.build().unwrap();

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = UnionArray::from(unsafe { from_ffi(ffi_array, &ffi_schema) }?);

    // Type ids, checked both in bulk and per row.
    let expected_type_ids = vec![0_i8, 0, 1, 0];
    assert_eq!(*imported.type_ids(), expected_type_ids);
    for (row, expected_id) in expected_type_ids.iter().enumerate() {
        assert_eq!(expected_id, &imported.type_id(row));
    }

    // Unlike the sparse layout, the dense layout must carry an offsets buffer.
    assert!(imported.offsets().is_some());

    for row in 0..imported.len() {
        let slot = imported.value(row);
        match row {
            // Rows 0 and 3 are the non-null Int32 values 1 and 4.
            0 | 3 => {
                let ints = slot.as_primitive::<Int32Type>();
                assert!(!ints.is_null(0));
                assert_eq!(ints.len(), 1);
                let expected = if row == 0 { 1_i32 } else { 4_i32 };
                assert_eq!(expected, ints.value(0));
            }
            1 => assert!(slot.is_null(0)),
            2 => {
                let floats = slot.as_primitive::<Float64Type>();
                assert!(!floats.is_null(0));
                assert_eq!(floats.len(), 1);
                assert_eq!(floats.value(0), 3_f64);
            }
            _ => unreachable!(),
        }
    }

    Ok(())
}
| |
/// Round-trips a run-end-encoded array (`Int32` run ends, `Int8` values) through
/// the C Data Interface and compares type, run ends and values.
#[test]
fn test_run_array() -> Result<()> {
    let values = PrimitiveArray::<Int8Type>::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
    let run_ends =
        PrimitiveArray::<Int32Type>::from_iter_values([4_i32, 6, 7, 9, 13, 18, 20, 22]);

    // Combine run ends and values into the run-end-encoded array under test.
    let original = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<RunArray<Int32Type>>()
        .unwrap();

    assert_eq!(imported.data_type(), original.data_type());
    assert_eq!(imported.run_ends().values(), original.run_ends().values());
    assert_eq!(imported.values(), original.values());

    Ok(())
}
| |
/// Round-trips a run-end-encoded array whose values child contains nulls through
/// the C Data Interface and compares type, run ends and values.
#[test]
fn test_nullable_run_array() -> Result<()> {
    let validity = NullBuffer::from(vec![true, false, true, true, false]);
    let values = PrimitiveArray::<Int8Type>::new(vec![1_i8, 2, 3, 4, 5].into(), Some(validity));
    let run_ends = PrimitiveArray::<Int32Type>::from_iter_values([5_i32, 6, 7, 8, 10]);

    // Combine run ends and values into the run-end-encoded array under test.
    let original = RunArray::<Int32Type>::try_new(&run_ends, &values).unwrap();

    // producer side
    let (ffi_array, ffi_schema) = to_ffi(&original.to_data())?;

    // consumer side
    // SAFETY: `ffi_array` and `ffi_schema` were produced together by `to_ffi` above.
    let imported = make_array(unsafe { from_ffi(ffi_array, &ffi_schema) }?);
    let imported = imported
        .as_any()
        .downcast_ref::<RunArray<Int32Type>>()
        .unwrap();

    assert_eq!(imported.data_type(), original.data_type());
    assert_eq!(imported.run_ends().values(), original.run_ends().values());
    assert_eq!(imported.values(), original.values());

    Ok(())
}
| } |
| |
| #[cfg(test)] |
| mod tests_from_ffi { |
| #[cfg(not(feature = "force_validate"))] |
| use std::ptr::NonNull; |
| use std::sync::Arc; |
| |
| use arrow_buffer::NullBuffer; |
| #[cfg(not(feature = "force_validate"))] |
| use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer}; |
| #[cfg(feature = "force_validate")] |
| use arrow_buffer::{bit_util, buffer::Buffer}; |
| |
| use arrow_data::ArrayData; |
| use arrow_data::transform::MutableArrayData; |
| use arrow_schema::{DataType, Field}; |
| |
| use super::Result; |
| |
| use crate::builder::GenericByteViewBuilder; |
| use crate::types::{BinaryViewType, ByteViewType, Int32Type, StringViewType}; |
| use crate::{ |
| ArrayRef, GenericByteViewArray, ListArray, |
| array::{ |
| Array, BooleanArray, DictionaryArray, FixedSizeBinaryArray, FixedSizeListArray, |
| Int32Array, Int64Array, StringArray, StructArray, UInt32Array, UInt64Array, |
| }, |
| ffi::{FFI_ArrowArray, FFI_ArrowSchema, from_ffi}, |
| make_array, |
| }; |
| |
| fn test_round_trip(expected: &ArrayData) -> Result<()> { |
| // here we export the array |
| let array = FFI_ArrowArray::new(expected); |
| let schema = FFI_ArrowSchema::try_from(expected.data_type())?; |
| |
| // simulate an external consumer by being the consumer |
| let result = &unsafe { from_ffi(array, &schema) }?; |
| |
| assert_eq!(result, expected); |
| Ok(()) |
| } |
| |
/// Round-trips a nullable `UInt32Array`.
#[test]
fn test_u32() -> Result<()> {
    let slots = vec![Some(2), None, Some(1), None];
    test_round_trip(&UInt32Array::from(slots).into_data())
}
| |
/// Round-trips a nullable `UInt64Array`.
#[test]
fn test_u64() -> Result<()> {
    let slots = vec![Some(2), None, Some(1), None];
    test_round_trip(&UInt64Array::from(slots).into_data())
}
| |
/// Round-trips a nullable `Int64Array`.
#[test]
fn test_i64() -> Result<()> {
    let slots = vec![Some(2), None, Some(1), None];
    test_round_trip(&Int64Array::from(slots).into_data())
}
| |
/// Round-trips a struct array that nests another struct array as its first column.
#[test]
fn test_struct() -> Result<()> {
    // Inner struct: { a1: Boolean, a2: UInt32 }
    let a1: ArrayRef = Arc::new(BooleanArray::from(vec![true, true, false, false]));
    let a2: ArrayRef = Arc::new(UInt32Array::from(vec![1, 2, 3, 4]));
    let inner = StructArray::from(vec![
        (Arc::new(Field::new("a1", DataType::Boolean, false)), a1),
        (Arc::new(Field::new("a2", DataType::UInt32, false)), a2),
    ]);

    // Outer struct: { a: <inner>, b: Boolean, c: UInt32 }
    let inner_type = inner.data_type().clone();
    let a: ArrayRef = Arc::new(inner);
    let b: ArrayRef = Arc::new(BooleanArray::from(vec![false, false, true, true]));
    let c: ArrayRef = Arc::new(UInt32Array::from(vec![42, 28, 19, 31]));
    let outer = StructArray::from(vec![
        (Arc::new(Field::new("a", inner_type, false)), a),
        (Arc::new(Field::new("b", DataType::Boolean, false)), b),
        (Arc::new(Field::new("c", DataType::UInt32, false)), c),
    ]);

    test_round_trip(&outer.into_data())
}
| |
/// Round-trips an `Int32`-keyed dictionary array whose keys and values both
/// contain nulls.
#[test]
fn test_dictionary() -> Result<()> {
    let dictionary_values = StringArray::from(vec![Some("foo"), Some("bar"), None]);

    let key_slots = vec![
        Some(0),
        Some(1),
        None,
        Some(1),
        Some(1),
        None,
        Some(1),
        Some(2),
        Some(1),
        None,
    ];
    let dictionary_keys = Int32Array::from(key_slots);

    let dictionary = DictionaryArray::new(dictionary_keys, Arc::new(dictionary_values));
    test_round_trip(&dictionary.into_data())
}
| |
/// Round-trips a `FixedSizeBinaryArray` built from three 3-byte values.
#[test]
fn test_fixed_size_binary() -> Result<()> {
    let rows = vec![vec![10, 10, 10], vec![20, 20, 20], vec![30, 30, 30]];
    let fixed = FixedSizeBinaryArray::try_from_iter(rows.into_iter())?;
    test_round_trip(&fixed.into_data())
}
| |
/// Round-trips a `FixedSizeBinaryArray` of width 3 that mixes null and non-null slots.
#[test]
fn test_fixed_size_binary_with_nulls() -> Result<()> {
    let slots = vec![
        None,
        Some(vec![10, 10, 10]),
        None,
        Some(vec![20, 20, 20]),
        Some(vec![30, 30, 30]),
        None,
    ];
    let fixed = FixedSizeBinaryArray::try_from_sparse_iter_with_size(slots.into_iter(), 3)?;
    test_round_trip(&fixed.into_data())
}
| |
/// Round-trips a `FixedSizeList<Int64, 3>` with three lists and no validity buffer.
#[test]
fn test_fixed_size_list() -> Result<()> {
    // Child values 0..9, i.e. three lists of three elements each.
    let child_values: Vec<i64> = (0..9).collect();
    let child = ArrayData::builder(DataType::Int64)
        .len(9)
        .add_buffer(Buffer::from_slice_ref(child_values))
        .build()?;

    let item_field = Arc::new(Field::new("f", DataType::Int64, false));
    let list_data = ArrayData::builder(DataType::FixedSizeList(item_field, 3))
        .len(3)
        .add_child_data(child)
        .build()?;

    // Go through the typed array so the test exercises its `into_data` output.
    let list = FixedSizeListArray::from(list_data);
    test_round_trip(&list.into_data())
}
| |
/// Round-trips a `FixedSizeList<Int16, 2>` of eight lists where only slots 1, 2
/// and 6 are valid.
#[test]
fn test_fixed_size_list_with_nulls() -> Result<()> {
    // Validity bitmap 0100 0110.
    let mut validity_bits: [u8; 1] = [0; 1];
    for valid_slot in [1, 2, 6] {
        bit_util::set_bit(&mut validity_bits, valid_slot);
    }

    let child_values: Vec<i16> = (0..16).collect();
    let child = ArrayData::builder(DataType::Int16)
        .len(16)
        .add_buffer(Buffer::from_slice_ref(child_values))
        .build()?;

    let item_field = Arc::new(Field::new("f", DataType::Int16, false));
    let list_data = ArrayData::builder(DataType::FixedSizeList(item_field, 2))
        .len(8)
        .null_bit_buffer(Some(Buffer::from(validity_bits)))
        .add_child_data(child)
        .build()?;

    // Go through the typed array so the test exercises its `into_data` output.
    let list = FixedSizeListArray::from(list_data);
    test_round_trip(&list.into_data())
}
| |
/// Round-trips a `FixedSizeList<List<Int32>, 2>` of four lists where only slot 2
/// is valid.
#[test]
fn test_fixed_size_list_nested() -> Result<()> {
    // Innermost values 0..16.
    let leaf_values: Vec<i32> = (0..16).collect();
    let leaf = ArrayData::builder(DataType::Int32)
        .len(16)
        .add_buffer(Buffer::from_slice_ref(leaf_values))
        .build()?;

    // Middle layer: eight variable-size lists of two elements each.
    let inner_offsets: Vec<i32> = vec![0, 2, 4, 6, 8, 10, 12, 14, 16];
    let inner_list_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, false)));
    let inner_lists = ArrayData::builder(inner_list_type.clone())
        .len(8)
        .add_buffer(Buffer::from_slice_ref(inner_offsets))
        .add_child_data(leaf)
        .build()?;

    // Validity bitmap 0000 0100.
    let mut validity_bits: [u8; 1] = [0; 1];
    bit_util::set_bit(&mut validity_bits, 2);

    // Outer layer: four fixed-size lists, each holding two inner lists.
    let outer_field = Arc::new(Field::new("f", inner_list_type, false));
    let outer_data = ArrayData::builder(DataType::FixedSizeList(outer_field, 2))
        .len(4)
        .null_bit_buffer(Some(Buffer::from(validity_bits)))
        .add_child_data(inner_lists)
        .build()?;

    // Go through the typed array so the test exercises its `into_data` output.
    let outer = FixedSizeListArray::from(outer_data);
    test_round_trip(&outer.into_data())
}
| |
/// Round-trips a `ListView<Int16>` describing `[[0, 1, 2], [3, 4, 5], [6, 7]]`.
#[test]
fn test_list_view() -> Result<()> {
    // Child values shared by all three views.
    let child = ArrayData::builder(DataType::Int16)
        .len(8)
        .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
        .build()
        .unwrap();

    // Each view is an (offset, size) pair into the child values.
    let view_offsets = Buffer::from(vec![0_i32, 3, 6]);
    let view_sizes = Buffer::from(vec![3_i32, 3, 2]);

    let item_field = Arc::new(Field::new_list_field(DataType::Int16, false));
    let list_view = ArrayData::builder(DataType::ListView(item_field))
        .len(3)
        .add_buffer(view_offsets)
        .add_buffer(view_sizes)
        .add_child_data(child)
        .build()
        .unwrap();

    test_round_trip(&list_view)
}
| |
/// Round-trips a `ListView<Int16>` describing `[[0, 1, 2], [3, 4, 5], [6, 7], null]`.
#[test]
fn test_list_view_with_nulls() -> Result<()> {
    // Child values shared by all views.
    let child = ArrayData::builder(DataType::Int16)
        .len(8)
        .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
        .build()
        .unwrap();

    // Each view is an (offset, size) pair; the trailing null slot is an empty view.
    let view_offsets = Buffer::from(vec![0_i32, 3, 6, 8]);
    let view_sizes = Buffer::from(vec![3_i32, 3, 2, 0]);
    let validity = NullBuffer::from(vec![true, true, true, false]);

    let item_field = Arc::new(Field::new_list_field(DataType::Int16, true));
    let list_view = ArrayData::builder(DataType::ListView(item_field))
        .len(4)
        .add_buffer(view_offsets)
        .add_buffer(view_sizes)
        .add_child_data(child)
        .nulls(Some(validity))
        .build()
        .unwrap();

    test_round_trip(&list_view)
}
| |
/// Imports an empty string array whose single offset is non-zero (123) and checks
/// the computed buffer lengths before round-tripping the consumed data.
#[test]
#[cfg(not(feature = "force_validate"))]
fn test_empty_string_with_non_zero_offset() -> Result<()> {
    use super::ImportedArrowArray;
    use arrow_buffer::{MutableBuffer, OffsetBuffer};

    // Mimic a producer that hands over an empty string array with a non-zero offset.
    let empty_values: Buffer = MutableBuffer::new(0).into();
    let lone_offset = OffsetBuffer::new(vec![123].into());
    // SAFETY: NOTE(review) the only offset points past the empty values buffer, but
    // with a single offset there are no string slots to read. Confirm.
    let exported =
        unsafe { StringArray::new_unchecked(lone_offset, empty_values, None) }.into_data();

    let ffi_array = Arc::new(FFI_ArrowArray::new(&exported));
    let ffi_schema = FFI_ArrowSchema::try_from(exported.data_type())?;

    let imported_array = ImportedArrowArray {
        array: &ffi_array,
        data_type: DataType::try_from(&ffi_schema)?,
        owner: &ffi_array,
    };

    // Buffer 1 holds the offsets, buffer 2 the string bytes.
    let offset_buf_len = imported_array.buffer_len(1, &[], &imported_array.data_type)?;
    let data_buf_len = imported_array.buffer_len(2, &[], &imported_array.data_type)?;

    // One 4-byte offset and no string data.
    assert_eq!(offset_buf_len, 4);
    assert_eq!(data_buf_len, 0);

    test_round_trip(&imported_array.consume()?)
}
| |
| fn roundtrip_string_array(array: StringArray) -> StringArray { |
| let data = array.into_data(); |
| |
| let array = FFI_ArrowArray::new(&data); |
| let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); |
| |
| let array = unsafe { from_ffi(array, &schema) }.unwrap(); |
| StringArray::from(array) |
| } |
| |
| fn roundtrip_byte_view_array<T: ByteViewType>( |
| array: GenericByteViewArray<T>, |
| ) -> GenericByteViewArray<T> { |
| let data = array.into_data(); |
| |
| let array = FFI_ArrowArray::new(&data); |
| let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); |
| |
| let array = unsafe { from_ffi(array, &schema) }.unwrap(); |
| GenericByteViewArray::<T>::from(array) |
| } |
| |
| fn extend_array(array: &dyn Array) -> ArrayRef { |
| let len = array.len(); |
| let data = array.to_data(); |
| |
| let mut mutable = MutableArrayData::new(vec![&data], false, len); |
| mutable.extend(0, 0, len); |
| make_array(mutable.freeze()) |
| } |
| |
/// Imports a 1000-element string array, and then its second half as a slice, and
/// checks that each imported array can be copied with `extend_array`.
#[test]
fn test_extend_imported_string_slice() {
    let strings: Vec<String> = (0..1000).map(|i| format!("string: {i}")).collect();
    let string_array = StringArray::from(strings);

    // Copying an imported array must reproduce it exactly.
    let assert_copy_matches = |imported: &StringArray| {
        let copied = extend_array(imported);
        assert_eq!(
            copied.as_any().downcast_ref::<StringArray>().unwrap(),
            imported
        );
    };

    // Full array.
    let imported = roundtrip_string_array(string_array.clone());
    assert_eq!(imported.len(), 1000);
    assert_eq!(imported.value(0), "string: 0");
    assert_eq!(imported.value(499), "string: 499");
    assert_copy_matches(&imported);

    // Second half only.
    let imported = roundtrip_string_array(string_array.slice(500, 500));
    assert_eq!(imported.len(), 500);
    assert_eq!(imported.value(0), "string: 500");
    assert_eq!(imported.value(499), "string: 999");
    assert_copy_matches(&imported);
}
| |
| fn roundtrip_list_array(array: ListArray) -> ListArray { |
| let data = array.into_data(); |
| |
| let array = FFI_ArrowArray::new(&data); |
| let schema = FFI_ArrowSchema::try_from(data.data_type()).unwrap(); |
| |
| let array = unsafe { from_ffi(array, &schema) }.unwrap(); |
| ListArray::from(array) |
| } |
| |
/// Imports the second half of a 1000-row list array and checks that the imported
/// slice equals the original slice and can be copied with `extend_array`.
#[test]
fn test_extend_imported_list_slice() {
    // 1000 lists of 100 values each; row `i` holds i * 1000 + 0..100.
    let rows: Vec<Option<Vec<Option<i32>>>> = (0..1000_i32)
        .map(|i| Some((0..100).map(move |j| Some(i * 1000 + j)).collect()))
        .collect();

    let list_array = ListArray::from_iter_primitive::<Int32Type, _, _>(rows);

    let second_half = list_array.slice(500, 500);
    let imported = roundtrip_list_array(second_half.clone());
    assert_eq!(imported.len(), 500);
    assert_eq!(&second_half, &imported);

    let copied = extend_array(&imported);
    assert_eq!(
        copied.as_any().downcast_ref::<ListArray>().unwrap(),
        &imported
    );
}
| |
/// Test helper that views a `&str` as the unsized native scalar of a byte-view
/// type, so one generic test can feed both `str` and `[u8]` values from string
/// literals.
trait NativeFromStr {
    /// Reinterprets `value` as a borrowed `Self` without copying.
    fn from_str(value: &str) -> &Self;
}

/// Identity conversion: a string is already a `str`.
impl NativeFromStr for str {
    fn from_str(value: &str) -> &str {
        value
    }
}

/// Exposes the UTF-8 bytes of the string.
impl NativeFromStr for [u8] {
    fn from_str(value: &str) -> &[u8] {
        str::as_bytes(value)
    }
}
| |
/// Round-trips an empty string-view array whose views buffer is a zero-length
/// buffer over a dangling pointer, and expects an ordinary empty array back.
#[test]
#[cfg(not(feature = "force_validate"))]
fn test_utf8_view_ffi_from_dangling_pointer() {
    let empty = GenericByteViewBuilder::<StringViewType>::new().finish();

    // SAFETY: the buffer has length 0, so presumably the dangling pointer is never
    // dereferenced; `owner` only stands in for the allocation owner.
    let dangling_views = unsafe {
        let owner = Arc::new(1);
        let raw = Buffer::from_custom_allocation(NonNull::<u8>::dangling(), 0, owner);
        ScalarBuffer::new_unchecked(raw)
    };

    // SAFETY: there are no views, so nothing can refer to an invalid data buffer.
    let str_view: GenericByteViewArray<StringViewType> = unsafe {
        GenericByteViewArray::new_unchecked(
            dangling_views,
            empty.data_buffers().to_vec(),
            empty.nulls().cloned(),
        )
    };

    let imported = roundtrip_byte_view_array(str_view);
    assert_eq!(imported.len(), 0);
    assert_eq!(&imported, &empty);
}
| |
/// Round-trips string-view and binary-view arrays with zero, one and two variadic
/// data buffers, and checks each imported array can be copied with `extend_array`.
#[test]
fn test_round_trip_byte_view() {
    /// Round-trips `array`, then checks the length and that a copy equals the import.
    fn assert_round_trip<V: ByteViewType>(array: GenericByteViewArray<V>) {
        let expected_len = array.len();
        let imported = roundtrip_byte_view_array(array);
        assert_eq!(imported.len(), expected_len);

        let copied = extend_array(&imported);
        assert_eq!(
            copied
                .as_any()
                .downcast_ref::<GenericByteViewArray<V>>()
                .unwrap(),
            &imported
        );
    }

    /// Adds `text` as its own data block plus one view spanning the whole block.
    fn append_block_view<V: ByteViewType>(builder: &mut GenericByteViewBuilder<V>, text: &str) {
        let block_id = builder.append_block(Buffer::from(text.as_bytes()));
        let view_len = u32::try_from(text.len()).unwrap();
        builder.try_append_view(block_id, 0, view_len).unwrap();
    }

    /// Runs all scenarios for one view type.
    fn test_case<T>()
    where
        T: ByteViewType,
        T::Native: NativeFromStr,
    {
        // Empty array.
        assert_round_trip(GenericByteViewBuilder::<T>::new().finish());

        // Only inlined values, so no data buffers at all.
        let mut builder = GenericByteViewBuilder::<T>::new();
        for text in ["inlined1", "inlined2", "inlined3"] {
            builder.append_value(T::Native::from_str(text));
        }
        let all_inlined = builder.finish();
        assert_eq!(all_inlined.data_buffers().len(), 0);
        assert_round_trip(all_inlined);

        // One inlined value plus one non-inlined value: 1 variadic buffer.
        let mut builder = GenericByteViewBuilder::<T>::new();
        builder.append_value(T::Native::from_str("inlined"));
        append_block_view(&mut builder, "non-inlined-string-buffer");
        let mixed_one_variadic = builder.finish();
        assert_eq!(mixed_one_variadic.data_buffers().len(), 1);
        assert_round_trip(mixed_one_variadic);

        // One inlined value plus two non-inlined values: 2 variadic buffers.
        let mut builder = GenericByteViewBuilder::<T>::new();
        builder.append_value(T::Native::from_str("inlined"));
        append_block_view(&mut builder, "non-inlined-string-buffer");
        append_block_view(&mut builder, "another-non-inlined-string-buffer");
        let mixed_two_variadic = builder.finish();
        assert_eq!(mixed_two_variadic.data_buffers().len(), 2);
        assert_round_trip(mixed_two_variadic);
    }

    test_case::<StringViewType>();
    test_case::<BinaryViewType>();
}
| } |