// blob: 7ed4bc5826c08dccf6c44357e173c8fbf1818e5c [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::ArrayBuilder;
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, ScalarBuffer};
use arrow_data::ArrayDataBuilder;
use arrow_schema::ArrowError;
use std::any::Any;
use std::sync::Arc;
/// Builder for [`GenericByteArray`]
///
/// For building strings, see docs on [`GenericStringBuilder`].
/// For building binary, see docs on [`GenericBinaryBuilder`].
pub struct GenericByteBuilder<T: ByteArrayType> {
/// Bytes of all appended values, stored back to back.
value_builder: Vec<u8>,
/// Offsets into `value_builder`: a leading start offset followed by one
/// end offset per appended slot (value or null).
offsets_builder: Vec<T::Offset>,
/// Validity tracking; receives `true` for each appended value and `false`
/// for each appended null. Its length is the builder's length.
null_buffer_builder: NullBufferBuilder,
}
impl<T: ByteArrayType> GenericByteBuilder<T> {
/// Creates a new, empty [`GenericByteBuilder`].
///
/// Room for 1024 items and 1024 bytes of value data is pre-allocated; use
/// [`Self::with_capacity`] to choose other sizes.
pub fn new() -> Self {
    const DEFAULT_ITEM_CAPACITY: usize = 1024;
    const DEFAULT_DATA_CAPACITY: usize = 1024;
    Self::with_capacity(DEFAULT_ITEM_CAPACITY, DEFAULT_DATA_CAPACITY)
}
/// Creates a new [`GenericByteBuilder`] with pre-allocated buffers.
///
/// - `item_capacity` is the number of items to pre-allocate.
///   The size of the preallocated buffer of offsets is the number of items plus one.
/// - `data_capacity` is the total number of bytes of data to pre-allocate
///   (for all items, not per item).
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
    // The offsets buffer always begins with the start offset `0`, which is
    // why it needs one slot more than the number of items.
    let mut offsets = Vec::with_capacity(item_capacity + 1);
    offsets.push(T::Offset::from_usize(0).unwrap());

    let values = Vec::with_capacity(data_capacity);
    let nulls = NullBufferBuilder::new(item_capacity);

    Self {
        value_builder: values,
        offsets_builder: offsets,
        null_buffer_builder: nulls,
    }
}
/// Creates a new [`GenericByteBuilder`] that continues from existing buffers.
///
/// The number of items is taken to be the number of offsets minus one.
/// When `null_buffer` is `None`, a null builder of that length is created
/// without an explicit bitmap.
///
/// # Safety
///
/// This doesn't verify buffer contents as it assumes the buffers are from
/// existing and valid [`GenericByteArray`].
pub unsafe fn new_from_buffer(
    offsets_buffer: MutableBuffer,
    value_buffer: MutableBuffer,
    null_buffer: Option<MutableBuffer>,
) -> Self {
    let offsets: Vec<T::Offset> = ScalarBuffer::<T::Offset>::from(offsets_buffer).into();
    let values: Vec<u8> = ScalarBuffer::<u8>::from(value_buffer).into();

    // There is always one more offset than there are items.
    let item_count = offsets.len() - 1;
    let nulls = match null_buffer {
        Some(bitmap) => NullBufferBuilder::new_from_buffer(bitmap, item_count),
        None => NullBufferBuilder::new_with_len(item_count),
    };

    Self {
        offsets_builder: offsets,
        value_builder: values,
        null_buffer_builder: nulls,
    }
}
/// Returns the current length of the values buffer as a `T::Offset`, i.e.
/// the offset at which the next appended value would end/start.
///
/// # Panics
///
/// Panics with "byte array offset overflow" if that length cannot be
/// represented as `T::Offset`.
#[inline]
fn next_offset(&self) -> T::Offset {
    let values_len = self.value_builder.len();
    T::Offset::from_usize(values_len).expect("byte array offset overflow")
}
/// Appends a value into the builder.
///
/// The value's bytes are added to the values buffer, the slot is marked
/// valid, and a new end offset is recorded. Any bytes previously written
/// directly into the values buffer become part of this value.
///
/// See the [GenericStringBuilder] documentation for examples of
/// incrementally building string values with multiple `write!` calls.
///
/// # Panics
///
/// Panics if the resulting length of [`Self::values_slice`] would exceed
/// `T::Offset::MAX` bytes.
///
/// For example, this can happen with [`StringArray`] or [`BinaryArray`]
/// where the total length of all values exceeds 2GB
///
/// [`StringArray`]: crate::StringArray
/// [`BinaryArray`]: crate::BinaryArray
#[inline]
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
    let bytes: &[u8] = value.as_ref().as_ref();
    self.value_builder.extend_from_slice(bytes);
    self.null_buffer_builder.append(true);
    // The end offset is computed after the bytes are in place.
    let end = self.next_offset();
    self.offsets_builder.push(end);
}
/// Append an `Option` value into the builder.
///
/// - `Some(v)` appends `v` as a value.
/// - `None` appends a null.
///
/// See [`Self::append_value`] for more panic information.
#[inline]
pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
    if let Some(v) = value {
        self.append_value(v);
    } else {
        self.append_null();
    }
}
/// Append a null value into the builder.
///
/// The slot is marked invalid and an end offset equal to the current
/// length of the values buffer is recorded.
#[inline]
pub fn append_null(&mut self) {
    self.null_buffer_builder.append(false);
    let end = self.next_offset();
    self.offsets_builder.push(end);
}
/// Appends `n` `null`s into the builder.
///
/// All `n` slots share the same end offset: the current length of the
/// values buffer.
#[inline]
pub fn append_nulls(&mut self, n: usize) {
    self.null_buffer_builder.append_n_nulls(n);
    let shared_end = self.next_offset();
    let repeated = std::iter::repeat_n(shared_end, n);
    self.offsets_builder.extend(repeated);
}
/// Appends array values and null to this builder as is
/// (this means that underlying null values are copied as is).
///
/// # Errors
///
/// Returns [`ArrowError::OffsetOverflowError`] when shifting the array's
/// offsets onto the end of this builder would overflow `T::Offset`. The
/// check happens before anything is appended.
#[inline]
pub fn append_array(&mut self, array: &GenericByteArray<T>) -> Result<(), ArrowError> {
    use num_traits::CheckedAdd;

    // Nothing to copy for an empty array.
    if array.len() == 0 {
        return Ok(());
    }

    let offsets = array.offsets();
    let first = offsets[0];
    let start = self.next_offset();

    if start == first {
        // The incoming offsets continue exactly where this builder ends
        // (e.g. the first appended array is unsliced and starts at 0), so
        // they can be copied verbatim without realignment.
        self.offsets_builder.extend_from_slice(&offsets[1..]);
    } else {
        // Every incoming offset has to be moved by the same distance.
        let shift: T::Offset = start - first;
        let last = offsets[offsets.len() - 1];

        if shift.checked_add(&last).is_none() {
            return Err(ArrowError::OffsetOverflowError(
                shift.as_usize() + last.as_usize(),
            ));
        }

        let shifted = offsets[1..].iter().map(|&offset| offset + shift);
        self.offsets_builder.extend(shifted);
    }

    // Copy only the bytes the array actually references: from its first
    // offset up to its last one.
    let values_start = first.as_usize();
    let values_end = offsets[array.len()].as_usize();
    self.value_builder
        .extend_from_slice(&array.values().as_slice()[values_start..values_end]);

    match array.nulls() {
        Some(validity) => {
            self.null_buffer_builder.append_buffer(validity);
        }
        None => {
            self.null_buffer_builder.append_n_non_nulls(array.len());
        }
    }

    Ok(())
}
/// Builds the [`GenericByteArray`] and reset this builder.
pub fn finish(&mut self) -> GenericByteArray<T> {
let array_type = T::DATA_TYPE;
let array_builder = ArrayDataBuilder::new(array_type)
.len(self.len())
.add_buffer(std::mem::take(&mut self.offsets_builder).into())
.add_buffer(std::mem::take(&mut self.value_builder).into())
.nulls(self.null_buffer_builder.finish());
self.offsets_builder.push(self.next_offset());
let array_data = unsafe { array_builder.build_unchecked() };
GenericByteArray::from(array_data)
}
/// Builds the [`GenericByteArray`] without resetting the builder.
///
/// The current offsets, values and validity are copied into new buffers,
/// so the builder can keep being appended to afterwards.
pub fn finish_cloned(&self) -> GenericByteArray<T> {
    let data_builder = ArrayDataBuilder::new(T::DATA_TYPE)
        .len(self.len())
        .add_buffer(Buffer::from_slice_ref(self.offsets_builder.as_slice()))
        .add_buffer(Buffer::from_slice_ref(self.value_builder.as_slice()))
        .nulls(self.null_buffer_builder.finish_cloned());

    // Validation is skipped here; the buffers are copies of the ones this
    // builder accumulated.
    let data = unsafe { data_builder.build_unchecked() };
    GenericByteArray::from(data)
}
/// Returns the bytes accumulated so far in the values buffer
pub fn values_slice(&self) -> &[u8] {
    &self.value_builder
}
/// Returns the offsets accumulated so far, including the leading start offset
pub fn offsets_slice(&self) -> &[T::Offset] {
    &self.offsets_builder[..]
}
/// Returns the current null buffer as a slice
///
/// Delegates to the underlying [`NullBufferBuilder`]; the result is whatever
/// its `as_slice` reports (presumably `None` while no validity bitmap has
/// been materialized -- confirm against `NullBufferBuilder`).
pub fn validity_slice(&self) -> Option<&[u8]> {
self.null_buffer_builder.as_slice()
}
/// Returns the current null buffer as a mutable slice
///
/// Delegates to the underlying [`NullBufferBuilder`]; the result is whatever
/// its `as_slice_mut` reports (presumably `None` while no validity bitmap has
/// been materialized -- confirm against `NullBufferBuilder`).
pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
self.null_buffer_builder.as_slice_mut()
}
}
impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
    /// Writes `<offset prefix><type prefix>Builder` followed by the three
    /// internal builders as a struct-style listing.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            value_builder,
            offsets_builder,
            null_buffer_builder,
        } = self;

        // The type name is written by hand, so the struct name is left empty.
        write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;

        let mut out = f.debug_struct("");
        out.field("value_builder", value_builder);
        out.field("offsets_builder", offsets_builder);
        out.field("null_buffer_builder", null_buffer_builder);
        out.finish()
    }
}
impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
/// Equivalent to [`GenericByteBuilder::new`].
fn default() -> Self {
Self::new()
}
}
impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
/// Returns the number of binary slots in the builder
fn len(&self) -> usize {
self.null_buffer_builder.len()
}
/// Builds the array and reset this builder.
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
/// Builds the array without resetting the builder.
fn finish_cloned(&self) -> ArrayRef {
Arc::new(self.finish_cloned())
}
/// Returns the builder as a non-mutable `Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
/// Returns the builder as a mutable `Any` reference.
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
/// Returns the boxed builder as a box of `Any`.
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
}
impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
    /// Appends every item of `iter` in order: `Some` as a value, `None` as a null.
    #[inline]
    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
        iter.into_iter().for_each(|item| self.append_option(item));
    }
}
/// Array builder for [`GenericStringArray`][crate::GenericStringArray]
///
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
/// [`GenericByteBuilder::append_null`].
///
/// This builder also implements [`std::fmt::Write`] with any written data
/// included in the next appended value. This allows using [`std::fmt::Display`]
/// with standard Rust idioms like `write!` and `writeln!` to write data
/// directly to the builder without intermediate allocations.
///
/// Writing through [`std::fmt::Write`] only extends the values buffer: no new
/// slot is created until one of the `append_*` methods is called.
///
/// # Example writing strings with `append_value`
/// ```
/// # use arrow_array::builder::GenericStringBuilder;
/// let mut builder = GenericStringBuilder::<i32>::new();
///
/// // Write one string value
/// builder.append_value("foobarbaz");
///
/// // Write a second string
/// builder.append_value("v2");
///
/// let array = builder.finish();
/// assert_eq!(array.value(0), "foobarbaz");
/// assert_eq!(array.value(1), "v2");
/// ```
///
/// # Example incrementally writing strings with `std::fmt::Write`
///
/// ```
/// # use std::fmt::Write;
/// # use arrow_array::builder::GenericStringBuilder;
/// let mut builder = GenericStringBuilder::<i32>::new();
///
/// // Write data in multiple `write!` calls
/// write!(builder, "foo").unwrap();
/// write!(builder, "bar").unwrap();
/// // The next call to append_value finishes the current string
/// // including all previously written strings.
/// builder.append_value("baz");
///
/// // Write second value with a single write call
/// write!(builder, "v2").unwrap();
/// // finish the value by calling append_value with an empty string
/// builder.append_value("");
///
/// let array = builder.finish();
/// assert_eq!(array.value(0), "foobarbaz");
/// assert_eq!(array.value(1), "v2");
/// ```
pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
    /// Adds the bytes of `s` to the values buffer without starting a new slot;
    /// always succeeds.
    fn write_str(&mut self, s: &str) -> std::fmt::Result {
        let bytes = s.as_bytes();
        self.value_builder.extend_from_slice(bytes);
        Ok(())
    }
}
/// Assumed average size, in bytes, of one string when pre-allocating a [`GenericStringBuilder`]
///
/// Creating a [`GenericStringBuilder`] with a capacity requires two values:
/// - `item_capacity` - the row count
/// - `data_capacity` - total string byte count
///
/// When only a row count is available, `AVERAGE_STRING_LENGTH * row_count` is
/// used for `data_capacity`.
///
/// These capacities are preallocation hints used to improve performance,
/// but consequences of passing a hint too large or too small should be negligible.
const AVERAGE_STRING_LENGTH: usize = 16;
/// Trait for string-like array builders
///
/// This trait provides a unified interface for builders that append string-like data
/// such as [`GenericStringBuilder<O>`] and [`crate::builder::StringViewBuilder`]
pub trait StringLikeArrayBuilder: ArrayBuilder {
/// Returns a human-readable type name for the builder.
fn type_name() -> &'static str;
/// Creates a new builder with the given row capacity.
///
/// Only the number of rows is supplied; how much value storage to reserve
/// is left to the implementation.
fn with_capacity(capacity: usize) -> Self;
/// Appends a non-null string value to the builder.
fn append_value(&mut self, value: &str);
/// Appends a null value to the builder.
fn append_null(&mut self);
}
impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> {
fn type_name() -> &'static str {
std::any::type_name::<Self>()
}
fn with_capacity(capacity: usize) -> Self {
Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH)
}
fn append_value(&mut self, value: &str) {
Self::append_value(self, value);
}
fn append_null(&mut self) {
Self::append_null(self);
}
}
/// Assumed average size, in bytes, of one binary value when pre-allocating a [`GenericBinaryBuilder`]
///
/// Creating a [`GenericBinaryBuilder`] with a capacity requires two values:
/// - `item_capacity` - the row count
/// - `data_capacity` - total binary byte count
///
/// When only a row count is available, `AVERAGE_BINARY_LENGTH * row_count` is
/// used for `data_capacity`.
///
/// These capacities are preallocation hints used to improve performance,
/// but consequences of passing a hint too large or too small should be negligible.
const AVERAGE_BINARY_LENGTH: usize = 128;
/// Trait for binary-like array builders
///
/// This trait provides a unified interface for builders that append binary-like data
/// such as [`GenericBinaryBuilder<O>`] and [`crate::builder::BinaryViewBuilder`]
pub trait BinaryLikeArrayBuilder: ArrayBuilder {
/// Returns a human-readable type name for the builder.
fn type_name() -> &'static str;
/// Creates a new builder with the given row capacity.
///
/// Only the number of rows is supplied; how much value storage to reserve
/// is left to the implementation.
fn with_capacity(capacity: usize) -> Self;
/// Appends a non-null binary value to the builder.
fn append_value(&mut self, value: &[u8]);
/// Appends a null value to the builder.
fn append_null(&mut self);
}
impl<O: OffsetSizeTrait> BinaryLikeArrayBuilder for GenericBinaryBuilder<O> {
fn type_name() -> &'static str {
std::any::type_name::<Self>()
}
fn with_capacity(capacity: usize) -> Self {
Self::with_capacity(capacity, capacity * AVERAGE_BINARY_LENGTH)
}
fn append_value(&mut self, value: &[u8]) {
Self::append_value(self, value);
}
fn append_null(&mut self) {
Self::append_null(self);
}
}
/// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
///
/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
/// [`GenericByteBuilder::append_null`].
///
/// This builder also implements [`std::io::Write`]; written bytes only extend the
/// values buffer and become part of the next appended value.
///
/// # Example
/// ```
/// # use arrow_array::builder::GenericBinaryBuilder;
/// let mut builder = GenericBinaryBuilder::<i32>::new();
///
/// // Write data
/// builder.append_value("foo");
///
/// // Write second value
/// builder.append_value(&[0,1,2]);
///
/// let array = builder.finish();
/// // binary values
/// assert_eq!(array.value(0), b"foo");
/// assert_eq!(array.value(1), b"\x00\x01\x02");
/// ```
///
/// # Example incrementally writing bytes with `std::io::Write`
///
/// ```
/// # use std::io::Write;
/// # use arrow_array::builder::GenericBinaryBuilder;
/// let mut builder = GenericBinaryBuilder::<i32>::new();
///
/// // Write data in multiple `write!` calls
/// write!(builder, "foo").unwrap();
/// write!(builder, "bar").unwrap();
/// // The next call to append_value finishes the current value
/// // including all previously written bytes.
/// builder.append_value("baz");
///
/// // Write second value with a single write call
/// write!(builder, "v2").unwrap();
/// // finish the value by calling append_value with an empty value
/// builder.append_value("");
///
/// let array = builder.finish();
/// assert_eq!(array.value(0), "foobarbaz".as_bytes());
/// assert_eq!(array.value(1), "v2".as_bytes());
/// ```
pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
    /// Adds all of `bs` to the values buffer without starting a new slot and
    /// reports every byte as written.
    fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
        let written = bs.len();
        self.value_builder.extend_from_slice(bs);
        Ok(written)
    }

    /// Nothing is buffered separately, so flushing is a no-op.
    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::GenericStringArray;
use crate::array::Array;
use arrow_buffer::NullBuffer;
use std::fmt::Write as _;
use std::io::Write as _;
/// Shared check: appends two values, a null and another value, then verifies
/// length, null count, contents, and selected offsets/lengths.
fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
    let mut bin_builder = GenericBinaryBuilder::<O>::new();
    bin_builder.append_value(b"hello");
    bin_builder.append_value(b"");
    bin_builder.append_null();
    bin_builder.append_value(b"rust");

    let built = bin_builder.finish();

    assert_eq!(4, built.len());
    assert_eq!(1, built.null_count());

    assert_eq!(b"hello", built.value(0));
    assert_eq!([] as [u8; 0], built.value(1));
    assert!(built.is_null(2));
    assert_eq!(b"rust", built.value(3));

    assert_eq!(O::from_usize(5).unwrap(), built.value_offsets()[2]);
    assert_eq!(O::from_usize(4).unwrap(), built.value_length(3));
}
// Runs the shared binary builder checks with `i32` offsets.
#[test]
fn test_binary_builder() {
_test_generic_binary_builder::<i32>()
}
// Runs the shared binary builder checks with `i64` offsets.
#[test]
fn test_large_binary_builder() {
_test_generic_binary_builder::<i64>()
}
/// Shared check: a builder that only ever receives nulls (three single
/// appends plus one bulk append of two) yields an all-null array of length 5.
fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
    let mut bin_builder = GenericBinaryBuilder::<O>::new();
    for _ in 0..3 {
        bin_builder.append_null();
    }
    bin_builder.append_nulls(2);

    assert_eq!(5, bin_builder.len());
    assert!(!bin_builder.is_empty());

    let built = bin_builder.finish();
    assert_eq!(5, built.null_count());
    assert_eq!(5, built.len());
    for slot in 0..5 {
        assert!(built.is_null(slot));
    }
}
// Runs the shared all-nulls checks with `i32` offsets.
#[test]
fn test_binary_builder_all_nulls() {
_test_generic_binary_builder_all_nulls::<i32>()
}
// Runs the shared all-nulls checks with `i64` offsets.
#[test]
fn test_large_binary_builder_all_nulls() {
_test_generic_binary_builder_all_nulls::<i64>()
}
/// Shared check: after `finish` the builder is empty and can be reused; the
/// second array must contain only what was appended after the reset.
fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
    let mut builder = GenericBinaryBuilder::<O>::new();

    // First round: contents are discarded, only the reset matters.
    builder.append_value(b"hello");
    builder.append_value(b"");
    builder.append_null();
    builder.append_value(b"rust");
    builder.finish();

    assert!(builder.is_empty());

    // Second round: slots 0..=6 are parquet, null, arrow, "", null, null, hi.
    builder.append_value(b"parquet");
    builder.append_null();
    builder.append_value(b"arrow");
    builder.append_value(b"");
    builder.append_nulls(2);
    builder.append_value(b"hi");
    let array = builder.finish();

    assert_eq!(7, array.len());
    assert_eq!(3, array.null_count());
    assert_eq!(b"parquet", array.value(0));
    assert!(array.is_null(1));
    assert!(array.is_null(4));
    assert!(array.is_null(5));
    assert_eq!(b"arrow", array.value(2));
    // The appended empty value lives at index 3 (index 1 is a null slot and
    // is already covered by the `is_null` check above).
    assert!(array.is_valid(3));
    assert_eq!(b"", array.value(3));
    assert_eq!(b"hi", array.value(6));

    assert_eq!(O::zero(), array.value_offsets()[0]);
    assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
    assert_eq!(O::from_usize(14).unwrap(), array.value_offsets()[7]);
    assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
}
// Runs the shared reset checks with `i32` offsets.
#[test]
fn test_binary_builder_reset() {
_test_generic_binary_builder_reset::<i32>()
}
// Runs the shared reset checks with `i64` offsets.
#[test]
fn test_large_binary_builder_reset() {
_test_generic_binary_builder_reset::<i64>()
}
/// Shared check: exercises every append flavour (`&str`, `&String`, null,
/// `Option`, bulk nulls) and compares the result with an array built from a `Vec`.
fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
    let mut str_builder = GenericStringBuilder::<O>::new();
    let owned_value = "arrow".to_owned();

    str_builder.append_value("hello");
    str_builder.append_value("");
    str_builder.append_value(&owned_value);
    str_builder.append_null();
    str_builder.append_option(Some("rust"));
    str_builder.append_option(None::<&str>);
    str_builder.append_option(None::<String>);
    str_builder.append_nulls(2);
    str_builder.append_value("parquet");

    assert_eq!(10, str_builder.len());

    let expected = GenericStringArray::<O>::from(vec![
        Some("hello"),
        Some(""),
        Some("arrow"),
        None,
        Some("rust"),
        None,
        None,
        None,
        None,
        Some("parquet"),
    ]);
    let actual = str_builder.finish();
    assert_eq!(expected, actual);
}
// Runs the shared string builder checks with `i32` offsets.
#[test]
fn test_string_array_builder() {
_test_generic_string_array_builder::<i32>()
}
// Runs the shared string builder checks with `i64` offsets.
#[test]
fn test_large_string_array_builder() {
_test_generic_string_array_builder::<i64>()
}
/// Shared check: `finish` leaves the builder empty with a single zero
/// offset, and a follow-up array without nulls carries no null buffer.
fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
    let mut str_builder = GenericStringBuilder::<O>::with_capacity(3, 11);

    str_builder.append_value("hello");
    str_builder.append_value("rust");
    str_builder.append_null();
    str_builder.finish();

    assert!(str_builder.is_empty());
    assert_eq!(&[O::zero()], str_builder.offsets_slice());

    str_builder.append_value("arrow");
    str_builder.append_value("parquet");
    let second = str_builder.finish();

    // No null was appended after the reset, so no null buffer is expected.
    assert!(second.nulls().is_none());
    assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), second,)
}
// Runs the shared `finish` checks with `i32` offsets.
#[test]
fn test_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i32>()
}
// Runs the shared `finish` checks with `i64` offsets.
#[test]
fn test_large_string_array_builder_finish() {
_test_generic_string_array_builder_finish::<i64>()
}
/// Shared check: `finish_cloned` snapshots the builder without resetting it,
/// so later appends accumulate on top of the earlier ones.
fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
    let mut str_builder = GenericStringBuilder::<O>::with_capacity(3, 11);

    str_builder.append_value("hello");
    str_builder.append_value("rust");
    str_builder.append_null();

    let snapshot = str_builder.finish_cloned();
    assert!(!str_builder.is_empty());
    assert_eq!(3, snapshot.len());

    str_builder.append_value("arrow");
    str_builder.append_value("parquet");
    let full = str_builder.finish();

    // The null appended before the snapshot is still part of the final array.
    assert!(full.nulls().is_some());
    assert_eq!(&[O::zero()], str_builder.offsets_slice());
    assert_eq!(5, full.len());
}
// Runs the shared `finish_cloned` checks with `i32` offsets.
#[test]
fn test_string_array_builder_finish_cloned() {
_test_generic_string_array_builder_finish_cloned::<i32>()
}
// Runs the shared `finish_cloned` checks with `i64` offsets.
#[test]
fn test_large_string_array_builder_finish_cloned() {
_test_generic_string_array_builder_finish_cloned::<i64>()
}
/// Two consecutive `extend` calls produce contiguous offsets and values.
#[test]
fn test_extend() {
    let first_batch = ["a", "b", "c", "", "a", "b", "c"];
    let second_batch = ["d", "cupcakes", "hello"];

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.extend(first_batch.into_iter().map(Some));
    str_builder.extend(second_batch.into_iter().map(Some));

    let built = str_builder.finish();
    assert_eq!(built.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
    assert_eq!(built.value_data(), b"abcabcdcupcakeshello");
}
/// Data written with `write!`/`writeln!` becomes part of the value that is
/// closed by the next `append_value("")`.
#[test]
fn test_write_str() {
    let mut str_builder = GenericStringBuilder::<i32>::new();

    write!(str_builder, "foo").unwrap();
    str_builder.append_value("");

    writeln!(str_builder, "bar").unwrap();
    str_builder.append_value("");

    write!(str_builder, "fiz").unwrap();
    write!(str_builder, "buz").unwrap();
    str_builder.append_value("");

    let built = str_builder.finish();
    let values: Vec<_> = built.iter().flatten().collect();
    assert_eq!(values, &["foo", "bar\n", "fizbuz"])
}
/// Binary counterpart of `test_write_str`: bytes written through
/// `std::io::Write` are closed by the next `append_value("")`.
#[test]
fn test_write_bytes() {
    let mut bin_builder = GenericBinaryBuilder::<i32>::new();

    write!(bin_builder, "foo").unwrap();
    bin_builder.append_value("");

    writeln!(bin_builder, "bar").unwrap();
    bin_builder.append_value("");

    write!(bin_builder, "fiz").unwrap();
    write!(bin_builder, "buz").unwrap();
    bin_builder.append_value("");

    let built = bin_builder.finish();
    let values: Vec<_> = built.iter().flatten().collect();
    assert_eq!(
        values,
        &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
    )
}
/// Appending three null-free arrays back to back equals building one array
/// from the concatenated input.
#[test]
fn test_append_array_without_nulls() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let parts = [
        GenericStringArray::<i32>::from(words[..3].to_vec()),
        GenericStringArray::<i32>::from(words[3..7].to_vec()),
        GenericStringArray::<i32>::from(words[7..].to_vec()),
    ];

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for part in &parts {
        str_builder.append_array(part).unwrap();
    }

    let actual = str_builder.finish();
    let expected = GenericStringArray::<i32>::from(words);
    assert_eq!(actual, expected);
}
/// Appending three arrays that contain nulls equals building one array from
/// the concatenated input.
#[test]
fn test_append_array_with_nulls() {
    let items = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let parts = [
        GenericStringArray::<i32>::from(items[..3].to_vec()),
        GenericStringArray::<i32>::from(items[3..7].to_vec()),
        GenericStringArray::<i32>::from(items[7..].to_vec()),
    ];

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for part in &parts {
        str_builder.append_array(part).unwrap();
    }

    let actual = str_builder.finish();
    let expected = GenericStringArray::<i32>::from(items);
    assert_eq!(actual, expected);
}
/// Appending an empty array succeeds and leaves the builder empty.
#[test]
fn test_append_empty_array() {
    let empty = GenericStringArray::<i32>::from(Vec::<&str>::new());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&empty).unwrap();

    let built = str_builder.finish();
    assert_eq!(built.len(), 0);
}
/// Appending a slice whose first offset is non-zero (and whose last offset
/// is not the parent's last) re-bases the offsets correctly.
#[test]
fn test_append_array_with_offset_not_starting_at_0() {
    let items = vec![
        Some("hello"),
        None,
        Some("how"),
        None,
        None,
        None,
        None,
        Some("I"),
        Some("am"),
        Some("doing"),
        Some("well"),
    ];
    let parent = GenericStringArray::<i32>::from(items);
    let window = parent.slice(1, 4);

    // Preconditions that make this test meaningful.
    assert_ne!(window.offsets()[0].as_usize(), 0);
    assert_ne!(window.offsets().last(), parent.offsets().last());

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&window).unwrap();
    let actual = str_builder.finish();

    let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]);
    assert_eq!(actual, expected);
}
/// Bytes that sit underneath null slots are copied verbatim by
/// `append_array`: the logical arrays match and so does the raw values buffer.
#[test]
fn test_append_underlying_null_values_added_as_is() {
    // Both inputs are built from fully populated strings and then given a
    // null mask, so their null slots still have bytes underneath.
    let first = {
        let words = vec![
            "hello", "world", "how", "are", "you", "doing", "today", "I", "am",
        ];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();
        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[
                true, false, true, false, false, true, true, true, false,
            ])),
        )
    };
    let second = {
        let words = vec!["doing", "well", "thank", "you", "for", "asking"];
        let (offsets, buffer, _) = GenericStringArray::<i32>::from(words).into_parts();
        GenericStringArray::<i32>::new(
            offsets,
            buffer,
            Some(NullBuffer::from(&[false, false, true, false, true, true])),
        )
    };

    let mut str_builder = GenericStringBuilder::<i32>::new();
    str_builder.append_array(&first).unwrap();
    str_builder.append_array(&second).unwrap();
    let actual = str_builder.finish();

    let expected = GenericStringArray::<i32>::from(vec![
        Some("hello"),
        None, // world
        Some("how"),
        None, // are
        None, // you
        Some("doing"),
        Some("today"),
        Some("I"),
        None, // am
        None, // doing
        None, // well
        Some("thank"),
        None, // "you",
        Some("for"),
        Some("asking"),
    ]);
    assert_eq!(actual, expected);

    // The raw values must contain every word, including the masked ones.
    let all_words = [
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let expected_underlying_buffer = Buffer::from(all_words.join("").as_bytes());
    assert_eq!(actual.values(), &expected_underlying_buffer);
}
/// Appending consecutive slices of one array reproduces that array.
#[test]
fn append_array_with_continues_indices() {
    let words = vec![
        "hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well",
        "thank", "you", "for", "asking",
    ];
    let parent = GenericStringArray::<i32>::from(words);
    let head = parent.slice(0, 3);
    let middle = parent.slice(3, 4);
    let tail = parent.slice(7, parent.len() - 7);

    let mut str_builder = GenericStringBuilder::<i32>::new();
    for piece in [&head, &middle, &tail] {
        str_builder.append_array(piece).unwrap();
    }

    let actual = str_builder.finish();
    assert_eq!(actual, parent);
}
/// `append_array` reports `OffsetOverflowError` when the shifted offsets no
/// longer fit in `i32`.
///
/// NOTE(review): this builds a string of `i32::MAX - 100` bytes and copies it
/// into the builder, so the test needs several GiB of memory.
#[test]
fn test_append_array_offset_overflow_precise() {
    let mut str_builder = GenericStringBuilder::<i32>::new();

    // Fill the builder to 100 bytes below the `i32` limit.
    let filler = "x".repeat(i32::MAX as usize - 100);
    str_builder.append_value(&filler);

    // A 200-byte value pushes the final offset past the limit.
    let too_big = "y".repeat(200);
    let too_big_array = GenericStringArray::<i32>::from(vec![too_big.as_str()]);

    let outcome = str_builder.append_array(&too_big_array);
    assert!(matches!(outcome, Err(ArrowError::OffsetOverflowError(_))));
}
}