// arrow-array/src/array/byte_view_array.rs - arrow-rs - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 use crate::array::print_long_array;
 use crate::builder::{ArrayBuilder, GenericByteViewBuilder};
 use crate::iterator::ArrayIter;
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::{BinaryViewType, ByteViewType, StringViewType};
 use crate::{Array, ArrayAccessor, ArrayRef, GenericByteArray, OffsetSizeTrait, Scalar};
 use arrow_buffer::{ArrowNativeType, Buffer, NullBuffer, ScalarBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
 use arrow_schema::{ArrowError, DataType};
 use num::ToPrimitive;
 use std::any::Any;
 use std::fmt::Debug;
 use std::marker::PhantomData;
 use std::sync::Arc;

 use super::ByteArrayType;

 /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
 ///
 /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
 ///
 /// This is different from [`GenericByteArray`] as it stores both an offset and
 /// length meaning that take / filter operations can be implemented without
 /// copying the underlying data. In addition, it stores an inlined prefix which
 /// can be used to speed up comparisons.
 ///
 /// # See Also
 ///
 /// See [`StringViewArray`] for storing utf8 encoded string data and
 /// [`BinaryViewArray`] for storing bytes.
 ///
 /// # Notes
 ///
 /// Comparing two `GenericByteViewArray` using PartialEq compares by structure,
 /// not by value, as there are many different buffer layouts to represent the
 /// same data (e.g. different offsets, different buffer sizes, etc).
 ///
 /// # Layout: "views" and buffers
 ///
 /// A `GenericByteViewArray` stores variable length byte strings. An array of
 /// `N` elements is stored as `N` fixed length "views" and a variable number
 /// of variable length "buffers".
 ///
 /// Each view is a `u128` value whose layout is different depending on the
 /// length of the string stored at that location:
 ///
 /// ```text
 ///                         ┌──────┬────────────────────────┐
 ///                         │length│      string value      │
 ///    Strings (len <= 12)  │      │    (padded with 0)     │
 ///                         └──────┴────────────────────────┘
 ///                          0    31                      127
 ///
 ///                         ┌───────┬───────┬───────┬───────┐
 ///                         │length │prefix │  buf  │offset │
 ///    Strings (len > 12)   │       │       │ index │       │
 ///                         └───────┴───────┴───────┴───────┘
 ///                          0    31       63      95    127
 /// ```
 ///
 /// * Strings with length <= 12 are stored directly in the view. See
 ///   [`Self::inline_value`] to access the inlined prefix from a short view.
 ///
 /// * Strings with length > 12: The first four bytes are stored inline in the
 ///   view and the entire string is stored in one of the buffers. See [`ByteView`]
 ///   to access the fields of these views.
 ///
 /// Unlike [`GenericByteArray`], there are no constraints on the offsets other
 /// than they must point into a valid buffer. In particular, they can be out of
 /// order, non-contiguous and overlapping.
 ///
 /// For example, in the following diagram, the strings "FishWasInTownTodayYay" and
 /// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
 /// separate buffer while the string "LavaMonster" is stored inlined in the
 /// view. In this case, the same bytes for "Fish" are used to store both strings.
 ///
 /// [`ByteView`]: arrow_data::ByteView
 ///
 /// ```text
 ///                                                                            ┌───┐
 ///                         ┌──────┬──────┬──────┬──────┐               offset │...│
 /// "FishWasInTownTodayYay" │  21  │ Fish │  0   │ 115  │─ ─              103  │Mr.│
 ///                         └──────┴──────┴──────┴──────┘   │      ┌ ─ ─ ─ ─ ▶ │Cru│
 ///                         ┌──────┬──────┬──────┬──────┐                      │mpl│
 /// "CrumpleFacedFish"      │  16  │ Crum │  0   │ 103  │─ ─│─ ─ ─ ┘           │eFa│
 ///                         └──────┴──────┴──────┴──────┘                      │ced│
 ///                         ┌──────┬────────────────────┐   └ ─ ─ ─ ─ ─ ─ ─ ─ ▶│Fis│
 /// "LavaMonster"           │  11  │   LavaMonster\0    │                      │hWa│
 ///                         └──────┴────────────────────┘               offset │sIn│
 ///                                                                       115  │Tow│
 ///                                                                            │nTo│
 ///                                                                            │day│
 ///                                  u128 "views"                              │Yay│
 ///                                                                   buffer 0 │...│
 ///                                                                            └───┘
 /// ```
 pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
     /// Arrow data type of the array; every constructor in this file sets it to `T::DATA_TYPE`
     data_type: DataType,
     /// One `u128` view per element
     views: ScalarBuffer<u128>,
     /// Data buffers that the views of values longer than 12 bytes point into
     buffers: Vec<Buffer>,
     /// Zero-sized marker tying the array to its [`ByteViewType`]
     phantom: PhantomData<T>,
     /// Optional null buffer; `try_new` requires its length to equal `views.len()`
     nulls: Option<NullBuffer>,
 }

 impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
     fn clone(&self) -> Self {
         Self {
             data_type: T::DATA_TYPE,
             views: self.views.clone(),
             buffers: self.buffers.clone(),
             nulls: self.nulls.clone(),
             phantom: Default::default(),
         }
     }
 }

 // PartialEq
 impl<T: ByteViewType + ?Sized> PartialEq for GenericByteViewArray<T> {
     /// Structural equality: the data type, views, data buffers and null buffer
     /// must all be equal. Two arrays holding the same logical values in
     /// different buffer layouts therefore compare as not equal.
     fn eq(&self, other: &Self) -> bool {
         if !other.data_type.eq(&self.data_type) {
             return false;
         }
         if !other.views.eq(&self.views) {
             return false;
         }
         if !other.buffers.eq(&self.buffers) {
             return false;
         }
         other.nulls.eq(&self.nulls)
     }
 }

 impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// Builds a [`GenericByteViewArray`] from its parts, panicking if they are invalid
     ///
     /// # Panics
     ///
     /// Panics if [`GenericByteViewArray::try_new`] returns an error
     pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
         let checked = Self::try_new(views, buffers, nulls);
         checked.unwrap()
     }

     /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
     ///
     /// # Errors
     ///
     /// * `views.len() != nulls.len()`
     /// * [ByteViewType::validate] fails
     pub fn try_new(
         views: ScalarBuffer<u128>,
         buffers: Vec<Buffer>,
         nulls: Option<NullBuffer>,
     ) -> Result<Self, ArrowError> {
         T::validate(&views, &buffers)?;

         if let Some(n) = nulls.as_ref() {
             if n.len() != views.len() {
                 return Err(ArrowError::InvalidArgumentError(format!(
                     "Incorrect length of null buffer for {}ViewArray, expected {} got {}",
                     T::PREFIX,
                     views.len(),
                     n.len(),
                 )));
             }
         }

         Ok(Self {
             data_type: T::DATA_TYPE,
             views,
             buffers,
             nulls,
             phantom: Default::default(),
         })
     }

     /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
     ///
     /// # Safety
     ///
     /// Safe if [`Self::try_new`] would not error
     pub unsafe fn new_unchecked(
         views: ScalarBuffer<u128>,
         buffers: Vec<Buffer>,
         nulls: Option<NullBuffer>,
     ) -> Self {
         Self {
             data_type: T::DATA_TYPE,
             phantom: Default::default(),
             views,
             buffers,
             nulls,
         }
     }

     /// Create a new [`GenericByteViewArray`] of length `len` where all values are null
     pub fn new_null(len: usize) -> Self {
         Self {
             data_type: T::DATA_TYPE,
             views: vec![0; len].into(),
             buffers: vec![],
             nulls: Some(NullBuffer::new_null(len)),
             phantom: Default::default(),
         }
     }

     /// Wraps `value` in a single-element array and returns it as a [`Scalar`]
     pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
         let single = std::iter::once(value);
         let array = Self::from_iter_values(single);
         Scalar::new(array)
     }

     /// Builds a [`GenericByteViewArray`] from an iterator of non-null values
     ///
     /// The builder is pre-sized with the lower bound of the iterator's size hint.
     pub fn from_iter_values<Ptr, I>(iter: I) -> Self
     where
         Ptr: AsRef<T::Native>,
         I: IntoIterator<Item = Ptr>,
     {
         let values = iter.into_iter();
         let (lower_bound, _) = values.size_hint();
         let mut builder = GenericByteViewBuilder::<T>::with_capacity(lower_bound);
         values.for_each(|value| {
             builder.append_value(value);
         });
         builder.finish()
     }

     /// Deconstruct this array into its constituent parts
     pub fn into_parts(self) -> (ScalarBuffer<u128>, Vec<Buffer>, Option<NullBuffer>) {
         (self.views, self.buffers, self.nulls)
     }

     /// Returns the views buffer
     ///
     /// The buffer holds one `u128` view per element of the array.
     #[inline]
     pub fn views(&self) -> &ScalarBuffer<u128> {
         &self.views
     }

     /// Returns the buffers storing string data
     ///
     /// These are the buffers that the views of values longer than 12 bytes
     /// index into; the views buffer itself is not included.
     #[inline]
     pub fn data_buffers(&self) -> &[Buffer] {
         &self.buffers
     }

     /// Returns the element at index `i`
     ///
     /// # Panics
     ///
     /// Panics if index `i` is out of bounds.
     pub fn value(&self, i: usize) -> &T::Native {
         let len = self.len();
         assert!(
             i < len,
             "Trying to access an element at index {} from a {}ViewArray of length {}",
             i,
             T::PREFIX,
             len
         );

         // SAFETY: `i` was checked against the array length just above
         unsafe { self.value_unchecked(i) }
     }

     /// Returns the element at index `i` without bounds checking
     ///
     /// # Safety
     ///
     /// Caller is responsible for ensuring that the index is within the bounds
     /// of the array
     pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
         let v = self.views.get_unchecked(idx);
         let len = *v as u32;
         let b = if len <= 12 {
             Self::inline_value(v, len as usize)
         } else {
             let view = ByteView::from(*v);
             let data = self.buffers.get_unchecked(view.buffer_index as usize);
             let offset = view.offset as usize;
             data.get_unchecked(offset..offset + len as usize)
         };
         T::Native::from_bytes_unchecked(b)
     }

     /// Returns the first `len` bytes of the inline value of the view.
     ///
     /// The returned slice starts right after the 4 byte length at the start of
     /// the view and borrows from `view`.
     ///
     /// # Safety
     /// - The `view` must be a valid element from `Self::views()` that adheres to the view layout.
     /// - The `len` must be the length of the inlined value. It should never be larger than 12.
     #[inline(always)]
     pub unsafe fn inline_value(view: &u128, len: usize) -> &[u8] {
         debug_assert!(len <= 12);
         let first_byte = (view as *const u128).cast::<u8>();
         // Skip the 4 byte length stored at the start of the view
         let inlined = first_byte.wrapping_add(4);
         // SAFETY: the caller guarantees `len <= 12`, so the slice stays inside
         // the 16 bytes of `view`
         unsafe { std::slice::from_raw_parts(inlined, len) }
     }

     /// Constructs a new iterator for iterating over the values of this array
     ///
     /// Items are `Option<&T::Native>`, the same item type as the `IntoIterator`
     /// implementation for `&GenericByteViewArray`.
     pub fn iter(&self) -> ArrayIter<&Self> {
         ArrayIter::new(self)
     }

     /// Returns a zero-copy slice of this array with the indicated offset and length.
     pub fn slice(&self, offset: usize, length: usize) -> Self {
         Self {
             data_type: T::DATA_TYPE,
             views: self.views.slice(offset, length),
             buffers: self.buffers.clone(),
             nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
             phantom: Default::default(),
         }
     }

     /// Returns a "compacted" copy of this array
     ///
     /// The original array is left untouched.
     ///
     /// # Garbage Collection
     ///
     /// Before GC:
     /// ```text
     ///                                        ┌──────┐
     ///                                        │......│
     ///                                        │......│
     /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer
     /// │       View 1       │─ ─ ─ ─          │......│  with data that
     /// ├────────────────────┤                 │......│ is not referred
     /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
     /// └────────────────────┘                 │......│      View 2
     ///                                        │......│
     ///    2 views, refer to                   │......│
     ///   small portions of a                  └──────┘
     ///      large buffer
     /// ```
     ///
     /// After GC:
     ///
     /// ```text
     /// ┌────────────────────┐                 ┌─────┐    After gc, only
     /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is
     /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by
     /// │       View 2       │─ ─ ─ ─          └─────┘     the views is
     /// └────────────────────┘                                 left
     ///
     ///
     ///         2 views
     /// ```
     /// The data buffers are compacted by rebuilding the array value by value,
     /// so that the result only holds data that is pointed to by the views.
     ///
     /// Note that the array is always copied, even when it is already compact.
     /// This can be expensive, so only call it when the view array has become
     /// significantly smaller than it was when created, e.g. after filtering or
     /// slicing.
     ///
     /// Note: this function does not attempt to canonicalize / deduplicate values. For this
     /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
     pub fn gc(&self) -> Self {
         let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
         // Re-append every value (or null) so that only referenced bytes are kept
         self.iter().for_each(|value| builder.append_option(value));
         builder.finish()
     }

     /// Compare two [`GenericByteViewArray`] at index `left_idx` and `right_idx`
     ///
     /// Comparing two ByteView types are non-trivial.
     /// It takes a bit of patience to understand why we don't just compare two &[u8] directly.
     ///
     /// ByteView types give us the following two advantages, and we need to be careful not to lose them:
     /// (1) For string/byte smaller than 12 bytes, the entire data is inlined in the view.
     ///     Meaning that reading one array element requires only one memory access
     ///     (two memory access required for StringArray, one for offset buffer, the other for value buffer).
     ///
     /// (2) For string/byte larger than 12 bytes, we can still be faster than (for certain operations) StringArray/ByteArray,
     ///     thanks to the inlined 4 bytes.
     ///     Consider equality check:
     ///     If the first four bytes of the two strings are different, we can return false immediately (with just one memory access).
     ///
     /// If we directly compare two &[u8], we materialize the entire string (i.e., make multiple memory accesses), which might be unnecessary.
     /// - Most of the time (eq, ord), we only need to look at the first 4 bytes to know the answer,
     ///   e.g., if the inlined 4 bytes are different, we can directly return unequal without looking at the full string.
     ///
     /// # Order check flow
     /// (1) if both string are smaller than 12 bytes, we can directly compare the data inlined to the view.
     /// (2) if any of the string is larger than 12 bytes, we need to compare the full string.
     ///     (2.1) if the inlined 4 bytes are different, we can return the result immediately.
     ///     (2.2) o.w., we need to compare the full string.
     ///
     /// # Safety
     /// The left/right_idx must within range of each array
     pub unsafe fn compare_unchecked(
         left: &GenericByteViewArray<T>,
         left_idx: usize,
         right: &GenericByteViewArray<T>,
         right_idx: usize,
     ) -> std::cmp::Ordering {
         let l_view = left.views().get_unchecked(left_idx);
         let l_len = *l_view as u32;

         let r_view = right.views().get_unchecked(right_idx);
         let r_len = *r_view as u32;

         if l_len <= 12 && r_len <= 12 {
             let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) };
             let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) };
             return l_data.cmp(r_data);
         }

         // one of the string is larger than 12 bytes,
         // we then try to compare the inlined data first
         let l_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, 4) };
         let r_inlined_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, 4) };
         if r_inlined_data != l_inlined_data {
             return l_inlined_data.cmp(r_inlined_data);
         }

         // unfortunately, we need to compare the full data
         let l_full_data: &[u8] = unsafe { left.value_unchecked(left_idx).as_ref() };
         let r_full_data: &[u8] = unsafe { right.value_unchecked(right_idx).as_ref() };

         l_full_data.cmp(r_full_data)
     }
 }

 impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
     /// Writes a `<PREFIX>ViewArray` header, then the elements as rendered by
     /// `print_long_array` (each value via its `Debug` impl), then a closing `]`
     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
         write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
         print_long_array(self, f, |arr, i, out| {
             let element = arr.value(i);
             std::fmt::Debug::fmt(&element, out)
         })?;
         f.write_str("]")
     }
 }

 impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
     fn as_any(&self) -> &dyn Any {
         self
     }

     fn to_data(&self) -> ArrayData {
         self.clone().into()
     }

     fn into_data(self) -> ArrayData {
         self.into()
     }

     fn data_type(&self) -> &DataType {
         &self.data_type
     }

     fn slice(&self, offset: usize, length: usize) -> ArrayRef {
         Arc::new(self.slice(offset, length))
     }

     fn len(&self) -> usize {
         self.views.len()
     }

     fn is_empty(&self) -> bool {
         self.views.is_empty()
     }

     fn offset(&self) -> usize {
         0
     }

     fn nulls(&self) -> Option<&NullBuffer> {
         self.nulls.as_ref()
     }

     fn get_buffer_memory_size(&self) -> usize {
         let mut sum = self.buffers.iter().map(|b| b.capacity()).sum::<usize>();
         sum += self.views.inner().capacity();
         if let Some(x) = &self.nulls {
             sum += x.buffer().capacity()
         }
         sum
     }

     fn get_array_memory_size(&self) -> usize {
         std::mem::size_of::<Self>() + self.get_buffer_memory_size()
     }
 }

 /// Element access for `&GenericByteViewArray`, delegating to the inherent
 /// `value` / `value_unchecked` methods of [`GenericByteViewArray`]
 impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
     type Item = &'a T::Native;

     // The explicit `GenericByteViewArray::` paths name the inherent methods of
     // the array type rather than the trait methods being defined here.
     fn value(&self, index: usize) -> Self::Item {
         GenericByteViewArray::value(self, index)
     }

     // Safety contract is the same as the inherent `value_unchecked`: `index`
     // must be within the bounds of the array.
     unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
         GenericByteViewArray::value_unchecked(self, index)
     }
 }

 /// Allows iterating over `&GenericByteViewArray` directly (e.g. in a `for`
 /// loop); items are `Option<&T::Native>`
 impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
     type Item = Option<&'a T::Native>;
     type IntoIter = ArrayIter<Self>;

     // Same iterator as the one returned by `GenericByteViewArray::iter`
     fn into_iter(self) -> Self::IntoIter {
         ArrayIter::new(self)
     }
 }

 impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
     fn from(value: ArrayData) -> Self {
         let views = value.buffers()[0].clone();
         let views = ScalarBuffer::new(views, value.offset(), value.len());
         let buffers = value.buffers()[1..].to_vec();
         Self {
             data_type: T::DATA_TYPE,
             views,
             buffers,
             nulls: value.nulls().cloned(),
             phantom: Default::default(),
         }
     }
 }

 /// Converts a [`GenericByteArray`] into a [`GenericByteViewArray`], avoiding a copy when possible:
 /// if the last offset is less than `u32::MAX`, the views are built directly on top of the
 /// existing values buffer. Otherwise the values are copied one by one.
 impl<FROM, V> From<&GenericByteArray<FROM>> for GenericByteViewArray<V>
 where
     FROM: ByteArrayType,
     FROM::Offset: OffsetSizeTrait + ToPrimitive,
     V: ByteViewType<Native = FROM::Native>,
 {
     fn from(byte_array: &GenericByteArray<FROM>) -> Self {
         let offsets = byte_array.offsets();

         // View offsets are `u32`, so the values buffer can only be shared
         // when every offset fits
         let fits_in_u32 = offsets
             .last()
             .map_or(true, |last| last.as_usize() < u32::MAX as usize);
         if !fits_in_u32 {
             // TODO: the first u32::MAX can still be reused
             return GenericByteViewArray::<V>::from_iter(byte_array.iter());
         }

         let len = byte_array.len();
         let mut views_builder = GenericByteViewBuilder::<V>::with_capacity(len);
         let block = views_builder.append_block(byte_array.values().clone());
         for (i, bounds) in offsets.windows(2).enumerate() {
             let (start, end) = (bounds[0].as_usize(), bounds[1].as_usize());
             let length = end - start;

             if !byte_array.is_null(i) {
                 // Safety: the input was a valid array so it is valid UTF8 (if string). And
                 // all offsets were valid
                 unsafe {
                     views_builder.append_view_unchecked(block, start as u32, length as u32);
                 }
             } else {
                 views_builder.append_null();
             }
         }
         assert_eq!(views_builder.len(), len);
         views_builder.finish()
     }
 }

 impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
     fn from(mut array: GenericByteViewArray<T>) -> Self {
         let len = array.len();
         array.buffers.insert(0, array.views.into_inner());
         let builder = ArrayDataBuilder::new(T::DATA_TYPE)
             .len(len)
             .buffers(array.buffers)
             .nulls(array.nulls);

         unsafe { builder.build_unchecked() }
     }
 }

 /// Collects an iterator of `&Option<Ptr>` by borrowing each contained value
 /// and delegating to the `FromIterator<Option<_>>` implementation
 impl<'a, Ptr, T> FromIterator<&'a Option<Ptr>> for GenericByteViewArray<T>
 where
     Ptr: AsRef<T::Native> + 'a,
     T: ByteViewType + ?Sized,
 {
     fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
         iter.into_iter()
             // `&Option<Ptr>` -> `Option<&Ptr>` -> `Option<&T::Native>`
             .map(|o| o.as_ref().map(|p| p.as_ref()))
             .collect()
     }
 }

 /// Collects an iterator of optional values through a [`GenericByteViewBuilder`]
 impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
 where
     Ptr: AsRef<T::Native>,
 {
     fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
         let values = iter.into_iter();
         // Pre-size the builder with the lower bound of the size hint
         let (lower_bound, _) = values.size_hint();
         let mut builder = GenericByteViewBuilder::<T>::with_capacity(lower_bound);
         builder.extend(values);
         builder.finish()
     }
 }

 /// A [`GenericByteViewArray`] of `[u8]`
 ///
 /// See [`StringViewArray`] for the variant that stores utf8 data.
 ///
 /// # Example
 /// ```
 /// use arrow_array::BinaryViewArray;
 /// let array = BinaryViewArray::from_iter_values(vec![b"hello" as &[u8], b"world", b"lulu", b"large payload over 12 bytes"]);
 /// assert_eq!(array.value(0), b"hello");
 /// assert_eq!(array.value(3), b"large payload over 12 bytes");
 /// ```
 pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;

 impl BinaryViewArray {
     /// Converts the [`BinaryViewArray`] into a [`StringViewArray`]
     ///
     /// # Errors
     ///
     /// Returns the error produced by `StringViewType::validate` if the items
     /// are not utf8 data.
     pub fn to_string_view(self) -> Result<StringViewArray, ArrowError> {
         StringViewType::validate(&self.views, &self.buffers)?;
         // SAFETY: the views and buffers were validated as string data just above
         Ok(unsafe { self.to_string_view_unchecked() })
     }

     /// Converts the [`BinaryViewArray`] into a [`StringViewArray`] without validation
     ///
     /// # Safety
     /// Caller is responsible for ensuring that items in array are utf8 data.
     pub unsafe fn to_string_view_unchecked(self) -> StringViewArray {
         let Self {
             views,
             buffers,
             nulls,
             ..
         } = self;
         StringViewArray::new_unchecked(views, buffers, nulls)
     }
 }

 /// Builds a [`BinaryViewArray`] from byte slices via `from_iter_values`, i.e. without nulls
 impl From<Vec<&[u8]>> for BinaryViewArray {
     fn from(v: Vec<&[u8]>) -> Self {
         Self::from_iter_values(v)
     }
 }

 /// Builds a [`BinaryViewArray`] from optional byte slices by collecting them
 /// through the `FromIterator<Option<_>>` implementation
 impl From<Vec<Option<&[u8]>>> for BinaryViewArray {
     fn from(v: Vec<Option<&[u8]>>) -> Self {
         v.into_iter().collect()
     }
 }

 /// A [`GenericByteViewArray`] that stores utf8 data
 ///
 /// See [`BinaryViewArray`] for the variant that stores arbitrary bytes.
 ///
 /// # Example
 /// ```
 /// use arrow_array::StringViewArray;
 /// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
 /// assert_eq!(array.value(0), "hello");
 /// assert_eq!(array.value(3), "large payload over 12 bytes");
 /// ```
 pub type StringViewArray = GenericByteViewArray<StringViewType>;

 impl StringViewArray {
     /// Converts the [`StringViewArray`] into a [`BinaryViewArray`], reusing its
     /// views, data buffers and null buffer
     pub fn to_binary_view(self) -> BinaryViewArray {
         let Self {
             views,
             buffers,
             nulls,
             ..
         } = self;
         // SAFETY: no validation is run; the parts come unchanged from an
         // existing string view array
         unsafe { BinaryViewArray::new_unchecked(views, buffers, nulls) }
     }

     /// Returns true if every non-null value of this array is ASCII
     pub fn is_ascii(&self) -> bool {
         // Alternative (but incorrect): directly check the underlying buffers
         // (1) Our string view might be sparse, i.e., a subset of the buffers,
         //      so even if the buffer is not ascii, we can still be ascii.
         // (2) It is quite difficult to know the range of each buffer (unlike StringArray)
         // This means that this operation is quite expensive, shall we cache the result?
         //  i.e. track `is_ascii` in the builder.
         //
         // `flatten` skips the nulls, which never make the array non-ascii.
         self.iter().flatten().all(|value| value.is_ascii())
     }
 }

 /// Builds a [`StringViewArray`] from string slices via `from_iter_values`, i.e. without nulls
 impl From<Vec<&str>> for StringViewArray {
     fn from(v: Vec<&str>) -> Self {
         Self::from_iter_values(v)
     }
 }

 /// Builds a [`StringViewArray`] from optional string slices by collecting them
 /// through the `FromIterator<Option<_>>` implementation
 impl From<Vec<Option<&str>>> for StringViewArray {
     fn from(v: Vec<Option<&str>>) -> Self {
         v.into_iter().collect()
     }
 }

 /// Builds a [`StringViewArray`] from owned strings via `from_iter_values`, i.e. without nulls
 impl From<Vec<String>> for StringViewArray {
     fn from(v: Vec<String>) -> Self {
         Self::from_iter_values(v)
     }
 }

 /// Builds a [`StringViewArray`] from optional owned strings by collecting them
 /// through the `FromIterator<Option<_>>` implementation
 impl From<Vec<Option<String>>> for StringViewArray {
     fn from(v: Vec<Option<String>>) -> Self {
         v.into_iter().collect()
     }
 }

 // Unit tests for the view arrays: construction from values and builders,
 // rejection of invalid views by `new`, `gc`, and structural equality.
 #[cfg(test)]
 mod tests {
     use crate::builder::{BinaryViewBuilder, StringViewBuilder};
     use crate::{Array, BinaryViewArray, StringViewArray};
     use arrow_buffer::{Buffer, ScalarBuffer};
     use arrow_data::ByteView;

     // Short (inlined) and long (buffer backed) strings can be read back
     #[test]
     fn try_new_string() {
         let array = StringViewArray::from_iter_values(vec![
             "hello",
             "world",
             "lulu",
             "large payload over 12 bytes",
         ]);
         assert_eq!(array.value(0), "hello");
         assert_eq!(array.value(3), "large payload over 12 bytes");
     }

     // Same as `try_new_string`, for binary data
     #[test]
     fn try_new_binary() {
         let array = BinaryViewArray::from_iter_values(vec![
             b"hello".as_slice(),
             b"world".as_slice(),
             b"lulu".as_slice(),
             b"large payload over 12 bytes".as_slice(),
         ]);
         assert_eq!(array.value(0), b"hello");
         assert_eq!(array.value(3), b"large payload over 12 bytes");
     }

     #[test]
     fn try_new_empty_string() {
         // test empty array
         let array = {
             let mut builder = StringViewBuilder::new();
             builder.finish()
         };
         assert!(array.is_empty());
     }

     #[test]
     fn try_new_empty_binary() {
         // test empty array
         let array = {
             let mut builder = BinaryViewBuilder::new();
             builder.finish()
         };
         assert!(array.is_empty());
     }

     #[test]
     fn test_append_string() {
         // test builder append
         let array = {
             let mut builder = StringViewBuilder::new();
             builder.append_value("hello");
             builder.append_null();
             builder.append_option(Some("large payload over 12 bytes"));
             builder.finish()
         };
         assert_eq!(array.value(0), "hello");
         assert!(array.is_null(1));
         assert_eq!(array.value(2), "large payload over 12 bytes");
     }

     #[test]
     fn test_append_binary() {
         // test builder append
         let array = {
             let mut builder = BinaryViewBuilder::new();
             builder.append_value(b"hello");
             builder.append_null();
             builder.append_option(Some(b"large payload over 12 bytes"));
             builder.finish()
         };
         assert_eq!(array.value(0), b"hello");
         assert!(array.is_null(1));
         assert_eq!(array.value(2), b"large payload over 12 bytes");
     }

     // Values larger than the fixed block size end up in separate data buffers
     #[test]
     fn test_in_progress_recreation() {
         let array = {
             // make a builder with small block size.
             let mut builder = StringViewBuilder::new().with_fixed_block_size(14);
             builder.append_value("large payload over 12 bytes");
             builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
             builder.finish()
         };
         assert_eq!(array.value(0), "large payload over 12 bytes");
         assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
         assert_eq!(2, array.buffers.len());
     }

     // A view naming buffer 3 while only one buffer exists must be rejected
     #[test]
     #[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
     fn new_with_invalid_view_data() {
         let v = "large payload over 12 bytes";
         let view = ByteView {
             length: 13,
             prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()),
             buffer_index: 3,
             offset: 1,
         };
         let views = ScalarBuffer::from(vec![view.into()]);
         let buffers = vec![Buffer::from_slice_ref(v)];
         StringViewArray::new(views, buffers, None);
     }

     // Bytes that are not valid UTF-8 must be rejected for a string view array
     #[test]
     #[should_panic(
         expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
     )]
     fn new_with_invalid_utf8_data() {
         let v: Vec<u8> = vec![0xf0, 0x80, 0x80, 0x80];
         let view = ByteView {
             length: v.len() as u32,
             prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
             buffer_index: 0,
             offset: 0,
         };
         let views = ScalarBuffer::from(vec![view.into()]);
         let buffers = vec![Buffer::from_slice_ref(v)];
         StringViewArray::new(views, buffers, None);
     }

     // An inlined value must be followed by zero padding inside the view
     #[test]
     #[should_panic(expected = "View at index 0 contained non-zero padding for string of length 1")]
     fn new_with_invalid_zero_padding() {
         let mut data = [0; 12];
         data[0] = b'H';
         data[11] = 1; // no zero padding

         // Layout of the view: 4 byte length (1) followed by the 12 data bytes
         let mut view_buffer = [0; 16];
         view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
         view_buffer[4..].copy_from_slice(&data);

         let view = ByteView::from(u128::from_le_bytes(view_buffer));
         let views = ScalarBuffer::from(vec![view.into()]);
         let buffers = vec![];
         StringViewArray::new(views, buffers, None);
     }

     // The prefix stored in a long view must match the start of the referenced data
     #[test]
     #[should_panic(expected = "Mismatch between embedded prefix and data")]
     fn test_mismatch_between_embedded_prefix_and_data() {
         let input_str_1 = "Hello, Rustaceans!";
         let input_str_2 = "Hallo, Rustaceans!";
         let length = input_str_1.len() as u32;
         assert!(input_str_1.len() > 12);

         // Layout of the view: length, prefix taken from `input_str_1`,
         // buffer index 0, offset 0 -- while the buffer holds `input_str_2`
         let mut view_buffer = [0; 16];
         view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
         view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
         view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
         view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
         let view = ByteView::from(u128::from_le_bytes(view_buffer));
         let views = ScalarBuffer::from(vec![view.into()]);
         let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];

         StringViewArray::new(views, buffers, None);
     }

     // `gc` keeps the values and the length but changes the number of data buffers
     #[test]
     fn test_gc() {
         let test_data = [
             Some("longer than 12 bytes"),
             Some("short"),
             Some("t"),
             Some("longer than 12 bytes"),
             None,
             Some("short"),
         ];

         let array = {
             let mut builder = StringViewBuilder::new().with_fixed_block_size(8); // create multiple buffers
             test_data.into_iter().for_each(|v| builder.append_option(v));
             builder.finish()
         };
         assert!(array.buffers.len() > 1);

         // Checks that gc changed the buffer count while preserving all values
         fn check_gc(to_test: &StringViewArray) {
             let gc = to_test.gc();
             assert_ne!(to_test.data_buffers().len(), gc.data_buffers().len());

             to_test.iter().zip(gc.iter()).for_each(|(a, b)| {
                 assert_eq!(a, b);
             });
             assert_eq!(to_test.len(), gc.len());
         }

         check_gc(&array);
         check_gc(&array.slice(1, 3));
         check_gc(&array.slice(2, 1));
         check_gc(&array.slice(2, 2));
         check_gc(&array.slice(3, 1));
     }

     // Equality is structural: same values in a different buffer layout are not equal
     #[test]
     fn test_eq() {
         let test_data = [
             Some("longer than 12 bytes"),
             None,
             Some("short"),
             Some("again, this is longer than 12 bytes"),
         ];

         let array1 = {
             let mut builder = StringViewBuilder::new().with_fixed_block_size(8);
             test_data.into_iter().for_each(|v| builder.append_option(v));
             builder.finish()
         };
         let array2 = {
             // create a new array with the same data but different layout
             let mut builder = StringViewBuilder::new().with_fixed_block_size(100);
             test_data.into_iter().for_each(|v| builder.append_option(v));
             builder.finish()
         };
         assert_eq!(array1, array1.clone());
         assert_eq!(array2, array2.clone());
         assert_ne!(array1, array2);
     }
 }