| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| use std::any::Any; |
| use std::marker::PhantomData; |
| use std::sync::Arc; |
| |
| use arrow_buffer::{Buffer, NullBufferBuilder, ScalarBuffer}; |
| use arrow_data::{ByteView, MAX_INLINE_VIEW_LEN}; |
| use arrow_schema::ArrowError; |
| use hashbrown::HashTable; |
| use hashbrown::hash_table::Entry; |
| |
| use crate::builder::{ArrayBuilder, BinaryLikeArrayBuilder, StringLikeArrayBuilder}; |
| use crate::types::bytes::ByteArrayNativeType; |
| use crate::types::{BinaryViewType, ByteViewType, StringViewType}; |
| use crate::{Array, ArrayRef, GenericByteViewArray}; |
| |
| const STARTING_BLOCK_SIZE: u32 = 8 * 1024; // 8KiB |
| const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; // 2MiB |
| |
| enum BlockSizeGrowthStrategy { |
| Fixed { size: u32 }, |
| Exponential { current_size: u32 }, |
| } |
| |
| impl BlockSizeGrowthStrategy { |
| fn next_size(&mut self) -> u32 { |
| match self { |
| Self::Fixed { size } => *size, |
| Self::Exponential { current_size } => { |
| if *current_size < MAX_BLOCK_SIZE { |
| // we have fixed start/end block sizes, so we can't overflow |
| *current_size = current_size.saturating_mul(2); |
| *current_size |
| } else { |
| MAX_BLOCK_SIZE |
| } |
| } |
| } |
| } |
| } |
| |
| /// A builder for [`GenericByteViewArray`] |
| /// |
| /// A [`GenericByteViewArray`] consists of a list of data blocks containing string data, |
| /// and a list of views into those buffers. |
| /// |
| /// See examples on [`StringViewBuilder`] and [`BinaryViewBuilder`] |
| /// |
| /// This builder can be used in two ways |
| /// |
| /// # Append Values |
| /// |
| /// To avoid bump allocating, this builder allocates data in fixed size blocks, configurable |
| /// using [`GenericByteViewBuilder::with_fixed_block_size`]. [`GenericByteViewBuilder::append_value`] |
| /// writes values larger than [`MAX_INLINE_VIEW_LEN`] bytes to the current in-progress block, with values smaller |
| /// than [`MAX_INLINE_VIEW_LEN`] bytes inlined into the views. If a value is appended that will not fit in the |
| /// in-progress block, it will be closed, and a new block of sufficient size allocated |
| /// |
| /// # Append Views |
| /// |
| /// Some use-cases may wish to reuse an existing allocation containing string data, for example, |
| /// when parsing data from a parquet data page. In such a case entire blocks can be appended |
| /// using [`GenericByteViewBuilder::append_block`] and then views into this block appended |
| /// using [`GenericByteViewBuilder::try_append_view`] |
| pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> { |
| views_buffer: Vec<u128>, |
| null_buffer_builder: NullBufferBuilder, |
| completed: Vec<Buffer>, |
| in_progress: Vec<u8>, |
| block_size: BlockSizeGrowthStrategy, |
| /// Some if deduplicating strings |
| /// map `<string hash> -> <index to the views>` |
| string_tracker: Option<(HashTable<usize>, ahash::RandomState)>, |
| max_deduplication_len: Option<u32>, |
| phantom: PhantomData<T>, |
| } |
| |
| impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { |
| /// Creates a new [`GenericByteViewBuilder`]. |
| pub fn new() -> Self { |
| Self::with_capacity(1024) |
| } |
| |
| /// Creates a new [`GenericByteViewBuilder`] with space for `capacity` string values. |
| pub fn with_capacity(capacity: usize) -> Self { |
| Self { |
| views_buffer: Vec::with_capacity(capacity), |
| null_buffer_builder: NullBufferBuilder::new(capacity), |
| completed: vec![], |
| in_progress: vec![], |
| block_size: BlockSizeGrowthStrategy::Exponential { |
| current_size: STARTING_BLOCK_SIZE, |
| }, |
| string_tracker: None, |
| max_deduplication_len: None, |
| phantom: Default::default(), |
| } |
| } |
| |
| /// Configure max deduplication length when deduplicating strings while building the array. |
| /// Default is None. |
| /// |
| /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate |
| /// any strings longer than 12 bytes. However, since it takes time proportional to the length |
| /// of the string to deduplicate, setting this option limits the CPU overhead for this option. |
| pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self { |
| debug_assert!( |
| max_deduplication_len > 0, |
| "max_deduplication_len must be greater than 0" |
| ); |
| Self { |
| max_deduplication_len: Some(max_deduplication_len), |
| ..self |
| } |
| } |
| |
| /// Set a fixed buffer size for variable length strings |
| /// |
| /// The block size is the size of the buffer used to store values greater |
| /// than [`MAX_INLINE_VIEW_LEN`] bytes. The builder allocates new buffers when the current |
| /// buffer is full. |
| /// |
| /// By default the builder balances buffer size and buffer count by |
| /// growing buffer size exponentially from 8KB up to 2MB. The |
| /// first buffer allocated is 8KB, then 16KB, then 32KB, etc up to 2MB. |
| /// |
| /// If this method is used, any new buffers allocated are |
| /// exactly this size. This can be useful for advanced users |
| /// that want to control the memory usage and buffer count. |
| /// |
| /// See <https://github.com/apache/arrow-rs/issues/6094> for more details on the implications. |
| pub fn with_fixed_block_size(self, block_size: u32) -> Self { |
| debug_assert!(block_size > 0, "Block size must be greater than 0"); |
| Self { |
| block_size: BlockSizeGrowthStrategy::Fixed { size: block_size }, |
| ..self |
| } |
| } |
| |
| /// Deduplicate strings while building the array |
| /// |
| /// This will potentially decrease the memory usage if the array have repeated strings |
| /// It will also increase the time to build the array as it needs to hash the strings |
| pub fn with_deduplicate_strings(self) -> Self { |
| Self { |
| string_tracker: Some(( |
| HashTable::with_capacity(self.views_buffer.capacity()), |
| Default::default(), |
| )), |
| ..self |
| } |
| } |
| |
| /// Append a new data block returning the new block offset |
| /// |
| /// Note: this will first flush any in-progress block |
| /// |
| /// This allows appending views from blocks added using [`Self::append_block`]. See |
| /// [`Self::append_value`] for appending individual values |
| /// |
| /// ``` |
| /// # use arrow_array::builder::StringViewBuilder; |
| /// let mut builder = StringViewBuilder::new(); |
| /// |
| /// let block = builder.append_block(b"helloworldbingobongo".into()); |
| /// |
| /// builder.try_append_view(block, 0, 5).unwrap(); |
| /// builder.try_append_view(block, 5, 5).unwrap(); |
| /// builder.try_append_view(block, 10, 5).unwrap(); |
| /// builder.try_append_view(block, 15, 5).unwrap(); |
| /// builder.try_append_view(block, 0, 15).unwrap(); |
| /// let array = builder.finish(); |
| /// |
| /// let actual: Vec<_> = array.iter().flatten().collect(); |
| /// let expected = &["hello", "world", "bingo", "bongo", "helloworldbingo"]; |
| /// assert_eq!(actual, expected); |
| /// ``` |
| pub fn append_block(&mut self, buffer: Buffer) -> u32 { |
| assert!(buffer.len() < u32::MAX as usize); |
| |
| self.flush_in_progress(); |
| let offset = self.completed.len(); |
| self.push_completed(buffer); |
| offset as u32 |
| } |
| |
| /// Append a view of the given `block`, `offset` and `length` |
| /// |
| /// # Safety |
| /// (1) The block must have been added using [`Self::append_block`] |
| /// (2) The range `offset..offset+length` must be within the bounds of the block |
| /// (3) The data in the block must be valid of type `T` |
| pub unsafe fn append_view_unchecked(&mut self, block: u32, offset: u32, len: u32) { |
| let b = unsafe { self.completed.get_unchecked(block as usize) }; |
| let start = offset as usize; |
| let end = start.saturating_add(len as usize); |
| let b = unsafe { b.get_unchecked(start..end) }; |
| |
| let view = make_view(b, block, offset); |
| self.views_buffer.push(view); |
| self.null_buffer_builder.append_non_null(); |
| } |
| |
| /// Appends an array to the builder. |
| /// This will flush any in-progress block and append the data buffers |
| /// and add the (adapted) views. |
| pub fn append_array(&mut self, array: &GenericByteViewArray<T>) { |
| self.flush_in_progress(); |
| // keep original views if this array is the first to be added or if there are no data buffers (all inline views) |
| let keep_views = self.completed.is_empty() || array.data_buffers().is_empty(); |
| let starting_buffer = self.completed.len() as u32; |
| |
| self.completed.extend(array.data_buffers().iter().cloned()); |
| |
| if keep_views { |
| self.views_buffer.extend_from_slice(array.views()); |
| } else { |
| self.views_buffer.extend(array.views().iter().map(|v| { |
| let mut byte_view = ByteView::from(*v); |
| if byte_view.length > MAX_INLINE_VIEW_LEN { |
| // Small views (<=12 bytes) are inlined, so only need to update large views |
| byte_view.buffer_index += starting_buffer; |
| }; |
| |
| byte_view.as_u128() |
| })); |
| } |
| |
| if let Some(null_buffer) = array.nulls() { |
| self.null_buffer_builder.append_buffer(null_buffer); |
| } else { |
| self.null_buffer_builder.append_n_non_nulls(array.len()); |
| } |
| } |
| |
| /// Try to append a view of the given `block`, `offset` and `length` |
| /// |
| /// See [`Self::append_block`] |
| pub fn try_append_view(&mut self, block: u32, offset: u32, len: u32) -> Result<(), ArrowError> { |
| let b = self.completed.get(block as usize).ok_or_else(|| { |
| ArrowError::InvalidArgumentError(format!("No block found with index {block}")) |
| })?; |
| let start = offset as usize; |
| let end = start.saturating_add(len as usize); |
| |
| let b = b.get(start..end).ok_or_else(|| { |
| ArrowError::InvalidArgumentError(format!( |
| "Range {start}..{end} out of bounds for block of length {}", |
| b.len() |
| )) |
| })?; |
| |
| if T::Native::from_bytes_checked(b).is_none() { |
| return Err(ArrowError::InvalidArgumentError( |
| "Invalid view data".to_string(), |
| )); |
| } |
| |
| unsafe { |
| self.append_view_unchecked(block, offset, len); |
| } |
| Ok(()) |
| } |
| |
| /// Flushes the in progress block if any |
| #[inline] |
| fn flush_in_progress(&mut self) { |
| if !self.in_progress.is_empty() { |
| let f = Buffer::from_vec(std::mem::take(&mut self.in_progress)); |
| self.push_completed(f) |
| } |
| } |
| |
| /// Append a block to `self.completed`, checking for overflow |
| #[inline] |
| fn push_completed(&mut self, block: Buffer) { |
| assert!(block.len() < u32::MAX as usize, "Block too large"); |
| assert!(self.completed.len() < u32::MAX as usize, "Too many blocks"); |
| self.completed.push(block); |
| } |
| |
| /// Returns the value at the given index |
| /// Useful if we want to know what value has been inserted to the builder |
| /// The index has to be smaller than `self.len()`, otherwise it will panic |
| pub fn get_value(&self, index: usize) -> &[u8] { |
| let view = self.views_buffer.as_slice().get(index).unwrap(); |
| let len = *view as u32; |
| if len <= MAX_INLINE_VIEW_LEN { |
| // # Safety |
| // The view is valid from the builder |
| unsafe { GenericByteViewArray::<T>::inline_value(view, len as usize) } |
| } else { |
| let view = ByteView::from(*view); |
| if view.buffer_index < self.completed.len() as u32 { |
| let block = &self.completed[view.buffer_index as usize]; |
| &block[view.offset as usize..view.offset as usize + view.length as usize] |
| } else { |
| &self.in_progress[view.offset as usize..view.offset as usize + view.length as usize] |
| } |
| } |
| } |
| |
| /// Appends a value into the builder |
| /// |
| /// # Panics |
| /// |
| /// Panics if |
| /// - String buffer count exceeds `u32::MAX` |
| /// - String length exceeds `u32::MAX` |
| #[inline] |
| pub fn append_value(&mut self, value: impl AsRef<T::Native>) { |
| self.try_append_value(value).unwrap() |
| } |
| |
| /// Appends a value into the builder |
| /// |
| /// # Errors |
| /// |
| /// Returns an error if: |
| /// - String buffer count exceeds `u32::MAX` |
| /// - String length exceeds `u32::MAX` |
| #[inline] |
| pub fn try_append_value(&mut self, value: impl AsRef<T::Native>) -> Result<(), ArrowError> { |
| let v: &[u8] = value.as_ref().as_ref(); |
| let length: u32 = v.len().try_into().map_err(|_| { |
| ArrowError::InvalidArgumentError(format!("String length {} exceeds u32::MAX", v.len())) |
| })?; |
| |
| if length <= MAX_INLINE_VIEW_LEN { |
| let mut view_buffer = [0; 16]; |
| view_buffer[0..4].copy_from_slice(&length.to_le_bytes()); |
| view_buffer[4..4 + v.len()].copy_from_slice(v); |
| self.views_buffer.push(u128::from_le_bytes(view_buffer)); |
| self.null_buffer_builder.append_non_null(); |
| return Ok(()); |
| } |
| |
| // Deduplication if: |
| // (1) deduplication is enabled. |
| // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len` |
| let can_deduplicate = self.string_tracker.is_some() |
| && self |
| .max_deduplication_len |
| .map(|max_length| length <= max_length) |
| .unwrap_or(true); |
| if can_deduplicate { |
| if let Some((mut ht, hasher)) = self.string_tracker.take() { |
| let hash_val = hasher.hash_one(v); |
| let hasher_fn = |v: &_| hasher.hash_one(v); |
| |
| let entry = ht.entry( |
| hash_val, |
| |idx| { |
| let stored_value = self.get_value(*idx); |
| v == stored_value |
| }, |
| hasher_fn, |
| ); |
| match entry { |
| Entry::Occupied(occupied) => { |
| // If the string already exists, we will directly use the view |
| let idx = occupied.get(); |
| self.views_buffer.push(self.views_buffer[*idx]); |
| self.null_buffer_builder.append_non_null(); |
| self.string_tracker = Some((ht, hasher)); |
| return Ok(()); |
| } |
| Entry::Vacant(vacant) => { |
| // o.w. we insert the (string hash -> view index) |
| // the idx is current length of views_builder, as we are inserting a new view |
| vacant.insert(self.views_buffer.len()); |
| } |
| } |
| self.string_tracker = Some((ht, hasher)); |
| } |
| } |
| |
| let required_cap = self.in_progress.len() + v.len(); |
| if self.in_progress.capacity() < required_cap { |
| self.flush_in_progress(); |
| let to_reserve = v.len().max(self.block_size.next_size() as usize); |
| self.in_progress.reserve(to_reserve); |
| }; |
| |
| let offset = self.in_progress.len() as u32; |
| self.in_progress.extend_from_slice(v); |
| |
| let buffer_index: u32 = self.completed.len().try_into().map_err(|_| { |
| ArrowError::InvalidArgumentError(format!( |
| "Buffer count {} exceeds u32::MAX", |
| self.completed.len() |
| )) |
| })?; |
| |
| let view = ByteView { |
| length, |
| // This won't panic as we checked the length of prefix earlier. |
| prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()), |
| buffer_index, |
| offset, |
| }; |
| self.views_buffer.push(view.into()); |
| self.null_buffer_builder.append_non_null(); |
| |
| Ok(()) |
| } |
| |
| /// Append an `Option` value into the builder |
| #[inline] |
| pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) { |
| match value { |
| None => self.append_null(), |
| Some(v) => self.append_value(v), |
| }; |
| } |
| |
| /// Append a null value into the builder |
| #[inline] |
| pub fn append_null(&mut self) { |
| self.null_buffer_builder.append_null(); |
| self.views_buffer.push(0); |
| } |
| |
| /// Builds the [`GenericByteViewArray`] and reset this builder |
| pub fn finish(&mut self) -> GenericByteViewArray<T> { |
| self.flush_in_progress(); |
| let completed = std::mem::take(&mut self.completed); |
| let nulls = self.null_buffer_builder.finish(); |
| if let Some((ht, _)) = self.string_tracker.as_mut() { |
| ht.clear(); |
| } |
| let views = std::mem::take(&mut self.views_buffer); |
| // SAFETY: valid by construction |
| unsafe { GenericByteViewArray::new_unchecked(views.into(), completed, nulls) } |
| } |
| |
| /// Builds the [`GenericByteViewArray`] without resetting the builder |
| pub fn finish_cloned(&self) -> GenericByteViewArray<T> { |
| let mut completed = self.completed.clone(); |
| if !self.in_progress.is_empty() { |
| completed.push(Buffer::from_slice_ref(&self.in_progress)); |
| } |
| let len = self.views_buffer.len(); |
| let views = Buffer::from_slice_ref(self.views_buffer.as_slice()); |
| let views = ScalarBuffer::new(views, 0, len); |
| let nulls = self.null_buffer_builder.finish_cloned(); |
| // SAFETY: valid by construction |
| unsafe { GenericByteViewArray::new_unchecked(views, completed, nulls) } |
| } |
| |
| /// Returns the current null buffer as a slice |
| pub fn validity_slice(&self) -> Option<&[u8]> { |
| self.null_buffer_builder.as_slice() |
| } |
| |
| /// Return the allocated size of this builder in bytes, useful for memory accounting. |
| pub fn allocated_size(&self) -> usize { |
| let views = self.views_buffer.capacity() * std::mem::size_of::<u128>(); |
| let null = self.null_buffer_builder.allocated_size(); |
| let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>(); |
| let in_progress = self.in_progress.capacity(); |
| let tracker = match &self.string_tracker { |
| Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(), |
| None => 0, |
| }; |
| buffer_size + in_progress + tracker + views + null |
| } |
| } |
| |
| impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> { |
| fn default() -> Self { |
| Self::new() |
| } |
| } |
| |
| impl<T: ByteViewType + ?Sized> std::fmt::Debug for GenericByteViewBuilder<T> { |
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { |
| write!(f, "{}ViewBuilder", T::PREFIX)?; |
| f.debug_struct("") |
| .field("views_buffer", &self.views_buffer) |
| .field("in_progress", &self.in_progress) |
| .field("completed", &self.completed) |
| .field("null_buffer_builder", &self.null_buffer_builder) |
| .finish() |
| } |
| } |
| |
| impl<T: ByteViewType + ?Sized> ArrayBuilder for GenericByteViewBuilder<T> { |
| fn len(&self) -> usize { |
| self.null_buffer_builder.len() |
| } |
| |
| fn finish(&mut self) -> ArrayRef { |
| Arc::new(self.finish()) |
| } |
| |
| fn finish_cloned(&self) -> ArrayRef { |
| Arc::new(self.finish_cloned()) |
| } |
| |
| fn as_any(&self) -> &dyn Any { |
| self |
| } |
| |
| fn as_any_mut(&mut self) -> &mut dyn Any { |
| self |
| } |
| |
| fn into_box_any(self: Box<Self>) -> Box<dyn Any> { |
| self |
| } |
| } |
| |
| impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>> |
| for GenericByteViewBuilder<T> |
| { |
| #[inline] |
| fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) { |
| for v in iter { |
| self.append_option(v) |
| } |
| } |
| } |
| |
| /// Array builder for [`StringViewArray`][crate::StringViewArray] |
| /// |
| /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with |
| /// [`GenericByteViewBuilder::append_null`] as normal. |
| /// |
| /// # Example |
| /// ``` |
| /// # use arrow_array::builder::StringViewBuilder; |
| /// # use arrow_array::StringViewArray; |
| /// let mut builder = StringViewBuilder::new(); |
| /// builder.append_value("hello"); |
| /// builder.append_null(); |
| /// builder.append_value("world"); |
| /// let array = builder.finish(); |
| /// |
| /// let expected = vec![Some("hello"), None, Some("world")]; |
| /// let actual: Vec<_> = array.iter().collect(); |
| /// assert_eq!(expected, actual); |
| /// ``` |
| pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>; |
| |
| impl StringLikeArrayBuilder for StringViewBuilder { |
| fn type_name() -> &'static str { |
| std::any::type_name::<StringViewBuilder>() |
| } |
| fn with_capacity(capacity: usize) -> Self { |
| Self::with_capacity(capacity) |
| } |
| fn append_value(&mut self, value: &str) { |
| Self::append_value(self, value); |
| } |
| fn append_null(&mut self) { |
| Self::append_null(self); |
| } |
| } |
| |
| /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] |
| /// |
| /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with |
| /// [`GenericByteViewBuilder::append_null`] as normal. |
| /// |
| /// # Example |
| /// ``` |
| /// # use arrow_array::builder::BinaryViewBuilder; |
| /// use arrow_array::BinaryViewArray; |
| /// let mut builder = BinaryViewBuilder::new(); |
| /// builder.append_value("hello"); |
| /// builder.append_null(); |
| /// builder.append_value("world"); |
| /// let array = builder.finish(); |
| /// |
| /// let expected: Vec<Option<&[u8]>> = vec![Some(b"hello"), None, Some(b"world")]; |
| /// let actual: Vec<_> = array.iter().collect(); |
| /// assert_eq!(expected, actual); |
| /// ``` |
| /// |
| pub type BinaryViewBuilder = GenericByteViewBuilder<BinaryViewType>; |
| |
| impl BinaryLikeArrayBuilder for BinaryViewBuilder { |
| fn type_name() -> &'static str { |
| std::any::type_name::<BinaryViewBuilder>() |
| } |
| fn with_capacity(capacity: usize) -> Self { |
| Self::with_capacity(capacity) |
| } |
| fn append_value(&mut self, value: &[u8]) { |
| Self::append_value(self, value); |
| } |
| fn append_null(&mut self) { |
| Self::append_null(self); |
| } |
| } |
| |
| /// Creates a view from a fixed length input (the compiler can generate |
| /// specialized code for this) |
| fn make_inlined_view<const LEN: usize>(data: &[u8]) -> u128 { |
| let mut view_buffer = [0; 16]; |
| view_buffer[0..4].copy_from_slice(&(LEN as u32).to_le_bytes()); |
| view_buffer[4..4 + LEN].copy_from_slice(&data[..LEN]); |
| u128::from_le_bytes(view_buffer) |
| } |
| |
| /// Create a view based on the given data, block id and offset. |
| /// |
| /// Note that the code below is carefully examined with x86_64 assembly code: <https://godbolt.org/z/685YPsd5G> |
| /// The goal is to avoid calling into `ptr::copy_non_interleave`, which makes function call (i.e., not inlined), |
| /// which slows down things. |
| #[inline(never)] |
| pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { |
| let len = data.len(); |
| |
| // Generate specialized code for each potential small string length |
| // to improve performance |
| match len { |
| 0 => make_inlined_view::<0>(data), |
| 1 => make_inlined_view::<1>(data), |
| 2 => make_inlined_view::<2>(data), |
| 3 => make_inlined_view::<3>(data), |
| 4 => make_inlined_view::<4>(data), |
| 5 => make_inlined_view::<5>(data), |
| 6 => make_inlined_view::<6>(data), |
| 7 => make_inlined_view::<7>(data), |
| 8 => make_inlined_view::<8>(data), |
| 9 => make_inlined_view::<9>(data), |
| 10 => make_inlined_view::<10>(data), |
| 11 => make_inlined_view::<11>(data), |
| 12 => make_inlined_view::<12>(data), |
| // When string is longer than 12 bytes, it can't be inlined, we create a ByteView instead. |
| _ => { |
| let view = ByteView { |
| length: len as u32, |
| prefix: u32::from_le_bytes(data[0..4].try_into().unwrap()), |
| buffer_index: block_id, |
| offset, |
| }; |
| view.as_u128() |
| } |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use core::str; |
| |
| use arrow_buffer::ArrowNativeType; |
| |
| use super::*; |
| |
| #[test] |
| fn test_string_max_deduplication_len() { |
| let value_1 = "short"; |
| let value_2 = "not so similar string but long"; |
| let value_3 = "1234567890123"; |
| |
| let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2; |
| |
| let mut builder = StringViewBuilder::new() |
| .with_deduplicate_strings() |
| .with_max_deduplication_len(max_deduplication_len); |
| |
| assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize()); |
| assert!(value_2.len() > max_deduplication_len.as_usize()); |
| assert!( |
| value_3.len() > MAX_INLINE_VIEW_LEN.as_usize() |
| && value_3.len() < max_deduplication_len.as_usize() |
| ); |
| |
| // append value1 (short), expect it is inlined and not deduplicated |
| builder.append_value(value_1); // view 0 |
| builder.append_value(value_1); // view 1 |
| // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len |
| builder.append_value(value_2); // view 2 |
| builder.append_value(value_2); // view 3 |
| // append value3, expect second copy is deduplicated |
| builder.append_value(value_3); // view 4 |
| builder.append_value(value_3); // view 5 |
| |
| let array = builder.finish(); |
| |
| // verify |
| let v2 = ByteView::from(array.views()[2]); |
| let v3 = ByteView::from(array.views()[3]); |
| assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer |
| assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated |
| |
| let v4 = ByteView::from(array.views()[4]); |
| let v5 = ByteView::from(array.views()[5]); |
| assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer |
| assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated |
| } |
| |
| #[test] |
| fn test_string_view_deduplicate() { |
| let value_1 = "long string to test string view"; |
| let value_2 = "not so similar string but long"; |
| |
| let mut builder = StringViewBuilder::new() |
| .with_deduplicate_strings() |
| .with_fixed_block_size(value_1.len() as u32 * 2); // so that we will have multiple buffers |
| |
| let values = vec![ |
| Some(value_1), |
| Some(value_2), |
| Some("short"), |
| Some(value_1), |
| None, |
| Some(value_2), |
| Some(value_1), |
| ]; |
| builder.extend(values.clone()); |
| |
| let array = builder.finish_cloned(); |
| array.to_data().validate_full().unwrap(); |
| assert_eq!(array.data_buffers().len(), 1); // without duplication we would need 3 buffers. |
| let actual: Vec<_> = array.iter().collect(); |
| assert_eq!(actual, values); |
| |
| let view0 = array.views().first().unwrap(); |
| let view3 = array.views().get(3).unwrap(); |
| let view6 = array.views().get(6).unwrap(); |
| |
| assert_eq!(view0, view3); |
| assert_eq!(view0, view6); |
| |
| assert_eq!(array.views().get(1), array.views().get(5)); |
| } |
| |
| #[test] |
| fn test_string_view_deduplicate_after_finish() { |
| let mut builder = StringViewBuilder::new().with_deduplicate_strings(); |
| |
| let value_1 = "long string to test string view"; |
| let value_2 = "not so similar string but long"; |
| builder.append_value(value_1); |
| let _array = builder.finish(); |
| builder.append_value(value_2); |
| let _array = builder.finish(); |
| builder.append_value(value_1); |
| let _array = builder.finish(); |
| } |
| |
| #[test] |
| fn test_string_view() { |
| let b1 = Buffer::from(b"world\xFFbananas\xF0\x9F\x98\x81"); |
| let b2 = Buffer::from(b"cupcakes"); |
| let b3 = Buffer::from(b"Many strings are here contained of great length and verbosity"); |
| |
| let mut v = StringViewBuilder::new(); |
| assert_eq!(v.append_block(b1), 0); |
| |
| v.append_value("This is a very long string that exceeds the inline length"); |
| v.append_value("This is another very long string that exceeds the inline length"); |
| |
| assert_eq!(v.append_block(b2), 2); |
| assert_eq!(v.append_block(b3), 3); |
| |
| // Test short strings |
| v.try_append_view(0, 0, 5).unwrap(); // world |
| v.try_append_view(0, 6, 7).unwrap(); // bananas |
| v.try_append_view(2, 3, 5).unwrap(); // cake |
| v.try_append_view(2, 0, 3).unwrap(); // cup |
| v.try_append_view(2, 0, 8).unwrap(); // cupcakes |
| v.try_append_view(0, 13, 4).unwrap(); // 😁 |
| v.try_append_view(0, 13, 0).unwrap(); // |
| |
| // Test longer strings |
| v.try_append_view(3, 0, 16).unwrap(); // Many strings are |
| v.try_append_view(1, 0, 19).unwrap(); // This is a very long |
| v.try_append_view(3, 13, 27).unwrap(); // here contained of great length |
| |
| v.append_value("I do so like long strings"); |
| |
| let array = v.finish_cloned(); |
| array.to_data().validate_full().unwrap(); |
| assert_eq!(array.data_buffers().len(), 5); |
| let actual: Vec<_> = array.iter().flatten().collect(); |
| assert_eq!( |
| actual, |
| &[ |
| "This is a very long string that exceeds the inline length", |
| "This is another very long string that exceeds the inline length", |
| "world", |
| "bananas", |
| "cakes", |
| "cup", |
| "cupcakes", |
| "😁", |
| "", |
| "Many strings are", |
| "This is a very long", |
| "are here contained of great", |
| "I do so like long strings" |
| ] |
| ); |
| |
| let err = v.try_append_view(0, u32::MAX, 1).unwrap_err(); |
| assert_eq!( |
| err.to_string(), |
| "Invalid argument error: Range 4294967295..4294967296 out of bounds for block of length 17" |
| ); |
| |
| let err = v.try_append_view(0, 1, u32::MAX).unwrap_err(); |
| assert_eq!( |
| err.to_string(), |
| "Invalid argument error: Range 1..4294967296 out of bounds for block of length 17" |
| ); |
| |
| let err = v.try_append_view(0, 13, 2).unwrap_err(); |
| assert_eq!(err.to_string(), "Invalid argument error: Invalid view data"); |
| |
| let err = v.try_append_view(0, 40, 0).unwrap_err(); |
| assert_eq!( |
| err.to_string(), |
| "Invalid argument error: Range 40..40 out of bounds for block of length 17" |
| ); |
| |
| let err = v.try_append_view(5, 0, 0).unwrap_err(); |
| assert_eq!( |
| err.to_string(), |
| "Invalid argument error: No block found with index 5" |
| ); |
| } |
| |
| #[test] |
| fn test_string_view_with_block_size_growth() { |
| let mut exp_builder = StringViewBuilder::new(); |
| let mut fixed_builder = StringViewBuilder::new().with_fixed_block_size(STARTING_BLOCK_SIZE); |
| |
| let long_string = str::from_utf8(&[b'a'; STARTING_BLOCK_SIZE as usize]).unwrap(); |
| |
| for i in 0..9 { |
| // 8k, 16k, 32k, 64k, 128k, 256k, 512k, 1M, 2M |
| for _ in 0..(2_u32.pow(i)) { |
| exp_builder.append_value(long_string); |
| fixed_builder.append_value(long_string); |
| } |
| exp_builder.flush_in_progress(); |
| fixed_builder.flush_in_progress(); |
| |
| // Every step only add one buffer, but the buffer size is much larger |
| assert_eq!(exp_builder.completed.len(), i as usize + 1); |
| assert_eq!( |
| exp_builder.completed[i as usize].len(), |
| STARTING_BLOCK_SIZE as usize * 2_usize.pow(i) |
| ); |
| |
| // This step we added 2^i blocks, the sum of blocks should be 2^(i+1) - 1 |
| assert_eq!(fixed_builder.completed.len(), 2_usize.pow(i + 1) - 1); |
| |
| // Every buffer is fixed size |
| assert!( |
| fixed_builder |
| .completed |
| .iter() |
| .all(|b| b.len() == STARTING_BLOCK_SIZE as usize) |
| ); |
| } |
| |
| // Add one more value, and the buffer stop growing. |
| exp_builder.append_value(long_string); |
| exp_builder.flush_in_progress(); |
| assert_eq!( |
| exp_builder.completed.last().unwrap().capacity(), |
| MAX_BLOCK_SIZE as usize |
| ); |
| } |
| } |