| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| use crate::buffer::ScalarBuffer; |
| use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder}; |
| use std::ops::Deref; |
| |
| /// A non-empty buffer of monotonically increasing, positive integers. |
| /// |
| /// [`OffsetBuffer`] are used to represent ranges of offsets. An |
| /// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start |
| /// offset for element `i` is `offsets[i]` and the end offset is |
| /// `offsets[i+1]`. Equal offsets represent an empty range. |
| /// |
| /// # Example |
| /// |
| /// This example shows how 5 distinct ranges, are represented using a |
| /// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the |
| /// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets |
| /// (e.g. an empty list). |
| /// |
| /// ```text |
| /// ┌───────┐ ┌───┐ |
| /// │ (0,3) │ │ 0 │ |
| /// ├───────┤ ├───┤ |
| /// │ (3,3) │ │ 3 │ |
| /// ├───────┤ ├───┤ |
| /// │ (3,4) │ │ 3 │ |
| /// ├───────┤ ├───┤ |
| /// │ (4,5) │ │ 4 │ |
| /// ├───────┤ ├───┤ |
| /// │ (5,7) │ │ 5 │ |
| /// └───────┘ ├───┤ |
| /// │ 7 │ |
| /// └───┘ |
| /// |
| /// Offsets Buffer |
| /// Logical |
| /// Offsets |
| /// |
| /// (offsets[i], |
| /// offsets[i+1]) |
| /// ``` |
| #[derive(Debug, Clone, PartialEq, Eq)] |
| pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>); |
| |
| impl<O: ArrowNativeType> OffsetBuffer<O> { |
| /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] |
| /// |
| /// # Panics |
| /// |
| /// Panics if `buffer` is not a non-empty buffer containing |
| /// monotonically increasing values greater than or equal to zero |
| pub fn new(buffer: ScalarBuffer<O>) -> Self { |
| assert!(!buffer.is_empty(), "offsets cannot be empty"); |
| assert!( |
| buffer[0] >= O::usize_as(0), |
| "offsets must be greater than 0" |
| ); |
| assert!( |
| buffer.windows(2).all(|w| w[0] <= w[1]), |
| "offsets must be monotonically increasing" |
| ); |
| Self(buffer) |
| } |
| |
| /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`] |
| /// |
| /// # Safety |
| /// |
| /// `buffer` must be a non-empty buffer containing monotonically increasing |
| /// values greater than or equal to zero |
| pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self { |
| Self(buffer) |
| } |
| |
| /// Create a new [`OffsetBuffer`] containing a single 0 value |
| pub fn new_empty() -> Self { |
| let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>()); |
| Self(buffer.into_buffer().into()) |
| } |
| |
| /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values |
| pub fn new_zeroed(len: usize) -> Self { |
| let len_bytes = len |
| .checked_add(1) |
| .and_then(|o| o.checked_mul(std::mem::size_of::<O>())) |
| .expect("overflow"); |
| let buffer = MutableBuffer::from_len_zeroed(len_bytes); |
| Self(buffer.into_buffer().into()) |
| } |
| |
| /// Create a new [`OffsetBuffer`] from the iterator of slice lengths |
| /// |
| /// ``` |
| /// # use arrow_buffer::OffsetBuffer; |
| /// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]); |
| /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]); |
| /// ``` |
| /// |
| /// If you want to create an [`OffsetBuffer`] where all lengths are the same, |
| /// consider using the faster [`OffsetBuffer::from_repeated_length`] instead. |
| /// |
| /// # Panics |
| /// |
| /// Panics on overflow |
| pub fn from_lengths<I>(lengths: I) -> Self |
| where |
| I: IntoIterator<Item = usize>, |
| { |
| let iter = lengths.into_iter(); |
| let mut out = Vec::with_capacity(iter.size_hint().0 + 1); |
| out.push(O::usize_as(0)); |
| |
| let mut acc = 0_usize; |
| for length in iter { |
| acc = acc.checked_add(length).expect("usize overflow"); |
| out.push(O::usize_as(acc)) |
| } |
| // Check for overflow |
| O::from_usize(acc).expect("offset overflow"); |
| Self(out.into()) |
| } |
| |
| /// Create a new [`OffsetBuffer`] where each slice has the same length |
| /// `length`, repeated `n` times. |
| /// |
| /// |
| /// Example |
| /// ``` |
| /// # use arrow_buffer::OffsetBuffer; |
| /// let offsets = OffsetBuffer::<i32>::from_repeated_length(4, 3); |
| /// assert_eq!(offsets.as_ref(), &[0, 4, 8, 12]); |
| /// ``` |
| /// |
| /// # Panics |
| /// |
| /// Panics on overflow |
| pub fn from_repeated_length(length: usize, n: usize) -> Self { |
| if n == 0 { |
| return Self::new_empty(); |
| } |
| |
| if length == 0 { |
| return Self::new_zeroed(n); |
| } |
| |
| // Check for overflow |
| // Making sure we don't overflow usize or O when calculating the total length |
| length.checked_mul(n).expect("usize overflow"); |
| |
| // Check for overflow |
| O::from_usize(length * n).expect("offset overflow"); |
| |
| let offsets = (0..=n) |
| .map(|index| O::usize_as(index * length)) |
| .collect::<Vec<O>>(); |
| |
| Self(ScalarBuffer::from(offsets)) |
| } |
| |
| /// Get an Iterator over the lengths of this [`OffsetBuffer`] |
| /// |
| /// ``` |
| /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; |
| /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])); |
| /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]); |
| /// ``` |
| /// |
| /// Empty [`OffsetBuffer`] will return an empty iterator |
| /// ``` |
| /// # use arrow_buffer::OffsetBuffer; |
| /// let offsets = OffsetBuffer::<i32>::new_empty(); |
| /// assert_eq!(offsets.lengths().count(), 0); |
| /// ``` |
| /// |
| /// This can be used to merge multiple [`OffsetBuffer`]s to one |
| /// ``` |
| /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer}; |
| /// |
| /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]); |
| /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]); |
| /// |
| /// let merged = OffsetBuffer::<i32>::from_lengths( |
| /// vec![buffer1, buffer2].iter().flat_map(|x| x.lengths()) |
| /// ); |
| /// |
| /// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]); |
| /// ``` |
| pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ { |
| self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize()) |
| } |
| |
| /// Free up unused memory. |
| pub fn shrink_to_fit(&mut self) { |
| self.0.shrink_to_fit(); |
| } |
| |
| /// Returns the inner [`ScalarBuffer`] |
| pub fn inner(&self) -> &ScalarBuffer<O> { |
| &self.0 |
| } |
| |
| /// Returns the inner [`ScalarBuffer`], consuming self |
| pub fn into_inner(self) -> ScalarBuffer<O> { |
| self.0 |
| } |
| |
| /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset` |
| pub fn slice(&self, offset: usize, len: usize) -> Self { |
| Self(self.0.slice(offset, len.saturating_add(1))) |
| } |
| |
| /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons |
| /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may |
| /// return false when the arrays are logically equal |
| #[inline] |
| pub fn ptr_eq(&self, other: &Self) -> bool { |
| self.0.ptr_eq(&other.0) |
| } |
| } |
| |
| impl<T: ArrowNativeType> Deref for OffsetBuffer<T> { |
| type Target = [T]; |
| |
| #[inline] |
| fn deref(&self) -> &Self::Target { |
| &self.0 |
| } |
| } |
| |
| impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> { |
| #[inline] |
| fn as_ref(&self) -> &[T] { |
| self |
| } |
| } |
| |
| impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> { |
| fn from(value: OffsetBufferBuilder<O>) -> Self { |
| value.finish() |
| } |
| } |
| |
| impl<O: ArrowNativeType> Default for OffsetBuffer<O> { |
| fn default() -> Self { |
| Self::new_empty() |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| #[test] |
| #[should_panic(expected = "offsets cannot be empty")] |
| fn empty_offsets() { |
| OffsetBuffer::new(Vec::<i32>::new().into()); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offsets must be greater than 0")] |
| fn negative_offsets() { |
| OffsetBuffer::new(vec![-1, 0, 1].into()); |
| } |
| |
| #[test] |
| fn offsets() { |
| OffsetBuffer::new(vec![0, 1, 2, 3].into()); |
| |
| let offsets = OffsetBuffer::<i32>::new_zeroed(3); |
| assert_eq!(offsets.as_ref(), &[0; 4]); |
| |
| let offsets = OffsetBuffer::<i32>::new_zeroed(0); |
| assert_eq!(offsets.as_ref(), &[0; 1]); |
| } |
| |
| #[test] |
| #[should_panic(expected = "overflow")] |
| fn offsets_new_zeroed_overflow() { |
| OffsetBuffer::<i32>::new_zeroed(usize::MAX); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offsets must be monotonically increasing")] |
| fn non_monotonic_offsets() { |
| OffsetBuffer::new(vec![1, 2, 0].into()); |
| } |
| |
| #[test] |
| fn from_lengths() { |
| let buffer = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]); |
| assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]); |
| |
| let half_max = i32::MAX / 2; |
| let buffer = OffsetBuffer::<i32>::from_lengths([half_max as usize, half_max as usize]); |
| assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offset overflow")] |
| fn from_lengths_offset_overflow() { |
| OffsetBuffer::<i32>::from_lengths([i32::MAX as usize, 1]); |
| } |
| |
| #[test] |
| #[should_panic(expected = "usize overflow")] |
| fn from_lengths_usize_overflow() { |
| OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offset overflow")] |
| fn from_repeated_lengths_offset_length_overflow() { |
| OffsetBuffer::<i32>::from_repeated_length(i32::MAX as usize / 4, 5); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offset overflow")] |
| fn from_repeated_lengths_offset_repeat_overflow() { |
| OffsetBuffer::<i32>::from_repeated_length(1, i32::MAX as usize + 1); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offset overflow")] |
| fn from_repeated_lengths_usize_length_overflow() { |
| OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 1); |
| } |
| |
| #[test] |
| #[should_panic(expected = "usize overflow")] |
| fn from_repeated_lengths_usize_length_usize_overflow() { |
| OffsetBuffer::<i32>::from_repeated_length(usize::MAX, 2); |
| } |
| |
| #[test] |
| #[should_panic(expected = "offset overflow")] |
| fn from_repeated_lengths_usize_repeat_overflow() { |
| OffsetBuffer::<i32>::from_repeated_length(1, usize::MAX); |
| } |
| |
| #[test] |
| fn get_lengths() { |
| let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])); |
| assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]); |
| } |
| |
| #[test] |
| fn get_lengths_should_be_with_fixed_size() { |
| let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])); |
| let iter = offsets.lengths(); |
| assert_eq!(iter.size_hint(), (3, Some(3))); |
| assert_eq!(iter.len(), 3); |
| } |
| |
| #[test] |
| fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() { |
| let offsets = OffsetBuffer::<i32>::new_empty(); |
| assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]); |
| } |
| |
| #[test] |
| fn impl_eq() { |
| fn are_equal<T: Eq>(a: &T, b: &T) -> bool { |
| a.eq(b) |
| } |
| |
| assert!( |
| are_equal( |
| &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])), |
| &OffsetBuffer::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9])) |
| ), |
| "OffsetBuffer should implement Eq." |
| ); |
| } |
| |
| #[test] |
| fn impl_default() { |
| let default = OffsetBuffer::<i32>::default(); |
| assert_eq!(default.as_ref(), &[0]); |
| } |
| |
| #[test] |
| fn from_repeated_length_basic() { |
| // Basic case with length 4, repeated 3 times |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(4, 3); |
| assert_eq!(buffer.as_ref(), &[0, 4, 8, 12]); |
| |
| // Verify the lengths are correct |
| let lengths: Vec<usize> = buffer.lengths().collect(); |
| assert_eq!(lengths, vec![4, 4, 4]); |
| } |
| |
| #[test] |
| fn from_repeated_length_single_repeat() { |
| // Length 5, repeated once |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(5, 1); |
| assert_eq!(buffer.as_ref(), &[0, 5]); |
| |
| let lengths: Vec<usize> = buffer.lengths().collect(); |
| assert_eq!(lengths, vec![5]); |
| } |
| |
| #[test] |
| fn from_repeated_length_zero_repeats() { |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(10, 0); |
| assert_eq!(buffer, OffsetBuffer::<i32>::new_empty()); |
| } |
| |
| #[test] |
| fn from_repeated_length_zero_length() { |
| // Zero length, repeated 5 times (all zeros) |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(0, 5); |
| assert_eq!(buffer.as_ref(), &[0, 0, 0, 0, 0, 0]); |
| |
| // All lengths should be 0 |
| let lengths: Vec<usize> = buffer.lengths().collect(); |
| assert_eq!(lengths, vec![0, 0, 0, 0, 0]); |
| } |
| |
| #[test] |
| fn from_repeated_length_large_values() { |
| // Test with larger values that don't overflow |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(1000, 100); |
| assert_eq!(buffer[0], 0); |
| |
| // Verify all lengths are 1000 |
| let lengths: Vec<usize> = buffer.lengths().collect(); |
| assert_eq!(lengths.len(), 100); |
| assert!(lengths.iter().all(|&len| len == 1000)); |
| } |
| |
| #[test] |
| fn from_repeated_length_unit_length() { |
| // Length 1, repeated multiple times |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(1, 10); |
| assert_eq!(buffer.as_ref(), &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); |
| |
| let lengths: Vec<usize> = buffer.lengths().collect(); |
| assert_eq!(lengths, vec![1; 10]); |
| } |
| |
| #[test] |
| fn from_repeated_length_max_safe_values() { |
| // Test with maximum safe values for i32 |
| // i32::MAX / 3 ensures we don't overflow when repeated twice |
| let third_max = (i32::MAX / 3) as usize; |
| let buffer = OffsetBuffer::<i32>::from_repeated_length(third_max, 2); |
| assert_eq!( |
| buffer.as_ref(), |
| &[0, third_max as i32, (third_max * 2) as i32] |
| ); |
| } |
| } |