blob: af87263e1edd53dc5bf72a5bffed7a4563c8d7e9 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::bit_chunk_iterator::BitChunks;
use crate::bit_iterator::{BitIndexIterator, BitIndexU32Iterator, BitIterator, BitSliceIterator};
use crate::{
BooleanBufferBuilder, Buffer, MutableBuffer, bit_util, buffer_bin_and, buffer_bin_or,
buffer_bin_xor, buffer_unary_not,
};
use std::ops::{BitAnd, BitOr, BitXor, Not};
/// A slice-able [`Buffer`] containing bit-packed booleans
///
/// `BooleanBuffer`s can be modified using [`BooleanBufferBuilder`]
///
///
/// # See Also
/// * [`NullBuffer`] for representing null values in Arrow arrays
///
/// [`NullBuffer`]: crate::NullBuffer
#[derive(Debug, Clone, Eq)]
pub struct BooleanBuffer {
buffer: Buffer,
offset: usize,
len: usize,
}
impl PartialEq for BooleanBuffer {
fn eq(&self, other: &Self) -> bool {
if self.len != other.len {
return false;
}
let lhs = self.bit_chunks().iter_padded();
let rhs = other.bit_chunks().iter_padded();
lhs.zip(rhs).all(|(a, b)| a == b)
}
}
impl BooleanBuffer {
/// Create a new [`BooleanBuffer`] from a [`Buffer`], an `offset` and `length` in bits
///
/// # Panics
///
/// This method will panic if `buffer` is not large enough
pub fn new(buffer: Buffer, offset: usize, len: usize) -> Self {
let total_len = offset.saturating_add(len);
let buffer_len = buffer.len();
let bit_len = buffer_len.saturating_mul(8);
assert!(
total_len <= bit_len,
"buffer not large enough (offset: {offset}, len: {len}, buffer_len: {buffer_len})"
);
Self {
buffer,
offset,
len,
}
}
/// Create a new [`BooleanBuffer`] of `length` where all values are `true`
pub fn new_set(length: usize) -> Self {
let mut builder = BooleanBufferBuilder::new(length);
builder.append_n(length, true);
builder.finish()
}
/// Create a new [`BooleanBuffer`] of `length` where all values are `false`
pub fn new_unset(length: usize) -> Self {
let buffer = MutableBuffer::new_null(length).into_buffer();
Self {
buffer,
offset: 0,
len: length,
}
}
/// Invokes `f` with indexes `0..len` collecting the boolean results into a new `BooleanBuffer`
pub fn collect_bool<F: FnMut(usize) -> bool>(len: usize, f: F) -> Self {
let buffer = MutableBuffer::collect_bool(len, f);
Self::new(buffer.into(), 0, len)
}
/// Create a new [`BooleanBuffer`] by copying the relevant bits from an
/// input buffer.
///
/// # Notes:
/// * The new `BooleanBuffer` has zero offset, even if `offset_in_bits` is non-zero
///
/// # Example: Create a new [`BooleanBuffer`] copying a bit slice from in input slice
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let input = [0b11001100u8, 0b10111010u8];
/// // // Copy bits 4..16 from input
/// let result = BooleanBuffer::from_bits(&input, 4, 12);
/// assert_eq!(result.values(), &[0b10101100u8, 0b00001011u8]);
pub fn from_bits(src: impl AsRef<[u8]>, offset_in_bits: usize, len_in_bits: usize) -> Self {
Self::from_bitwise_unary_op(src, offset_in_bits, len_in_bits, |a| a)
}
/// Create a new [`BooleanBuffer`] by applying the bitwise operation to `op`
/// to an input buffer.
///
/// This function is faster than applying the operation bit by bit as
/// it processes input buffers in chunks of 64 bits (8 bytes) at a time
///
/// # Notes:
/// * `op` takes a single `u64` inputs and produces one `u64` output.
/// * `op` must only apply bitwise operations
/// on the relevant bits; the input `u64` may contain irrelevant bits
/// and may be processed differently on different endian architectures.
/// * The output always has zero offset
///
/// # See Also
/// - [`apply_bitwise_unary_op`](bit_util::apply_bitwise_unary_op) for in-place unary bitwise operations
///
/// # Example: Create new [`BooleanBuffer`] from bitwise `NOT` of an input [`Buffer`]
/// ```
/// # use arrow_buffer::BooleanBuffer;
/// let input = [0b11001100u8, 0b10111010u8]; // 2 bytes = 16 bits
/// // NOT of the first 12 bits
/// let result = BooleanBuffer::from_bitwise_unary_op(
/// &input, 0, 12, |a| !a
/// );
/// assert_eq!(result.values(), &[0b00110011u8, 0b11110101u8]);
/// ```
pub fn from_bitwise_unary_op<F>(
src: impl AsRef<[u8]>,
offset_in_bits: usize,
len_in_bits: usize,
mut op: F,
) -> Self
where
F: FnMut(u64) -> u64,
{
// try fast path for aligned input
if offset_in_bits & 0x7 == 0 {
// align to byte boundary
let aligned = &src.as_ref()[offset_in_bits / 8..];
if let Some(result) =
Self::try_from_aligned_bitwise_unary_op(aligned, len_in_bits, &mut op)
{
return result;
}
}
let chunks = BitChunks::new(src.as_ref(), offset_in_bits, len_in_bits);
let mut result = MutableBuffer::with_capacity(chunks.num_u64s() * 8);
for chunk in chunks.iter() {
// SAFETY: reserved enough capacity above, (exactly num_u64s()
// items) and we assume `BitChunks` correctly reports upper bound
unsafe {
result.push_unchecked(op(chunk));
}
}
if chunks.remainder_len() > 0 {
debug_assert!(result.capacity() >= result.len() + 8); // should not reallocate
// SAFETY: reserved enough capacity above, (exactly num_u64s()
// items) and we assume `BitChunks` correctly reports upper bound
unsafe {
result.push_unchecked(op(chunks.remainder_bits()));
}
// Just pushed one u64, which may have trailing zeros
result.truncate(chunks.num_bytes());
}
let buffer = Buffer::from(result);
BooleanBuffer {
buffer,
offset: 0,
len: len_in_bits,
}
}
/// Fast path for [`Self::from_bitwise_unary_op`] when input is aligned to
/// 8-byte (64-bit) boundaries
///
/// Returns None if the fast path cannot be taken
fn try_from_aligned_bitwise_unary_op<F>(
src: &[u8],
len_in_bits: usize,
op: &mut F,
) -> Option<Self>
where
F: FnMut(u64) -> u64,
{
// Safety: all valid bytes are valid u64s
let (prefix, aligned_u6us, suffix) = unsafe { src.align_to::<u64>() };
if !(prefix.is_empty() && suffix.is_empty()) {
// Couldn't make this case any faster than the default path, see
// https://github.com/apache/arrow-rs/pull/8996/changes#r2620022082
return None;
}
// the buffer is word (64 bit) aligned, so use optimized Vec code.
let result_u64s: Vec<u64> = aligned_u6us.iter().map(|l| op(*l)).collect();
let buffer = Buffer::from(result_u64s);
Some(BooleanBuffer::new(buffer, 0, len_in_bits))
}
/// Returns the number of set bits in this buffer
pub fn count_set_bits(&self) -> usize {
self.buffer.count_set_bits_offset(self.offset, self.len)
}
/// Returns a [`BitChunks`] instance which can be used to iterate over
/// this buffer's bits in `u64` chunks
#[inline]
pub fn bit_chunks(&self) -> BitChunks<'_> {
BitChunks::new(self.values(), self.offset, self.len)
}
/// Returns the offset of this [`BooleanBuffer`] in bits
#[inline]
pub fn offset(&self) -> usize {
self.offset
}
/// Returns the length of this [`BooleanBuffer`] in bits
#[inline]
pub fn len(&self) -> usize {
self.len
}
/// Returns true if this [`BooleanBuffer`] is empty
#[inline]
pub fn is_empty(&self) -> bool {
self.len == 0
}
/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
// TODO(emilk): we could shrink even more in the case where we are a small sub-slice of the full buffer
self.buffer.shrink_to_fit();
}
/// Returns the boolean value at index `i`.
///
/// # Panics
///
/// Panics if `i >= self.len()`
#[inline]
pub fn value(&self, idx: usize) -> bool {
assert!(idx < self.len);
unsafe { self.value_unchecked(idx) }
}
/// Returns the boolean value at index `i`.
///
/// # Safety
/// This doesn't check bounds, the caller must ensure that index < self.len()
#[inline]
pub unsafe fn value_unchecked(&self, i: usize) -> bool {
unsafe { bit_util::get_bit_raw(self.buffer.as_ptr(), i + self.offset) }
}
/// Returns the packed values of this [`BooleanBuffer`] not including any offset
#[inline]
pub fn values(&self) -> &[u8] {
&self.buffer
}
/// Slices this [`BooleanBuffer`] by the provided `offset` and `length`
pub fn slice(&self, offset: usize, len: usize) -> Self {
assert!(
offset.saturating_add(len) <= self.len,
"the length + offset of the sliced BooleanBuffer cannot exceed the existing length"
);
Self {
buffer: self.buffer.clone(),
offset: self.offset + offset,
len,
}
}
/// Returns a [`Buffer`] containing the sliced contents of this [`BooleanBuffer`]
///
/// Equivalent to `self.buffer.bit_slice(self.offset, self.len)`
pub fn sliced(&self) -> Buffer {
self.buffer.bit_slice(self.offset, self.len)
}
/// Returns true if this [`BooleanBuffer`] is equal to `other`, using pointer comparisons
/// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
/// return false when the arrays are logically equal
pub fn ptr_eq(&self, other: &Self) -> bool {
self.buffer.as_ptr() == other.buffer.as_ptr()
&& self.offset == other.offset
&& self.len == other.len
}
/// Returns the inner [`Buffer`]
#[inline]
pub fn inner(&self) -> &Buffer {
&self.buffer
}
/// Returns the inner [`Buffer`], consuming self
pub fn into_inner(self) -> Buffer {
self.buffer
}
/// Returns an iterator over the bits in this [`BooleanBuffer`]
pub fn iter(&self) -> BitIterator<'_> {
self.into_iter()
}
/// Returns an iterator over the set bit positions in this [`BooleanBuffer`]
pub fn set_indices(&self) -> BitIndexIterator<'_> {
BitIndexIterator::new(self.values(), self.offset, self.len)
}
/// Returns a `u32` iterator over set bit positions without any usize->u32 conversion
pub fn set_indices_u32(&self) -> BitIndexU32Iterator<'_> {
BitIndexU32Iterator::new(self.values(), self.offset, self.len)
}
/// Returns a [`BitSliceIterator`] yielding contiguous ranges of set bits
pub fn set_slices(&self) -> BitSliceIterator<'_> {
BitSliceIterator::new(self.values(), self.offset, self.len)
}
}
impl Not for &BooleanBuffer {
type Output = BooleanBuffer;
fn not(self) -> Self::Output {
BooleanBuffer {
buffer: buffer_unary_not(&self.buffer, self.offset, self.len),
offset: 0,
len: self.len,
}
}
}
impl BitAnd<&BooleanBuffer> for &BooleanBuffer {
type Output = BooleanBuffer;
fn bitand(self, rhs: &BooleanBuffer) -> Self::Output {
assert_eq!(self.len, rhs.len);
BooleanBuffer {
buffer: buffer_bin_and(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len),
offset: 0,
len: self.len,
}
}
}
impl BitOr<&BooleanBuffer> for &BooleanBuffer {
type Output = BooleanBuffer;
fn bitor(self, rhs: &BooleanBuffer) -> Self::Output {
assert_eq!(self.len, rhs.len);
BooleanBuffer {
buffer: buffer_bin_or(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len),
offset: 0,
len: self.len,
}
}
}
impl BitXor<&BooleanBuffer> for &BooleanBuffer {
type Output = BooleanBuffer;
fn bitxor(self, rhs: &BooleanBuffer) -> Self::Output {
assert_eq!(self.len, rhs.len);
BooleanBuffer {
buffer: buffer_bin_xor(&self.buffer, self.offset, &rhs.buffer, rhs.offset, self.len),
offset: 0,
len: self.len,
}
}
}
impl<'a> IntoIterator for &'a BooleanBuffer {
type Item = bool;
type IntoIter = BitIterator<'a>;
fn into_iter(self) -> Self::IntoIter {
BitIterator::new(self.values(), self.offset, self.len)
}
}
impl From<&[bool]> for BooleanBuffer {
fn from(value: &[bool]) -> Self {
let mut builder = BooleanBufferBuilder::new(value.len());
builder.append_slice(value);
builder.finish()
}
}
impl From<Vec<bool>> for BooleanBuffer {
fn from(value: Vec<bool>) -> Self {
value.as_slice().into()
}
}
impl FromIterator<bool> for BooleanBuffer {
fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
let iter = iter.into_iter();
let (hint, _) = iter.size_hint();
let mut builder = BooleanBufferBuilder::new(hint);
iter.for_each(|b| builder.append(b));
builder.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_boolean_new() {
let bytes = &[0, 1, 2, 3, 4];
let buf = Buffer::from(bytes);
let offset = 0;
let len = 24;
let boolean_buf = BooleanBuffer::new(buf.clone(), offset, len);
assert_eq!(bytes, boolean_buf.values());
assert_eq!(offset, boolean_buf.offset());
assert_eq!(len, boolean_buf.len());
assert_eq!(2, boolean_buf.count_set_bits());
assert_eq!(&buf, boolean_buf.inner());
assert_eq!(buf, boolean_buf.clone().into_inner());
assert!(!boolean_buf.is_empty())
}
#[test]
fn test_boolean_data_equality() {
let boolean_buf1 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 32);
let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 32);
assert_eq!(boolean_buf1, boolean_buf2);
// slice with same offset and same length should still preserve equality
let boolean_buf3 = boolean_buf1.slice(8, 16);
assert_ne!(boolean_buf1, boolean_buf3);
let boolean_buf4 = boolean_buf1.slice(0, 32);
assert_eq!(boolean_buf1, boolean_buf4);
// unequal because of different elements
let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 0, 2, 3, 4]), 0, 32);
assert_ne!(boolean_buf1, boolean_buf2);
// unequal because of different length
let boolean_buf2 = BooleanBuffer::new(Buffer::from(&[0, 1, 4, 3, 5]), 0, 24);
assert_ne!(boolean_buf1, boolean_buf2);
// ptr_eq
assert!(boolean_buf1.ptr_eq(&boolean_buf1));
assert!(boolean_buf2.ptr_eq(&boolean_buf2));
assert!(!boolean_buf1.ptr_eq(&boolean_buf2));
}
#[test]
fn test_boolean_slice() {
let bytes = &[0, 3, 2, 6, 2];
let boolean_buf1 = BooleanBuffer::new(Buffer::from(bytes), 0, 32);
let boolean_buf2 = BooleanBuffer::new(Buffer::from(bytes), 0, 32);
let boolean_slice1 = boolean_buf1.slice(16, 16);
let boolean_slice2 = boolean_buf2.slice(0, 16);
assert_eq!(boolean_slice1.values(), boolean_slice2.values());
assert_eq!(bytes, boolean_slice1.values());
assert_eq!(16, boolean_slice1.offset);
assert_eq!(16, boolean_slice1.len);
assert_eq!(bytes, boolean_slice2.values());
assert_eq!(0, boolean_slice2.offset);
assert_eq!(16, boolean_slice2.len);
}
#[test]
fn test_boolean_bitand() {
let offset = 0;
let len = 40;
let buf1 = Buffer::from(&[0, 1, 1, 0, 0]);
let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len);
let buf2 = Buffer::from(&[0, 1, 1, 1, 0]);
let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len);
let expected = BooleanBuffer::new(Buffer::from(&[0, 1, 1, 0, 0]), offset, len);
assert_eq!(boolean_buf1 & boolean_buf2, expected);
}
#[test]
fn test_boolean_bitor() {
let offset = 0;
let len = 40;
let buf1 = Buffer::from(&[0, 1, 1, 0, 0]);
let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len);
let buf2 = Buffer::from(&[0, 1, 1, 1, 0]);
let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len);
let expected = BooleanBuffer::new(Buffer::from(&[0, 1, 1, 1, 0]), offset, len);
assert_eq!(boolean_buf1 | boolean_buf2, expected);
}
#[test]
fn test_boolean_bitxor() {
let offset = 0;
let len = 40;
let buf1 = Buffer::from(&[0, 1, 1, 0, 0]);
let boolean_buf1 = &BooleanBuffer::new(buf1, offset, len);
let buf2 = Buffer::from(&[0, 1, 1, 1, 0]);
let boolean_buf2 = &BooleanBuffer::new(buf2, offset, len);
let expected = BooleanBuffer::new(Buffer::from(&[0, 0, 0, 1, 0]), offset, len);
assert_eq!(boolean_buf1 ^ boolean_buf2, expected);
}
#[test]
fn test_boolean_not() {
let offset = 0;
let len = 40;
let buf = Buffer::from(&[0, 1, 1, 0, 0]);
let boolean_buf = &BooleanBuffer::new(buf, offset, len);
let expected = BooleanBuffer::new(Buffer::from(&[255, 254, 254, 255, 255]), offset, len);
assert_eq!(!boolean_buf, expected);
}
#[test]
fn test_boolean_from_slice_bool() {
let v = [true, false, false];
let buf = BooleanBuffer::from(&v[..]);
assert_eq!(buf.offset(), 0);
assert_eq!(buf.len(), 3);
assert_eq!(buf.values().len(), 1);
assert!(buf.value(0));
}
#[test]
fn test_from_bitwise_unary_op() {
// Use 1024 boolean values so that at least some of the tests cover multiple u64 chunks and
// perfect alignment
let input_bools = (0..1024)
.map(|_| rand::random::<bool>())
.collect::<Vec<bool>>();
let input_buffer = BooleanBuffer::from(&input_bools[..]);
// Note ensure we test offsets over 100 to cover multiple u64 chunks
for offset in 0..1024 {
let result = BooleanBuffer::from_bitwise_unary_op(
input_buffer.values(),
offset,
input_buffer.len() - offset,
|a| !a,
);
let expected = input_bools[offset..]
.iter()
.map(|b| !*b)
.collect::<BooleanBuffer>();
assert_eq!(result, expected);
}
// Also test when the input doesn't cover the entire buffer
for offset in 0..512 {
let len = 512 - offset; // fixed length less than total
let result =
BooleanBuffer::from_bitwise_unary_op(input_buffer.values(), offset, len, |a| !a);
let expected = input_bools[offset..]
.iter()
.take(len)
.map(|b| !*b)
.collect::<BooleanBuffer>();
assert_eq!(result, expected);
}
}
}