blob: e5e3a610ead2fc89f64c3ae8829ff64c6bf26a2d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
use crate::buffer::BooleanBuffer;
use crate::{Buffer, MutableBuffer};
/// A [`BooleanBuffer`] used to encode validity for Arrow arrays
///
/// In the [Arrow specification], array validity is encoded in a packed bitmask with a
/// `true` value indicating the corresponding slot is not null, and `false` indicating
/// that it is null.
///
/// `NullBuffer`s can be creating using [`NullBufferBuilder`]
///
/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
/// [`NullBufferBuilder`]: crate::NullBufferBuilder
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct NullBuffer {
buffer: BooleanBuffer,
null_count: usize,
}
impl NullBuffer {
/// Create a new [`NullBuffer`] computing the null count
pub fn new(buffer: BooleanBuffer) -> Self {
let null_count = buffer.len() - buffer.count_set_bits();
Self { buffer, null_count }
}
/// Create a new [`NullBuffer`] of length `len` where all values are null
pub fn new_null(len: usize) -> Self {
Self {
buffer: BooleanBuffer::new_unset(len),
null_count: len,
}
}
/// Create a new [`NullBuffer`] of length `len` where all values are valid
///
/// Note: it is more efficient to not set the null buffer if it is known to
/// be all valid (aka all values are not null)
pub fn new_valid(len: usize) -> Self {
Self {
buffer: BooleanBuffer::new_set(len),
null_count: 0,
}
}
/// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
///
/// # Safety
///
/// `buffer` must contain `null_count` `0` bits
pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
Self { buffer, null_count }
}
/// Computes the union of the nulls in two optional [`NullBuffer`]
///
/// This is commonly used by binary operations where the result is NULL if either
/// of the input values is NULL. Handling the null mask separately in this way
/// can yield significant performance improvements over an iterator approach
pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
match (lhs, rhs) {
(Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
(Some(n), None) | (None, Some(n)) => Some(n.clone()),
(None, None) => None,
}
}
/// Returns true if all nulls in `other` also exist in self
pub fn contains(&self, other: &NullBuffer) -> bool {
if other.null_count == 0 {
return true;
}
let lhs = self.inner().bit_chunks().iter_padded();
let rhs = other.inner().bit_chunks().iter_padded();
lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
}
/// Returns a new [`NullBuffer`] where each bit in the current null buffer
/// is repeated `count` times. This is useful for masking the nulls of
/// the child of a FixedSizeListArray based on its parent
pub fn expand(&self, count: usize) -> Self {
let capacity = self.buffer.len().checked_mul(count).unwrap();
let mut buffer = MutableBuffer::new_null(capacity);
// Expand each bit within `null_mask` into `element_len`
// bits, constructing the implicit mask of the child elements
for i in 0..self.buffer.len() {
if self.is_null(i) {
continue;
}
for j in 0..count {
crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
}
}
Self {
buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
null_count: self.null_count * count,
}
}
/// Returns the length of this [`NullBuffer`] in bits
#[inline]
pub fn len(&self) -> usize {
self.buffer.len()
}
/// Returns the offset of this [`NullBuffer`] in bits
#[inline]
pub fn offset(&self) -> usize {
self.buffer.offset()
}
/// Returns true if this [`NullBuffer`] is empty
#[inline]
pub fn is_empty(&self) -> bool {
self.buffer.is_empty()
}
/// Free up unused memory.
pub fn shrink_to_fit(&mut self) {
self.buffer.shrink_to_fit();
}
/// Returns the null count for this [`NullBuffer`]
#[inline]
pub fn null_count(&self) -> usize {
self.null_count
}
/// Returns `true` if the value at `idx` is not null
#[inline]
pub fn is_valid(&self, idx: usize) -> bool {
self.buffer.value(idx)
}
/// Returns `true` if the value at `idx` is null
#[inline]
pub fn is_null(&self, idx: usize) -> bool {
!self.is_valid(idx)
}
/// Returns the packed validity of this [`NullBuffer`] not including any offset
#[inline]
pub fn validity(&self) -> &[u8] {
self.buffer.values()
}
/// Slices this [`NullBuffer`] by the provided `offset` and `length`
pub fn slice(&self, offset: usize, len: usize) -> Self {
Self::new(self.buffer.slice(offset, len))
}
/// Returns an iterator over the bits in this [`NullBuffer`]
///
/// * `true` indicates that the corresponding value is not NULL
/// * `false` indicates that the corresponding value is NULL
///
/// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
pub fn iter(&self) -> BitIterator<'_> {
self.buffer.iter()
}
/// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
///
/// Valid indices indicate the corresponding value is not NULL
pub fn valid_indices(&self) -> BitIndexIterator<'_> {
self.buffer.set_indices()
}
/// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
///
/// Valid indices indicate the corresponding value is not NULL
pub fn valid_slices(&self) -> BitSliceIterator<'_> {
self.buffer.set_slices()
}
/// Calls the provided closure for each index in this null mask that is set
#[inline]
pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
&self,
f: F,
) -> Result<(), E> {
if self.null_count == self.len() {
return Ok(());
}
self.valid_indices().try_for_each(f)
}
/// Returns the inner [`BooleanBuffer`]
#[inline]
pub fn inner(&self) -> &BooleanBuffer {
&self.buffer
}
/// Returns the inner [`BooleanBuffer`]
#[inline]
pub fn into_inner(self) -> BooleanBuffer {
self.buffer
}
/// Returns the underlying [`Buffer`]
#[inline]
pub fn buffer(&self) -> &Buffer {
self.buffer.inner()
}
}
impl<'a> IntoIterator for &'a NullBuffer {
type Item = bool;
type IntoIter = BitIterator<'a>;
fn into_iter(self) -> Self::IntoIter {
self.buffer.iter()
}
}
impl From<BooleanBuffer> for NullBuffer {
fn from(value: BooleanBuffer) -> Self {
Self::new(value)
}
}
impl From<&[bool]> for NullBuffer {
fn from(value: &[bool]) -> Self {
BooleanBuffer::from(value).into()
}
}
impl<const N: usize> From<&[bool; N]> for NullBuffer {
fn from(value: &[bool; N]) -> Self {
value[..].into()
}
}
impl From<Vec<bool>> for NullBuffer {
fn from(value: Vec<bool>) -> Self {
BooleanBuffer::from(value).into()
}
}
impl FromIterator<bool> for NullBuffer {
fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
BooleanBuffer::from_iter(iter).into()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_size() {
// This tests that the niche optimisation eliminates the overhead of an option
assert_eq!(
std::mem::size_of::<NullBuffer>(),
std::mem::size_of::<Option<NullBuffer>>()
);
}
}