// blob: fa875c20e3024256af1d08faee47bb93619a1ee3
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Implements the `nullif` function for Arrow arrays.
use arrow_array::{Array, ArrayRef, BooleanArray, make_array};
use arrow_buffer::buffer::bitwise_bin_op_helper;
use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper};
use arrow_schema::{ArrowError, DataType};
/// Returns a copy of `left` in which every slot whose matching slot in `right`
/// is `true` has been turned into a null.
///
/// The values of `left` are carried over untouched; only the validity bitmap
/// of the result is recomputed. A slot of `right` that is `false` or null
/// leaves the matching slot of `left` as it was, so a slot that was already
/// null stays null.
///
/// This can be used to implement SQL `NULLIF`.
///
/// # Errors
///
/// Returns [`ArrowError::ComputeError`] when `left` and `right` have different
/// lengths.
///
/// # Example
/// ```
/// # use arrow_array::{Int32Array, BooleanArray};
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::types::Int32Type;
/// # use arrow_select::nullif::nullif;
/// // input is [null, 8, 1, 9]
/// let a = Int32Array::from(vec![None, Some(8), Some(1), Some(9)]);
/// // use nullif to set index 1 to null
/// let bool_array = BooleanArray::from(vec![Some(false), Some(true), Some(false), None]);
/// let nulled = nullif(&a, &bool_array).unwrap();
/// // The resulting array is [null, null, 1, 9]
/// assert_eq!(nulled.as_primitive(), &Int32Array::from(vec![None, None, Some(1), Some(9)]));
/// ```
pub fn nullif(left: &dyn Array, right: &BooleanArray) -> Result<ArrayRef, ArrowError> {
    let left_data = left.to_data();
    let len = left_data.len();
    if len != right.len() {
        return Err(ArrowError::ComputeError(
            "Cannot perform comparison operation on arrays of different length".to_string(),
        ));
    }

    // Empty inputs and `Null`-typed inputs are handed back as they are.
    if len == 0 || left_data.data_type() == &DataType::Null {
        return Ok(make_array(left_data));
    }

    // Validity of each output slot:
    //
    //   left slot | right slot    | output slot
    //   ----------+---------------+---------------------
    //   null      | null          | null
    //   null      | valid         | null
    //   valid     | null          | valid (passthrough)
    //   valid     | valid, true   | null
    //   valid     | valid, false  | valid
    //
    // Hence: output = left_validity & (!right_values | !right_validity)
    //               = left_validity & !(right_values & right_validity)

    // mask = right_values & right_validity
    let mask = match right.nulls() {
        None => right.values().clone(),
        Some(right_validity) => right.values() & right_validity.inner(),
    };

    // bits = left_validity & !mask, tallying the null count in the same pass.
    let (bits, null_count) = if let Some(left_validity) = left_data.nulls() {
        let mut valid_count = 0usize;
        let bits = bitwise_bin_op_helper(
            left_validity.buffer(),
            left_validity.offset(),
            mask.inner(),
            mask.offset(),
            len,
            |l, m| {
                let kept = l & !m;
                valid_count += kept.count_ones() as usize;
                kept
            },
        );
        (bits, len - valid_count)
    } else {
        // `left` has no null buffer, so the output validity is simply `!mask`.
        let mut null_count = 0usize;
        let bits = bitwise_unary_op_helper(mask.inner(), mask.offset(), len, |m| {
            let kept = !m;
            null_count += kept.count_zeros() as usize;
            kept
        });
        (bits, null_count)
    };

    let validity = BooleanBuffer::new(bits, 0, len);
    // SAFETY:
    // `null_count` was counted while the bitmap was being computed.
    let nulls = unsafe { NullBuffer::new_unchecked(validity, null_count) };
    let builder = left_data.into_builder().nulls(Some(nulls));

    // SAFETY:
    // Only the null mask of the input data was altered.
    Ok(make_array(unsafe { builder.build_unchecked() }))
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_array::builder::{BooleanBuilder, Int32Builder, StructBuilder};
use arrow_array::cast::AsArray;
use arrow_array::types::Int32Type;
use arrow_array::{Int32Array, NullArray, StringArray, StructArray};
use arrow_data::ArrayData;
use arrow_schema::{Field, Fields};
use rand::prelude::StdRng;
use rand::{Rng, SeedableRng};
/// Unsliced Int32 input with nulls, checked against a mask containing
/// `true`, `false` and null entries.
#[test]
fn test_nullif_int_array() {
    let input = Int32Array::from(vec![Some(15), None, Some(8), Some(1), Some(9)]);
    let mask = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]);

    let want = Int32Array::from(vec![
        Some(15), // mask is false: kept
        None,     // was already null and the mask is null: stays null
        None,     // mask is true: slot 2 is turned into null
        Some(1),  // mask is false: kept
        Some(9),  // mask is null: the original value still passes through
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_primitive::<Int32Type>());
}
/// `Null`-typed inputs come back unchanged, whatever the mask holds.
#[test]
fn test_nullif_null_array() {
    // Zero-length `Null` array with a zero-length mask.
    let empty = nullif(&NullArray::new(0), &BooleanArray::new_null(0)).unwrap();
    assert_eq!(empty.as_ref(), &NullArray::new(0));

    // Non-empty `Null` array with a mask of false / true / null.
    let mask = BooleanArray::from(vec![Some(false), Some(true), None]);
    let out = nullif(&NullArray::new(3), &mask).unwrap();
    assert_eq!(out.as_ref(), &NullArray::new(3));
}
/// Both the values and the mask are slices with small, different offsets.
#[test]
fn test_nullif_int_array_offset() {
    // Window over the values: Some(15), Some(8), Some(1)
    let input = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]).slice(1, 3);

    // Window over the mask: Some(false), None, Some(true)
    let mask = BooleanArray::from(vec![
        Some(false),
        Some(false),
        Some(false),
        None,
        Some(true),
        Some(false),
        None,
    ])
    .slice(2, 3);

    let want = Int32Array::from(vec![
        Some(15), // false => kept
        Some(8),  // null => kept
        None,     // true => nulled
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_primitive::<Int32Type>());
}
/// String input, first unsliced and then with both sides sliced.
#[test]
fn test_nullif_string() {
    let strings = StringArray::from_iter([
        Some("hello"),
        None,
        Some("world"),
        Some("a"),
        Some("b"),
        None,
        None,
    ]);
    let mask = BooleanArray::from_iter([
        Some(true),
        Some(true),
        Some(false),
        Some(true),
        Some(false),
        Some(false),
        None,
    ]);

    let out = nullif(&strings, &mask).unwrap();
    let got: Vec<_> = out.as_string::<i32>().iter().collect();
    assert_eq!(
        got,
        vec![None, None, Some("world"), None, Some("b"), None, None]
    );

    // Same check on windows with different offsets:
    // values "world", "a", "b" against mask true, false, true.
    let strings = strings.slice(2, 3);
    let mask = mask.slice(1, 3);
    let out = nullif(&strings, &mask).unwrap();
    let got: Vec<_> = out.as_string::<i32>().iter().collect();
    assert_eq!(got, vec![None, Some("a"), None]);
}
/// The values are sliced at an offset of 17, past the second byte of the
/// validity bitmap, while the mask uses a small offset.
#[test]
fn test_nullif_int_large_left_offset() {
    // Indices 0..=15 hold filler values, index 16 is null, and the window
    // under test starts at index 17.
    let mut values = vec![Some(-1_i32); 16];
    values.extend([None, Some(15), Some(8), Some(1), Some(9)]);
    let input = Int32Array::from(values).slice(17, 3); // Some(15), Some(8), Some(1)

    // Window over the mask: Some(false), None, Some(true)
    let mask = BooleanArray::from(vec![
        Some(false),
        Some(false),
        Some(false),
        None,
        Some(true),
        Some(false),
        None,
    ])
    .slice(2, 3);

    let want = Int32Array::from(vec![
        Some(15), // false => kept
        Some(8),  // null => kept
        None,     // true => nulled
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_primitive::<Int32Type>());
}
/// The mask is sliced at an offset of 18, past the second byte of its
/// bitmaps, while the values use a small offset.
#[test]
fn test_nullif_int_large_right_offset() {
    // Window over the values: Some(15), Some(8), Some(1)
    let input = Int32Array::from(vec![None, Some(15), Some(8), Some(1), Some(9)]).slice(1, 3);

    // Indices 0..=18 are `Some(false)`; the window under test starts at index 18.
    let mut flags = vec![Some(false); 19];
    flags.extend([None, Some(true), Some(false), None]);
    let mask = BooleanArray::from(flags).slice(18, 3); // Some(false), None, Some(true)

    let want = Int32Array::from(vec![
        Some(15), // false => kept
        Some(8),  // null => kept
        None,     // true => nulled
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_primitive::<Int32Type>());
}
/// Boolean values (rather than Int32) on the left, with both sides sliced.
#[test]
fn test_nullif_boolean_offset() {
    // Window over the values: Some(true), Some(false), Some(true)
    let input =
        BooleanArray::from(vec![None, Some(true), Some(false), Some(true), Some(true)]).slice(1, 3);

    // Window over the mask: Some(false), None, Some(true)
    let mask = BooleanArray::from(vec![
        Some(false),
        Some(false),
        Some(false),
        None,
        Some(true),
        Some(false),
        None,
    ])
    .slice(2, 3);

    let want = BooleanArray::from(vec![
        Some(true),  // false => kept
        Some(false), // null => kept
        None,        // true => nulled
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_boolean());
}
/// One row of the struct array used by the struct tests: the two child values
/// plus the validity of the row itself.
struct Foo {
    /// Child value `a`; `None` means the child slot is null.
    a: Option<i32>,
    /// Child value `b`; `None` means the child slot is null.
    b: Option<bool>,
    /// Whether the entry should be valid.
    is_valid: bool,
}

impl Foo {
    /// A valid row whose children hold `a` and `b`.
    fn new_valid(a: i32, b: bool) -> Self {
        Foo {
            a: Some(a),
            b: Some(b),
            is_valid: true,
        }
    }

    /// A null row whose children are null as well.
    fn new_null() -> Self {
        Foo {
            a: None,
            b: None,
            is_valid: false,
        }
    }
}
/// Struct Array equality is a bit weird -- we need to have the *child values*
/// correct even if the enclosing struct indicates it is null. But we
/// also need the top level is_valid bits to be correct.
fn create_foo_struct(values: Vec<Foo>) -> StructArray {
let mut struct_array = StructBuilder::new(
Fields::from(vec![
Field::new("a", DataType::Int32, true),
Field::new("b", DataType::Boolean, true),
]),
vec![
Box::new(Int32Builder::with_capacity(values.len())),
Box::new(BooleanBuilder::with_capacity(values.len())),
],
);
for value in values {
struct_array
.field_builder::<Int32Builder>(0)
.unwrap()
.append_option(value.a);
struct_array
.field_builder::<BooleanBuilder>(1)
.unwrap()
.append_option(value.b);
struct_array.append(value.is_valid);
}
struct_array.finish()
}
/// Struct input: only the top-level validity changes, child values are kept.
#[test]
fn test_nullif_struct_slices() {
    // Window over the rows:
    // Some({a: 15, b: false}), Some({a: 8, b: true}), Some({a: 12, b: false}),
    // None, None
    let input = create_foo_struct(vec![
        Foo::new_valid(7, true),
        Foo::new_valid(15, false),
        Foo::new_valid(8, true),
        Foo::new_valid(12, false),
        Foo::new_null(),
        Foo::new_null(),
        Foo::new_valid(42, true),
    ])
    .slice(1, 5);

    // Window over the mask: Some(false), None, Some(true), Some(false), None
    let mask = BooleanArray::from(vec![
        Some(false),
        Some(false),
        Some(false),
        None,
        Some(true),
        Some(false),
        None,
    ])
    .slice(2, 5);

    let want = create_foo_struct(vec![
        // Some(false) -> kept
        Foo::new_valid(15, false),
        // None -> kept
        Foo::new_valid(8, true),
        // Some(true) -> nulled out, but the child values are still there
        Foo {
            a: Some(12),
            b: Some(false),
            is_valid: false,
        },
        // Some(false) -> kept, but the row was already null
        Foo::new_null(),
        // None -> kept, but the row was already null
        Foo::new_null(),
    ]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(&want, got.as_struct());
}
/// Input without any null value: only the slot whose mask entry is `true`
/// becomes null.
#[test]
fn test_nullif_no_nulls() {
    let input = Int32Array::from(vec![Some(15), Some(7), Some(8), Some(1), Some(9)]);
    let mask = BooleanArray::from(vec![Some(false), None, Some(true), Some(false), None]);
    let want = Int32Array::from(vec![Some(15), Some(7), None, Some(1), Some(9)]);

    let got = nullif(&input, &mask).unwrap();
    assert_eq!(got.as_primitive::<Int32Type>(), &want);
}
/// Zero-length inputs produce an array equal to the input.
#[test]
fn nullif_empty() {
    let empty_values = Int32Array::from(ArrayData::new_empty(&DataType::Int32));
    let empty_mask = BooleanArray::from(ArrayData::new_empty(&DataType::Boolean));

    let out = nullif(&empty_values, &empty_mask).unwrap();
    assert_eq!(out.as_ref(), &empty_values);
}
/// Checks `nullif(values, filter)` against a slot-by-slot reference result.
///
/// The reference keeps a slot of `values` unless the matching slot of `filter`
/// is `Some(true)`, in which case the slot becomes null. Besides comparing the
/// arrays, this validates the resulting `ArrayData` and checks that the
/// recorded null counts of both the expected and the actual null buffers match
/// their bitmaps.
///
/// Panics (failing the calling test) on any mismatch.
fn test_nullif(values: &Int32Array, filter: &BooleanArray) {
    let expected: Int32Array = values
        .iter()
        .zip(filter.iter())
        .map(|(a, b)| match b {
            Some(true) => None,
            Some(false) | None => a,
        })
        .collect();

    let r = nullif(values, filter).unwrap();
    let r_data = r.to_data();
    r_data.validate().unwrap();

    // Every line of the message ends with a single `\` so the literal
    // continues on the next source line without picking up the line break or
    // the indentation.
    assert_eq!(
        r.as_ref(),
        &expected,
        "expected nulls: {:#?}\n\n\
         result nulls: {:#?}\n\n\
         expected values: {:#?}\n\n\
         result values: {:#?}",
        expected.nulls(),
        r.nulls(),
        expected.values(),
        r.as_primitive::<Int32Type>().values()
    );

    validate_nulls(expected.nulls());
    validate_nulls(r.nulls());
}
/// Asserts that the null count recorded in `nulls` equals the number of null
/// slots found by inspecting every slot. Does nothing when there is no null
/// buffer.
fn validate_nulls(nulls: Option<&NullBuffer>) {
    if let Some(buffer) = nulls {
        let counted = (0..buffer.len())
            .filter(|&slot| buffer.is_null(slot))
            .count();
        assert_eq!(counted, buffer.null_count());
    }
}
/// Randomised check of `nullif` across many slice offsets and lengths.
///
/// Two 1024-element inputs (one without nulls, one with random nulls) are
/// sliced in several ways, and each slice is paired with random masks that
/// are themselves slices at varying offsets. Every pairing goes through
/// `test_nullif`.
#[test]
fn nullif_fuzz() {
// Seeded with a constant rather than from entropy.
let mut rng = StdRng::seed_from_u64(7337);
let arrays = [
Int32Array::from(vec![0; 1024]), // no nulls
(0..1024) // 50% nulls
.map(|_| rng.random_bool(0.5).then_some(1))
.collect(),
];
for a in arrays {
// (offset, length) pairs used to slice `a`; includes zero-length slices
// and lengths on either side of 64 and 128.
let a_slices = [
(0, 128),
(0, 129),
(64, 64),
(0, 64),
(32, 32),
(0, 0),
(32, 0),
(5, 800),
(33, 53),
(77, 101),
];
for (a_offset, a_length) in a_slices {
let a = a.slice(a_offset, a_length);
for i in 1..65 {
// Random padding (each below `i`, so at most 63) before and after the
// part of `b` that lines up with `a`.
let b_start_offset = rng.random_range(0..i);
let b_end_offset = rng.random_range(0..i);
// b with 50% nulls
let b: BooleanArray = (0..a_length + b_start_offset + b_end_offset)
.map(|_| rng.random_bool(0.5).then(|| rng.random_bool(0.5)))
.collect();
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);
// b with no nulls (and no null buffer)
let b = remove_null_buffer(&b);
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);
// b with no nulls (but with a null buffer)
let b = remove_null_values(&b);
let b_sliced = b.slice(b_start_offset, a_length);
test_nullif(&a, &b_sliced);
}
}
}
}
/// Rebuilds `array` from its data with the null buffer removed, so every slot
/// of the returned `BooleanArray` is valid.
fn remove_null_buffer(array: &BooleanArray) -> BooleanArray {
    let data = array
        .to_data()
        .into_builder()
        .nulls(None)
        .build()
        .unwrap();
    make_array(data).as_boolean().clone()
}
/// Rebuilds `array` from its data with its null buffer replaced by one of the
/// same length in which every slot is marked valid.
fn remove_null_values(array: &BooleanArray) -> BooleanArray {
    let all_valid = NullBuffer::from_iter(std::iter::repeat_n(true, array.len()));
    let data = array
        .to_data()
        .into_builder()
        .nulls(Some(all_valid))
        .build()
        .unwrap();
    make_array(data).as_boolean().clone()
}
}