blob: 56de594830169a23c1cdaa8df2b118d43e310061 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Common utilities for computation kernels.
use crate::array::*;
use crate::buffer::{buffer_bin_and, buffer_bin_or, Buffer};
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use num::{One, ToPrimitive, Zero};
use std::ops::Add;
/// Combines the null bitmaps of two arrays using a bitwise `and` operation.
///
/// This function is useful when implementing operations on higher level arrays.
#[allow(clippy::unnecessary_wraps)]
pub(super) fn combine_option_bitmap(
left_data: &ArrayData,
right_data: &ArrayData,
len_in_bits: usize,
) -> Result<Option<Buffer>> {
let left_offset_in_bits = left_data.offset();
let right_offset_in_bits = right_data.offset();
let left = left_data.null_buffer();
let right = right_data.null_buffer();
match left {
None => match right {
None => Ok(None),
Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))),
},
Some(l) => match right {
None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))),
Some(r) => Ok(Some(buffer_bin_and(
&l,
left_offset_in_bits,
&r,
right_offset_in_bits,
len_in_bits,
))),
},
}
}
/// Compares the null bitmaps of two arrays using a bitwise `or` operation.
///
/// This function is useful when implementing operations on higher level arrays.
#[allow(clippy::unnecessary_wraps)]
pub(super) fn compare_option_bitmap(
left_data: &ArrayData,
right_data: &ArrayData,
len_in_bits: usize,
) -> Result<Option<Buffer>> {
let left_offset_in_bits = left_data.offset();
let right_offset_in_bits = right_data.offset();
let left = left_data.null_buffer();
let right = right_data.null_buffer();
match left {
None => match right {
None => Ok(None),
Some(r) => Ok(Some(r.bit_slice(right_offset_in_bits, len_in_bits))),
},
Some(l) => match right {
None => Ok(Some(l.bit_slice(left_offset_in_bits, len_in_bits))),
Some(r) => Ok(Some(buffer_bin_or(
&l,
left_offset_in_bits,
&r,
right_offset_in_bits,
len_in_bits,
))),
},
}
}
/// Takes/filters a list array's inner data using the offsets of the list array.
///
/// Where a list array has indices `[0,2,5,10]`, taking indices of `[2,0]` returns
/// an array of the indices `[5..10, 0..2]` and offsets `[0,5,7]` (5 elements and 2
/// elements)
pub(super) fn take_value_indices_from_list<IndexType, OffsetType>(
list: &GenericListArray<OffsetType::Native>,
indices: &PrimitiveArray<IndexType>,
) -> Result<(PrimitiveArray<OffsetType>, Vec<OffsetType::Native>)>
where
IndexType: ArrowNumericType,
IndexType::Native: ToPrimitive,
OffsetType: ArrowNumericType,
OffsetType::Native: OffsetSizeTrait + Add + Zero + One,
PrimitiveArray<OffsetType>: From<Vec<Option<OffsetType::Native>>>,
{
// TODO: benchmark this function, there might be a faster unsafe alternative
let offsets: &[OffsetType::Native] = list.value_offsets();
let mut new_offsets = Vec::with_capacity(indices.len());
let mut values = Vec::new();
let mut current_offset = OffsetType::Native::zero();
// add first offset
new_offsets.push(OffsetType::Native::zero());
// compute the value indices, and set offsets accordingly
for i in 0..indices.len() {
if indices.is_valid(i) {
let ix = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
ArrowError::ComputeError("Cast to usize failed".to_string())
})?;
let start = offsets[ix];
let end = offsets[ix + 1];
current_offset += end - start;
new_offsets.push(current_offset);
let mut curr = start;
// if start == end, this slot is empty
while curr < end {
values.push(Some(curr));
curr += OffsetType::Native::one();
}
} else {
new_offsets.push(current_offset);
}
}
Ok((PrimitiveArray::<OffsetType>::from(values), new_offsets))
}
/// Takes/filters a fixed size list array's inner data using the offsets of the list array.
pub(super) fn take_value_indices_from_fixed_size_list<IndexType>(
list: &FixedSizeListArray,
indices: &PrimitiveArray<IndexType>,
length: <UInt32Type as ArrowPrimitiveType>::Native,
) -> Result<PrimitiveArray<UInt32Type>>
where
IndexType: ArrowNumericType,
IndexType::Native: ToPrimitive,
{
let mut values = vec![];
for i in 0..indices.len() {
if indices.is_valid(i) {
let index = ToPrimitive::to_usize(&indices.value(i)).ok_or_else(|| {
ArrowError::ComputeError("Cast to usize failed".to_string())
})?;
let start =
list.value_offset(index) as <UInt32Type as ArrowPrimitiveType>::Native;
values.extend(start..start + length);
}
}
Ok(PrimitiveArray::<UInt32Type>::from(values))
}
#[cfg(test)]
pub(super) mod tests {
use super::*;
use std::sync::Arc;
use crate::datatypes::DataType;
use crate::util::bit_util;
use crate::{array::ArrayData, buffer::MutableBuffer};
fn make_data_with_null_bit_buffer(
len: usize,
offset: usize,
null_bit_buffer: Option<Buffer>,
) -> Arc<ArrayData> {
// empty vec for buffers and children is not really correct, but for these tests we only care about the null bitmap
Arc::new(ArrayData::new(
DataType::UInt8,
len,
None,
null_bit_buffer,
offset,
vec![],
vec![],
))
}
#[test]
fn test_combine_option_bitmap() {
let none_bitmap = make_data_with_null_bit_buffer(8, 0, None);
let some_bitmap =
make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010])));
let inverse_bitmap =
make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101])));
assert_eq!(
None,
combine_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
combine_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
combine_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
combine_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap()
);
assert_eq!(
Some(Buffer::from([0b0])),
combine_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap()
);
}
#[test]
fn test_compare_option_bitmap() {
let none_bitmap = make_data_with_null_bit_buffer(8, 0, None);
let some_bitmap =
make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b01001010])));
let inverse_bitmap =
make_data_with_null_bit_buffer(8, 0, Some(Buffer::from([0b10110101])));
assert_eq!(
None,
compare_option_bitmap(&none_bitmap, &none_bitmap, 8).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
compare_option_bitmap(&some_bitmap, &none_bitmap, 8).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
compare_option_bitmap(&none_bitmap, &some_bitmap, 8,).unwrap()
);
assert_eq!(
Some(Buffer::from([0b01001010])),
compare_option_bitmap(&some_bitmap, &some_bitmap, 8,).unwrap()
);
assert_eq!(
Some(Buffer::from([0b11111111])),
compare_option_bitmap(&some_bitmap, &inverse_bitmap, 8,).unwrap()
);
}
pub(crate) fn build_generic_list<S, T>(
data: Vec<Option<Vec<T::Native>>>,
) -> GenericListArray<S>
where
S: OffsetSizeTrait + 'static,
T: ArrowPrimitiveType,
PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
{
let data = data
.into_iter()
.map(|subarray| {
subarray.map(|item| {
item.into_iter()
.map(Some)
.collect::<Vec<Option<T::Native>>>()
})
})
.collect();
build_generic_list_nullable(data)
}
pub(crate) fn build_generic_list_nullable<S, T>(
data: Vec<Option<Vec<Option<T::Native>>>>,
) -> GenericListArray<S>
where
S: OffsetSizeTrait + 'static,
T: ArrowPrimitiveType,
PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
{
use std::any::TypeId;
let mut offset = vec![0];
let mut values = vec![];
let list_len = data.len();
let num_bytes = bit_util::ceil(list_len, 8);
let mut list_null_count = 0;
let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
for (idx, array) in data.into_iter().enumerate() {
if let Some(mut array) = array {
values.append(&mut array);
} else {
list_null_count += 1;
bit_util::unset_bit(&mut list_bitmap.as_slice_mut(), idx);
}
offset.push(values.len() as i64);
}
let value_data = PrimitiveArray::<T>::from(values).data().clone();
let (list_data_type, value_offsets) = if TypeId::of::<S>() == TypeId::of::<i32>()
{
(
DataType::List(Box::new(Field::new(
"item",
T::DATA_TYPE,
list_null_count == 0,
))),
Buffer::from_slice_ref(
&offset.into_iter().map(|x| x as i32).collect::<Vec<i32>>(),
),
)
} else if TypeId::of::<S>() == TypeId::of::<i64>() {
(
DataType::LargeList(Box::new(Field::new(
"item",
T::DATA_TYPE,
list_null_count == 0,
))),
Buffer::from_slice_ref(&offset),
)
} else {
unreachable!()
};
let list_data = ArrayData::builder(list_data_type)
.len(list_len)
.null_bit_buffer(list_bitmap.into())
.add_buffer(value_offsets)
.add_child_data(value_data)
.build();
GenericListArray::<S>::from(list_data)
}
pub(crate) fn build_fixed_size_list<T>(
data: Vec<Option<Vec<T::Native>>>,
length: <Int32Type as ArrowPrimitiveType>::Native,
) -> FixedSizeListArray
where
T: ArrowPrimitiveType,
PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
{
let data = data
.into_iter()
.map(|subarray| {
subarray.map(|item| {
item.into_iter()
.map(Some)
.collect::<Vec<Option<T::Native>>>()
})
})
.collect();
build_fixed_size_list_nullable(data, length)
}
pub(crate) fn build_fixed_size_list_nullable<T>(
list_values: Vec<Option<Vec<Option<T::Native>>>>,
length: <Int32Type as ArrowPrimitiveType>::Native,
) -> FixedSizeListArray
where
T: ArrowPrimitiveType,
PrimitiveArray<T>: From<Vec<Option<T::Native>>>,
{
let mut values = vec![];
let mut list_null_count = 0;
let list_len = list_values.len();
let num_bytes = bit_util::ceil(list_len, 8);
let mut list_bitmap = MutableBuffer::new(num_bytes).with_bitset(num_bytes, true);
for (idx, list_element) in list_values.into_iter().enumerate() {
if let Some(items) = list_element {
// every sub-array should have the same length
debug_assert_eq!(length as usize, items.len());
values.extend(items.into_iter());
} else {
list_null_count += 1;
bit_util::unset_bit(&mut list_bitmap.as_slice_mut(), idx);
values.extend(vec![None; length as usize].into_iter());
}
}
let list_data_type = DataType::FixedSizeList(
Box::new(Field::new("item", T::DATA_TYPE, list_null_count == 0)),
length,
);
let child_data = PrimitiveArray::<T>::from(values).data().clone();
let list_data = ArrayData::builder(list_data_type)
.len(list_len)
.null_bit_buffer(list_bitmap.into())
.add_child_data(child_data)
.build();
FixedSizeListArray::from(list_data)
}
#[test]
fn test_take_value_index_from_list() {
let list = build_generic_list::<i32, Int32Type>(vec![
Some(vec![0, 1]),
Some(vec![2, 3, 4]),
Some(vec![5, 6, 7, 8, 9]),
]);
let indices = UInt32Array::from(vec![2, 0]);
let (indexed, offsets) = take_value_indices_from_list(&list, &indices).unwrap();
assert_eq!(indexed, Int32Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
assert_eq!(offsets, vec![0, 5, 7]);
}
#[test]
fn test_take_value_index_from_large_list() {
let list = build_generic_list::<i64, Int32Type>(vec![
Some(vec![0, 1]),
Some(vec![2, 3, 4]),
Some(vec![5, 6, 7, 8, 9]),
]);
let indices = UInt32Array::from(vec![2, 0]);
let (indexed, offsets) =
take_value_indices_from_list::<_, Int64Type>(&list, &indices).unwrap();
assert_eq!(indexed, Int64Array::from(vec![5, 6, 7, 8, 9, 0, 1]));
assert_eq!(offsets, vec![0, 5, 7]);
}
#[test]
fn test_take_value_index_from_fixed_list() {
let list = build_fixed_size_list_nullable::<Int32Type>(
vec![
Some(vec![Some(1), Some(2), None]),
Some(vec![Some(4), None, Some(6)]),
None,
Some(vec![None, Some(8), Some(9)]),
],
3,
);
let indices = UInt32Array::from(vec![2, 1, 0]);
let indexed =
take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap();
assert_eq!(indexed, UInt32Array::from(vec![6, 7, 8, 3, 4, 5, 0, 1, 2]));
let indices = UInt32Array::from(vec![3, 2, 1, 2, 0]);
let indexed =
take_value_indices_from_fixed_size_list(&list, &indices, 3).unwrap();
assert_eq!(
indexed,
UInt32Array::from(vec![9, 10, 11, 6, 7, 8, 3, 4, 5, 6, 7, 8, 0, 1, 2])
);
}
}