blob: 64767e31cf230114953cfec617ee583a29a2823b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates
//! common attributes and operations for Arrow array.
use std::mem;
use std::sync::Arc;
use crate::datatypes::{DataType, IntervalUnit};
use crate::{bitmap::Bitmap, datatypes::ArrowNativeType};
use crate::{
buffer::{Buffer, MutableBuffer},
util::bit_util,
};
use super::equal::equal;
#[inline]
pub(crate) fn count_nulls(
null_bit_buffer: Option<&Buffer>,
offset: usize,
len: usize,
) -> usize {
if let Some(buf) = null_bit_buffer {
len.checked_sub(buf.count_set_bits_offset(offset, len))
.unwrap()
} else {
0
}
}
/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
#[inline]
pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
let empty_buffer = MutableBuffer::new(0);
match data_type {
DataType::Null => [empty_buffer, MutableBuffer::new(0)],
DataType::Boolean => {
let bytes = bit_util::ceil(capacity, 8);
let buffer = MutableBuffer::new(bytes);
[buffer, empty_buffer]
}
DataType::UInt8 => [
MutableBuffer::new(capacity * mem::size_of::<u8>()),
empty_buffer,
],
DataType::UInt16 => [
MutableBuffer::new(capacity * mem::size_of::<u16>()),
empty_buffer,
],
DataType::UInt32 => [
MutableBuffer::new(capacity * mem::size_of::<u32>()),
empty_buffer,
],
DataType::UInt64 => [
MutableBuffer::new(capacity * mem::size_of::<u64>()),
empty_buffer,
],
DataType::Int8 => [
MutableBuffer::new(capacity * mem::size_of::<i8>()),
empty_buffer,
],
DataType::Int16 => [
MutableBuffer::new(capacity * mem::size_of::<i16>()),
empty_buffer,
],
DataType::Int32 => [
MutableBuffer::new(capacity * mem::size_of::<i32>()),
empty_buffer,
],
DataType::Int64 => [
MutableBuffer::new(capacity * mem::size_of::<i64>()),
empty_buffer,
],
DataType::Float32 => [
MutableBuffer::new(capacity * mem::size_of::<f32>()),
empty_buffer,
],
DataType::Float64 => [
MutableBuffer::new(capacity * mem::size_of::<f64>()),
empty_buffer,
],
DataType::Date32 | DataType::Time32(_) => [
MutableBuffer::new(capacity * mem::size_of::<i32>()),
empty_buffer,
],
DataType::Date64
| DataType::Time64(_)
| DataType::Duration(_)
| DataType::Timestamp(_, _) => [
MutableBuffer::new(capacity * mem::size_of::<i64>()),
empty_buffer,
],
DataType::Interval(IntervalUnit::YearMonth) => [
MutableBuffer::new(capacity * mem::size_of::<i32>()),
empty_buffer,
],
DataType::Interval(IntervalUnit::DayTime) => [
MutableBuffer::new(capacity * mem::size_of::<i64>()),
empty_buffer,
],
DataType::Utf8 | DataType::Binary => {
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
buffer.push(0i32);
[buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
}
DataType::LargeUtf8 | DataType::LargeBinary => {
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
// safety: `unsafe` code assumes that this buffer is initialized with one element
buffer.push(0i64);
[buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
}
DataType::List(_) => {
// offset buffer always starts with a zero
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
buffer.push(0i32);
[buffer, empty_buffer]
}
DataType::LargeList(_) => {
// offset buffer always starts with a zero
let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
buffer.push(0i64);
[buffer, empty_buffer]
}
DataType::FixedSizeBinary(size) => {
[MutableBuffer::new(capacity * *size as usize), empty_buffer]
}
DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
DataType::UInt8 => [
MutableBuffer::new(capacity * mem::size_of::<u8>()),
empty_buffer,
],
DataType::UInt16 => [
MutableBuffer::new(capacity * mem::size_of::<u16>()),
empty_buffer,
],
DataType::UInt32 => [
MutableBuffer::new(capacity * mem::size_of::<u32>()),
empty_buffer,
],
DataType::UInt64 => [
MutableBuffer::new(capacity * mem::size_of::<u64>()),
empty_buffer,
],
DataType::Int8 => [
MutableBuffer::new(capacity * mem::size_of::<i8>()),
empty_buffer,
],
DataType::Int16 => [
MutableBuffer::new(capacity * mem::size_of::<i16>()),
empty_buffer,
],
DataType::Int32 => [
MutableBuffer::new(capacity * mem::size_of::<i32>()),
empty_buffer,
],
DataType::Int64 => [
MutableBuffer::new(capacity * mem::size_of::<i64>()),
empty_buffer,
],
_ => unreachable!(),
},
DataType::Float16 => unreachable!(),
DataType::FixedSizeList(_, _) | DataType::Struct(_) => {
[empty_buffer, MutableBuffer::new(0)]
}
DataType::Decimal(_, _) => [
MutableBuffer::new(capacity * mem::size_of::<u8>()),
empty_buffer,
],
DataType::Union(_) => unimplemented!(),
}
}
/// Maps 2 [`MutableBuffer`]s into a vector of [Buffer]s whose size depends on `data_type`.
#[inline]
pub(crate) fn into_buffers(
data_type: &DataType,
buffer1: MutableBuffer,
buffer2: MutableBuffer,
) -> Vec<Buffer> {
match data_type {
DataType::Null | DataType::Struct(_) => vec![],
DataType::Utf8
| DataType::Binary
| DataType::LargeUtf8
| DataType::LargeBinary => vec![buffer1.into(), buffer2.into()],
_ => vec![buffer1.into()],
}
}
/// An generic representation of Arrow array data which encapsulates common attributes and
/// operations for Arrow array. Specific operations for different arrays types (e.g.,
/// primitive, list, struct) are implemented in `Array`.
#[derive(Debug, Clone)]
pub struct ArrayData {
/// The data type for this array data
data_type: DataType,
/// The number of elements in this array data
len: usize,
/// The number of null elements in this array data
null_count: usize,
/// The offset into this array data, in number of items
offset: usize,
/// The buffers for this array data. Note that depending on the array types, this
/// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
/// at different positions.
buffers: Vec<Buffer>,
/// The child(ren) of this array. Only non-empty for nested types, currently
/// `ListArray` and `StructArray`.
child_data: Vec<ArrayData>,
/// The null bitmap. A `None` value for this indicates all values are non-null in
/// this array.
null_bitmap: Option<Bitmap>,
}
pub type ArrayDataRef = Arc<ArrayData>;
impl ArrayData {
pub fn new(
data_type: DataType,
len: usize,
null_count: Option<usize>,
null_bit_buffer: Option<Buffer>,
offset: usize,
buffers: Vec<Buffer>,
child_data: Vec<ArrayData>,
) -> Self {
let null_count = match null_count {
None => count_nulls(null_bit_buffer.as_ref(), offset, len),
Some(null_count) => null_count,
};
let null_bitmap = null_bit_buffer.map(Bitmap::from);
Self {
data_type,
len,
null_count,
offset,
buffers,
child_data,
null_bitmap,
}
}
/// Returns a builder to construct a `ArrayData` instance.
#[inline]
pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
ArrayDataBuilder::new(data_type)
}
/// Returns a reference to the data type of this array data
#[inline]
pub const fn data_type(&self) -> &DataType {
&self.data_type
}
/// Returns a slice of buffers for this array data
pub fn buffers(&self) -> &[Buffer] {
&self.buffers[..]
}
/// Returns a slice of children data arrays
pub fn child_data(&self) -> &[ArrayData] {
&self.child_data[..]
}
/// Returns whether the element at index `i` is null
pub fn is_null(&self, i: usize) -> bool {
if let Some(ref b) = self.null_bitmap {
return !b.is_set(self.offset + i);
}
false
}
/// Returns a reference to the null bitmap of this array data
#[inline]
pub const fn null_bitmap(&self) -> &Option<Bitmap> {
&self.null_bitmap
}
/// Returns a reference to the null buffer of this array data.
pub fn null_buffer(&self) -> Option<&Buffer> {
self.null_bitmap().as_ref().map(|b| b.buffer_ref())
}
/// Returns whether the element at index `i` is not null
pub fn is_valid(&self, i: usize) -> bool {
if let Some(ref b) = self.null_bitmap {
return b.is_set(self.offset + i);
}
true
}
/// Returns the length (i.e., number of elements) of this array
#[inline]
pub const fn len(&self) -> usize {
self.len
}
// Returns whether array data is empty
#[inline]
pub const fn is_empty(&self) -> bool {
self.len == 0
}
/// Returns the offset of this array
#[inline]
pub const fn offset(&self) -> usize {
self.offset
}
/// Returns the total number of nulls in this array
#[inline]
pub const fn null_count(&self) -> usize {
self.null_count
}
/// Returns the total number of bytes of memory occupied by the buffers owned by this [ArrayData].
pub fn get_buffer_memory_size(&self) -> usize {
let mut size = 0;
for buffer in &self.buffers {
size += buffer.capacity();
}
if let Some(bitmap) = &self.null_bitmap {
size += bitmap.get_buffer_memory_size()
}
for child in &self.child_data {
size += child.get_buffer_memory_size();
}
size
}
/// Returns the total number of bytes of memory occupied physically by this [ArrayData].
pub fn get_array_memory_size(&self) -> usize {
let mut size = 0;
// Calculate size of the fields that don't have [get_array_memory_size] method internally.
size += mem::size_of_val(self)
- mem::size_of_val(&self.buffers)
- mem::size_of_val(&self.null_bitmap)
- mem::size_of_val(&self.child_data);
// Calculate rest of the fields top down which contain actual data
for buffer in &self.buffers {
size += mem::size_of_val(&buffer);
size += buffer.capacity();
}
if let Some(bitmap) = &self.null_bitmap {
size += bitmap.get_array_memory_size()
}
for child in &self.child_data {
size += child.get_array_memory_size();
}
size
}
/// Creates a zero-copy slice of itself. This creates a new [ArrayData]
/// with a different offset, len and a shifted null bitmap.
///
/// # Panics
///
/// Panics if `offset + length > self.len()`.
pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
assert!((offset + length) <= self.len());
let mut new_data = self.clone();
new_data.len = length;
new_data.offset = offset + self.offset;
new_data.null_count =
count_nulls(new_data.null_buffer(), new_data.offset, new_data.len);
new_data
}
/// Returns the `buffer` as a slice of type `T` starting at self.offset
/// # Panics
/// This function panics if:
/// * the buffer is not byte-aligned with type T, or
/// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
#[inline]
pub(crate) fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
let values = unsafe { self.buffers[buffer].as_slice().align_to::<T>() };
if !values.0.is_empty() || !values.2.is_empty() {
panic!("The buffer is not byte-aligned with its interpretation")
};
assert_ne!(self.data_type, DataType::Boolean);
&values.1[self.offset..]
}
/// Returns a new empty [ArrayData] valid for `data_type`.
pub fn new_empty(data_type: &DataType) -> Self {
let buffers = new_buffers(data_type, 0);
let [buffer1, buffer2] = buffers;
let buffers = into_buffers(data_type, buffer1, buffer2);
let child_data = match data_type {
DataType::Null
| DataType::Boolean
| DataType::UInt8
| DataType::UInt16
| DataType::UInt32
| DataType::UInt64
| DataType::Int8
| DataType::Int16
| DataType::Int32
| DataType::Int64
| DataType::Float32
| DataType::Float64
| DataType::Date32
| DataType::Date64
| DataType::Time32(_)
| DataType::Time64(_)
| DataType::Duration(_)
| DataType::Timestamp(_, _)
| DataType::Utf8
| DataType::Binary
| DataType::LargeUtf8
| DataType::LargeBinary
| DataType::Interval(_)
| DataType::FixedSizeBinary(_)
| DataType::Decimal(_, _) => vec![],
DataType::List(field) => {
vec![Self::new_empty(field.data_type())]
}
DataType::FixedSizeList(field, _) => {
vec![Self::new_empty(field.data_type())]
}
DataType::LargeList(field) => {
vec![Self::new_empty(field.data_type())]
}
DataType::Struct(fields) => fields
.iter()
.map(|field| Self::new_empty(field.data_type()))
.collect(),
DataType::Union(_) => unimplemented!(),
DataType::Dictionary(_, data_type) => {
vec![Self::new_empty(data_type)]
}
DataType::Float16 => unreachable!(),
};
Self::new(data_type.clone(), 0, Some(0), None, 0, buffers, child_data)
}
}
impl PartialEq for ArrayData {
fn eq(&self, other: &Self) -> bool {
equal(self, other)
}
}
/// Builder for `ArrayData` type
#[derive(Debug)]
pub struct ArrayDataBuilder {
data_type: DataType,
len: usize,
null_count: Option<usize>,
null_bit_buffer: Option<Buffer>,
offset: usize,
buffers: Vec<Buffer>,
child_data: Vec<ArrayData>,
}
impl ArrayDataBuilder {
#[inline]
pub const fn new(data_type: DataType) -> Self {
Self {
data_type,
len: 0,
null_count: None,
null_bit_buffer: None,
offset: 0,
buffers: vec![],
child_data: vec![],
}
}
#[inline]
#[allow(clippy::len_without_is_empty)]
pub const fn len(mut self, n: usize) -> Self {
self.len = n;
self
}
pub fn null_count(mut self, null_count: usize) -> Self {
self.null_count = Some(null_count);
self
}
pub fn null_bit_buffer(mut self, buf: Buffer) -> Self {
self.null_bit_buffer = Some(buf);
self
}
#[inline]
pub const fn offset(mut self, n: usize) -> Self {
self.offset = n;
self
}
pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
self.buffers = v;
self
}
pub fn add_buffer(mut self, b: Buffer) -> Self {
self.buffers.push(b);
self
}
pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
self.child_data = v;
self
}
pub fn add_child_data(mut self, r: ArrayData) -> Self {
self.child_data.push(r);
self
}
pub fn build(self) -> ArrayData {
ArrayData::new(
self.data_type,
self.len,
self.null_count,
self.null_bit_buffer,
self.offset,
self.buffers,
self.child_data,
)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::buffer::Buffer;
use crate::util::bit_util;
#[test]
fn test_new() {
let arr_data =
ArrayData::new(DataType::Boolean, 10, Some(1), None, 2, vec![], vec![]);
assert_eq!(10, arr_data.len());
assert_eq!(1, arr_data.null_count());
assert_eq!(2, arr_data.offset());
assert_eq!(0, arr_data.buffers().len());
assert_eq!(0, arr_data.child_data().len());
}
#[test]
fn test_builder() {
let child_arr_data = ArrayData::new(
DataType::Int32,
5,
Some(0),
None,
0,
vec![Buffer::from_slice_ref(&[1i32, 2, 3, 4, 5])],
vec![],
);
let v = vec![0, 1, 2, 3];
let b1 = Buffer::from(&v[..]);
let arr_data = ArrayData::builder(DataType::Int32)
.len(20)
.offset(5)
.add_buffer(b1)
.null_bit_buffer(Buffer::from(vec![
0b01011111, 0b10110101, 0b01100011, 0b00011110,
]))
.add_child_data(child_arr_data.clone())
.build();
assert_eq!(20, arr_data.len());
assert_eq!(10, arr_data.null_count());
assert_eq!(5, arr_data.offset());
assert_eq!(1, arr_data.buffers().len());
assert_eq!(&[0, 1, 2, 3], arr_data.buffers()[0].as_slice());
assert_eq!(1, arr_data.child_data().len());
assert_eq!(child_arr_data, arr_data.child_data()[0]);
}
#[test]
fn test_null_count() {
let mut bit_v: [u8; 2] = [0; 2];
bit_util::set_bit(&mut bit_v, 0);
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
.len(16)
.null_bit_buffer(Buffer::from(bit_v))
.build();
assert_eq!(13, arr_data.null_count());
// Test with offset
let mut bit_v: [u8; 2] = [0; 2];
bit_util::set_bit(&mut bit_v, 0);
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
.len(12)
.offset(2)
.null_bit_buffer(Buffer::from(bit_v))
.build();
assert_eq!(10, arr_data.null_count());
}
#[test]
fn test_null_buffer_ref() {
let mut bit_v: [u8; 2] = [0; 2];
bit_util::set_bit(&mut bit_v, 0);
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let arr_data = ArrayData::builder(DataType::Int32)
.len(16)
.null_bit_buffer(Buffer::from(bit_v))
.build();
assert!(arr_data.null_buffer().is_some());
assert_eq!(&bit_v, arr_data.null_buffer().unwrap().as_slice());
}
#[test]
fn test_slice() {
let mut bit_v: [u8; 2] = [0; 2];
bit_util::set_bit(&mut bit_v, 0);
bit_util::set_bit(&mut bit_v, 3);
bit_util::set_bit(&mut bit_v, 10);
let data = ArrayData::builder(DataType::Int32)
.len(16)
.null_bit_buffer(Buffer::from(bit_v))
.build();
let new_data = data.slice(1, 15);
assert_eq!(data.len() - 1, new_data.len());
assert_eq!(1, new_data.offset());
assert_eq!(data.null_count(), new_data.null_count());
// slice of a slice (removes one null)
let new_data = new_data.slice(1, 14);
assert_eq!(data.len() - 2, new_data.len());
assert_eq!(2, new_data.offset());
assert_eq!(data.null_count() - 1, new_data.null_count());
}
#[test]
fn test_equality() {
let int_data = ArrayData::builder(DataType::Int32).build();
let float_data = ArrayData::builder(DataType::Float32).build();
assert_ne!(int_data, float_data);
}
#[test]
fn test_count_nulls() {
let null_buffer = Some(Buffer::from(vec![0b00010110, 0b10011111]));
let count = count_nulls(null_buffer.as_ref(), 0, 16);
assert_eq!(count, 7);
let count = count_nulls(null_buffer.as_ref(), 4, 8);
assert_eq!(count, 3);
}
}