blob: 285a4f035e24fe03e0b1a43fa6f56e1326523691 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use std::any::Any;
use std::sync::Arc;
/// Builder for [`DictionaryArray`] of [`GenericByteArray`]
///
/// For example to map a set of byte indices to String values. Note that
/// the use of a `HashMap` here will not scale to very large arrays or
/// result in an ordered dictionary.
#[derive(Debug)]
pub struct GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
state: ahash::RandomState,
/// Used to provide a lookup from string value to key type
///
/// Note: usize's hash implementation is not used, instead the raw entry
/// API is used to store keys w.r.t the hash of the strings themselves
///
dedup: HashMap<usize, (), ()>,
keys_builder: PrimitiveBuilder<K>,
values_builder: GenericByteBuilder<T>,
}
impl<K, T> Default for GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
fn default() -> Self {
Self::new()
}
}
impl<K, T> GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
/// Creates a new `GenericByteDictionaryBuilder`
pub fn new() -> Self {
let keys_builder = PrimitiveBuilder::new();
let values_builder = GenericByteBuilder::<T>::new();
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
keys_builder,
values_builder,
}
}
/// Creates a new `GenericByteDictionaryBuilder` with the provided capacities
///
/// `keys_capacity`: the number of keys, i.e. length of array to build
/// `value_capacity`: the number of distinct dictionary values, i.e. size of dictionary
/// `data_capacity`: the total number of bytes of all distinct bytes in the dictionary
pub fn with_capacity(
keys_capacity: usize,
value_capacity: usize,
data_capacity: usize,
) -> Self {
Self {
state: Default::default(),
dedup: Default::default(),
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder: GenericByteBuilder::<T>::with_capacity(value_capacity, data_capacity),
}
}
/// Creates a new `GenericByteDictionaryBuilder` from a keys capacity and a dictionary
/// which is initialized with the given values.
/// The indices of those dictionary values are used as keys.
///
/// # Example
///
/// ```
/// # use arrow_array::builder::StringDictionaryBuilder;
/// # use arrow_array::{Int16Array, StringArray};
///
/// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
///
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(3, &dictionary_values).unwrap();
/// builder.append("def").unwrap();
/// builder.append_null();
/// builder.append("abc").unwrap();
///
/// let dictionary_array = builder.finish();
///
/// let keys = dictionary_array.keys();
///
/// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
/// ```
pub fn new_with_dictionary(
keys_capacity: usize,
dictionary_values: &GenericByteArray<T>,
) -> Result<Self, ArrowError> {
let state = ahash::RandomState::default();
let dict_len = dictionary_values.len();
let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
let values_len = dictionary_values.value_data().len();
let mut values_builder = GenericByteBuilder::<T>::with_capacity(dict_len, values_len);
K::Native::from_usize(dictionary_values.len())
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
for (idx, maybe_value) in dictionary_values.iter().enumerate() {
match maybe_value {
Some(value) => {
let value_bytes: &[u8] = value.as_ref();
let hash = state.hash_one(value_bytes);
let entry = dedup.raw_entry_mut().from_hash(hash, |idx: &usize| {
value_bytes == get_bytes(&values_builder, *idx)
});
if let RawEntryMut::Vacant(v) = entry {
v.insert_with_hasher(hash, idx, (), |idx| {
state.hash_one(get_bytes(&values_builder, *idx))
});
}
values_builder.append_value(value);
}
None => values_builder.append_null(),
}
}
Ok(Self {
state,
dedup,
keys_builder: PrimitiveBuilder::with_capacity(keys_capacity),
values_builder,
})
}
}
impl<K, T> ArrayBuilder for GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
/// Returns the builder as an non-mutable `Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
/// Returns the builder as an mutable `Any` reference.
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
/// Returns the boxed builder as a box of `Any`.
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
/// Returns the number of array slots in the builder
fn len(&self) -> usize {
self.keys_builder.len()
}
/// Builds the array and reset this builder.
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
/// Builds the array without resetting the builder.
fn finish_cloned(&self) -> ArrayRef {
Arc::new(self.finish_cloned())
}
}
impl<K, T> GenericByteDictionaryBuilder<K, T>
where
K: ArrowDictionaryKeyType,
T: ByteArrayType,
{
/// Append a value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<T::Native>) -> Result<K::Native, ArrowError> {
let value_native: &T::Native = value.as_ref();
let value_bytes: &[u8] = value_native.as_ref();
let state = &self.state;
let storage = &mut self.values_builder;
let hash = state.hash_one(value_bytes);
let entry = self
.dedup
.raw_entry_mut()
.from_hash(hash, |idx| value_bytes == get_bytes(storage, *idx));
let key = match entry {
RawEntryMut::Occupied(entry) => K::Native::usize_as(*entry.into_key()),
RawEntryMut::Vacant(entry) => {
let idx = storage.len();
storage.append_value(value);
entry.insert_with_hasher(hash, idx, (), |idx| {
state.hash_one(get_bytes(storage, *idx))
});
K::Native::from_usize(idx).ok_or(ArrowError::DictionaryKeyOverflowError)?
}
};
self.keys_builder.append_value(key);
Ok(key)
}
/// Infallibly append a value to this builder
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
self.append(value).expect("dictionary key overflow");
}
/// Appends a null slot into the builder
#[inline]
pub fn append_null(&mut self) {
self.keys_builder.append_null()
}
/// Append an `Option` value into the builder
///
/// # Panics
///
/// Panics if the resulting length of the dictionary values array would exceed `T::Native::MAX`
#[inline]
pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
match value {
None => self.append_null(),
Some(v) => self.append_value(v),
};
}
/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
let values = self.values_builder.finish();
let keys = self.keys_builder.finish();
let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);
DictionaryArray::from(unsafe { builder.build_unchecked() })
}
/// Builds the `DictionaryArray` without resetting the builder.
pub fn finish_cloned(&self) -> DictionaryArray<K> {
let values = self.values_builder.finish_cloned();
let keys = self.keys_builder.finish_cloned();
let data_type = DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(T::DATA_TYPE));
let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);
DictionaryArray::from(unsafe { builder.build_unchecked() })
}
/// Returns the current null buffer as a slice
pub fn validity_slice(&self) -> Option<&[u8]> {
self.keys_builder.validity_slice()
}
}
impl<K: ArrowDictionaryKeyType, T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>>
for GenericByteDictionaryBuilder<K, T>
{
#[inline]
fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
for v in iter {
self.append_option(v)
}
}
}
fn get_bytes<T: ByteArrayType>(values: &GenericByteBuilder<T>, idx: usize) -> &[u8] {
let offsets = values.offsets_slice();
let values = values.values_slice();
let end_offset = offsets[idx + 1].as_usize();
let start_offset = offsets[idx].as_usize();
&values[start_offset..end_offset]
}
/// Builder for [`DictionaryArray`] of [`StringArray`](crate::array::StringArray)
///
/// ```
/// // Create a dictionary array indexed by bytes whose values are Strings.
/// // It can thus hold up to 256 distinct string values.
///
/// # use arrow_array::builder::StringDictionaryBuilder;
/// # use arrow_array::{Int8Array, StringArray};
/// # use arrow_array::types::Int8Type;
///
/// let mut builder = StringDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append("def").unwrap();
/// builder.append("def").unwrap();
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
///
/// assert_eq!(ava.value(0), "abc");
/// assert_eq!(ava.value(1), "def");
///
/// ```
pub type StringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i32>>;
/// Builder for [`DictionaryArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
pub type LargeStringDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericStringType<i64>>;
/// Builder for [`DictionaryArray`] of [`BinaryArray`](crate::array::BinaryArray)
///
/// ```
/// // Create a dictionary array indexed by bytes whose values are binary.
/// // It can thus hold up to 256 distinct binary values.
///
/// # use arrow_array::builder::BinaryDictionaryBuilder;
/// # use arrow_array::{BinaryArray, Int8Array};
/// # use arrow_array::types::Int8Type;
///
/// let mut builder = BinaryDictionaryBuilder::<Int8Type>::new();
///
/// // The builder builds the dictionary value by value
/// builder.append(b"abc").unwrap();
/// builder.append_null();
/// builder.append(b"def").unwrap();
/// builder.append(b"def").unwrap();
/// builder.append(b"abc").unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &BinaryArray = av.as_any().downcast_ref::<BinaryArray>().unwrap();
///
/// assert_eq!(ava.value(0), b"abc");
/// assert_eq!(ava.value(1), b"def");
///
/// ```
pub type BinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i32>>;
/// Builder for [`DictionaryArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
pub type LargeBinaryDictionaryBuilder<K> = GenericByteDictionaryBuilder<K, GenericBinaryType<i64>>;
#[cfg(test)]
mod tests {
use super::*;
use crate::array::Int8Array;
use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
use crate::{BinaryArray, StringArray};
fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
where
T: ByteArrayType,
<T as ByteArrayType>::Native: PartialEq,
<T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
builder.append(values[0]).unwrap();
builder.append_null();
builder.append(values[1]).unwrap();
builder.append(values[1]).unwrap();
builder.append(values[0]).unwrap();
let array = builder.finish();
assert_eq!(
array.keys(),
&Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
);
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
assert_eq!(*ava.value(0), *values[0]);
assert_eq!(*ava.value(1), *values[1]);
}
#[test]
fn test_string_dictionary_builder() {
test_bytes_dictionary_builder::<GenericStringType<i32>>(vec!["abc", "def"]);
}
#[test]
fn test_binary_dictionary_builder() {
test_bytes_dictionary_builder::<GenericBinaryType<i32>>(vec![b"abc", b"def"]);
}
fn test_bytes_dictionary_builder_finish_cloned<T>(values: Vec<&T::Native>)
where
T: ByteArrayType,
<T as ByteArrayType>::Native: PartialEq,
<T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
let mut builder = GenericByteDictionaryBuilder::<Int8Type, T>::new();
builder.append(values[0]).unwrap();
builder.append_null();
builder.append(values[1]).unwrap();
builder.append(values[1]).unwrap();
builder.append(values[0]).unwrap();
let mut array = builder.finish_cloned();
assert_eq!(
array.keys(),
&Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
);
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
assert_eq!(ava.value(0), values[0]);
assert_eq!(ava.value(1), values[1]);
builder.append(values[0]).unwrap();
builder.append(values[2]).unwrap();
builder.append(values[1]).unwrap();
array = builder.finish();
assert_eq!(
array.keys(),
&Int8Array::from(vec![
Some(0),
None,
Some(1),
Some(1),
Some(0),
Some(0),
Some(2),
Some(1)
])
);
// Values are polymorphic and so require a downcast.
let av2 = array.values();
let ava2: &GenericByteArray<T> =
av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
assert_eq!(ava2.value(0), values[0]);
assert_eq!(ava2.value(1), values[1]);
assert_eq!(ava2.value(2), values[2]);
}
#[test]
fn test_string_dictionary_builder_finish_cloned() {
test_bytes_dictionary_builder_finish_cloned::<GenericStringType<i32>>(vec![
"abc", "def", "ghi",
]);
}
#[test]
fn test_binary_dictionary_builder_finish_cloned() {
test_bytes_dictionary_builder_finish_cloned::<GenericBinaryType<i32>>(vec![
b"abc", b"def", b"ghi",
]);
}
fn test_bytes_dictionary_builder_with_existing_dictionary<T>(
dictionary: GenericByteArray<T>,
values: Vec<&T::Native>,
) where
T: ByteArrayType,
<T as ByteArrayType>::Native: PartialEq,
<T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
let mut builder =
GenericByteDictionaryBuilder::<Int8Type, T>::new_with_dictionary(6, &dictionary)
.unwrap();
builder.append(values[0]).unwrap();
builder.append_null();
builder.append(values[1]).unwrap();
builder.append(values[1]).unwrap();
builder.append(values[0]).unwrap();
builder.append(values[2]).unwrap();
let array = builder.finish();
assert_eq!(
array.keys(),
&Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
);
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
assert!(!ava.is_valid(0));
assert_eq!(ava.value(1), values[1]);
assert_eq!(ava.value(2), values[0]);
assert_eq!(ava.value(3), values[2]);
}
#[test]
fn test_string_dictionary_builder_with_existing_dictionary() {
test_bytes_dictionary_builder_with_existing_dictionary::<GenericStringType<i32>>(
StringArray::from(vec![None, Some("def"), Some("abc")]),
vec!["abc", "def", "ghi"],
);
}
#[test]
fn test_binary_dictionary_builder_with_existing_dictionary() {
let values: Vec<Option<&[u8]>> = vec![None, Some(b"def"), Some(b"abc")];
test_bytes_dictionary_builder_with_existing_dictionary::<GenericBinaryType<i32>>(
BinaryArray::from(values),
vec![b"abc", b"def", b"ghi"],
);
}
fn test_bytes_dictionary_builder_with_reserved_null_value<T>(
dictionary: GenericByteArray<T>,
values: Vec<&T::Native>,
) where
T: ByteArrayType,
<T as ByteArrayType>::Native: PartialEq,
<T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
{
let mut builder =
GenericByteDictionaryBuilder::<Int16Type, T>::new_with_dictionary(4, &dictionary)
.unwrap();
builder.append(values[0]).unwrap();
builder.append_null();
builder.append(values[1]).unwrap();
builder.append(values[0]).unwrap();
let array = builder.finish();
assert!(array.is_null(1));
assert!(!array.is_valid(1));
let keys = array.keys();
assert_eq!(keys.value(0), 1);
assert!(keys.is_null(1));
// zero initialization is currently guaranteed by Buffer allocation and resizing
assert_eq!(keys.value(1), 0);
assert_eq!(keys.value(2), 2);
assert_eq!(keys.value(3), 1);
}
#[test]
fn test_string_dictionary_builder_with_reserved_null_value() {
let v: Vec<Option<&str>> = vec![None];
test_bytes_dictionary_builder_with_reserved_null_value::<GenericStringType<i32>>(
StringArray::from(v),
vec!["abc", "def"],
);
}
#[test]
fn test_binary_dictionary_builder_with_reserved_null_value() {
let values: Vec<Option<&[u8]>> = vec![None];
test_bytes_dictionary_builder_with_reserved_null_value::<GenericBinaryType<i32>>(
BinaryArray::from(values),
vec![b"abc", b"def"],
);
}
#[test]
fn test_extend() {
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
builder.extend(["c", "d", "a"].into_iter().map(Some));
let dict = builder.finish();
assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
assert_eq!(dict.values().len(), 4);
}
}