blob: d6c5dd4c3e137919593b74c1814b2154ae18107d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use crate::builder::{PrimitiveDictionaryBuilder, StringDictionaryBuilder};
use crate::cast::AsArray;
use crate::iterator::ArrayIter;
use crate::types::*;
use crate::{
make_array, Array, ArrayAccessor, ArrayRef, ArrowNativeTypeOp, PrimitiveArray, Scalar,
StringArray,
};
use arrow_buffer::bit_util::set_bit;
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::{ArrowNativeType, BooleanBuffer, BooleanBufferBuilder};
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::any::Any;
use std::sync::Arc;
/// A [`DictionaryArray`] indexed by `i8`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int8DictionaryArray, Int8Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int8DictionaryArray = DictionaryArray<Int8Type>;
/// A [`DictionaryArray`] indexed by `i16`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int16DictionaryArray, Int16Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int16Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int16DictionaryArray = DictionaryArray<Int16Type>;
/// A [`DictionaryArray`] indexed by `i32`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int32DictionaryArray, Int32Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int32Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int32DictionaryArray = DictionaryArray<Int32Type>;
/// A [`DictionaryArray`] indexed by `i64`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, Int64DictionaryArray, Int64Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: Int64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &Int64Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type Int64DictionaryArray = DictionaryArray<Int64Type>;
/// A [`DictionaryArray`] indexed by `u8`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt8DictionaryArray, UInt8Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt8DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt8Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt8DictionaryArray = DictionaryArray<UInt8Type>;
/// A [`DictionaryArray`] indexed by `u16`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt16DictionaryArray, UInt16Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt16DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt16Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt16DictionaryArray = DictionaryArray<UInt16Type>;
/// A [`DictionaryArray`] indexed by `u32`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt32DictionaryArray, UInt32Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt32DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt32Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt32DictionaryArray = DictionaryArray<UInt32Type>;
/// A [`DictionaryArray`] indexed by `u64`
///
/// # Example: Using `collect`
/// ```
/// # use arrow_array::{Array, UInt64DictionaryArray, UInt64Array, StringArray};
/// # use std::sync::Arc;
///
/// let array: UInt64DictionaryArray = vec!["a", "a", "b", "c"].into_iter().collect();
/// let values: Arc<dyn Array> = Arc::new(StringArray::from(vec!["a", "b", "c"]));
/// assert_eq!(array.keys(), &UInt64Array::from(vec![0, 0, 1, 2]));
/// assert_eq!(array.values(), &values);
/// ```
///
/// See [`DictionaryArray`] for more information and examples
pub type UInt64DictionaryArray = DictionaryArray<UInt64Type>;
/// An array of [dictionary encoded values](https://arrow.apache.org/docs/format/Columnar.html#dictionary-encoded-layout)
///
/// This is mostly used to represent strings or a limited set of primitive types as integers,
/// for example when doing NLP analysis or representing chromosomes by name.
///
/// [`DictionaryArray`] are represented using a `keys` array and a
/// `values` array, which may be different lengths. The `keys` array
/// stores indexes in the `values` array which holds
/// the corresponding logical value, as shown here:
///
/// ```text
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
/// ┌─────────────────┐ ┌─────────┐ │ ┌─────────────────┐
/// │ │ A │ │ 0 │ │ A │ values[keys[0]]
/// ├─────────────────┤ ├─────────┤ │ ├─────────────────┤
/// │ │ D │ │ 2 │ │ B │ values[keys[1]]
/// ├─────────────────┤ ├─────────┤ │ ├─────────────────┤
/// │ │ B │ │ 2 │ │ B │ values[keys[2]]
/// └─────────────────┘ ├─────────┤ │ ├─────────────────┤
/// │ │ 1 │ │ D │ values[keys[3]]
/// ├─────────┤ │ ├─────────────────┤
/// │ │ 1 │ │ D │ values[keys[4]]
/// ├─────────┤ │ ├─────────────────┤
/// │ │ 0 │ │ A │ values[keys[5]]
/// └─────────┘ │ └─────────────────┘
/// │ values keys
/// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
/// Logical array
/// Contents
/// DictionaryArray
/// length = 6
/// ```
///
/// # Example: From Nullable Data
///
/// ```
/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
/// let test = vec!["a", "a", "b", "c"];
/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
/// ```
///
/// # Example: From Non-Nullable Data
///
/// ```
/// # use arrow_array::{DictionaryArray, Int8Array, types::Int8Type};
/// let test = vec!["a", "a", "b", "c"];
/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
/// ```
///
/// # Example: From Existing Arrays
///
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::{DictionaryArray, Int8Array, StringArray, types::Int8Type};
/// // You can form your own DictionaryArray by providing the
/// // values (dictionary) and keys (indexes into the dictionary):
/// let values = StringArray::from_iter_values(["a", "b", "c"]);
/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
/// let array = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
/// let expected: DictionaryArray::<Int8Type> = vec!["a", "a", "b", "c"].into_iter().collect();
/// assert_eq!(&array, &expected);
/// ```
///
/// # Example: Using Builder
///
/// ```
/// # use arrow_array::{Array, StringArray};
/// # use arrow_array::builder::StringDictionaryBuilder;
/// # use arrow_array::types::Int32Type;
/// let mut builder = StringDictionaryBuilder::<Int32Type>::new();
/// builder.append_value("a");
/// builder.append_null();
/// builder.append_value("a");
/// builder.append_value("b");
/// let array = builder.finish();
///
/// let values: Vec<_> = array.downcast_dict::<StringArray>().unwrap().into_iter().collect();
/// assert_eq!(&values, &[Some("a"), None, Some("a"), Some("b")]);
/// ```
pub struct DictionaryArray<K: ArrowDictionaryKeyType> {
data_type: DataType,
/// The keys of this dictionary. These are constructed from the
/// buffer and null bitmap of `data`. Also, note that these do
/// not correspond to the true values of this array. Rather, they
/// map to the real values.
keys: PrimitiveArray<K>,
/// Array of dictionary values (can by any DataType).
values: ArrayRef,
/// Values are ordered.
is_ordered: bool,
}
impl<K: ArrowDictionaryKeyType> Clone for DictionaryArray<K> {
fn clone(&self) -> Self {
Self {
data_type: self.data_type.clone(),
keys: self.keys.clone(),
values: self.values.clone(),
is_ordered: self.is_ordered,
}
}
}
impl<K: ArrowDictionaryKeyType> DictionaryArray<K> {
/// Attempt to create a new DictionaryArray with a specified keys
/// (indexes into the dictionary) and values (dictionary)
/// array.
///
/// # Panics
///
/// Panics if [`Self::try_new`] returns an error
pub fn new(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
Self::try_new(keys, values).unwrap()
}
/// Attempt to create a new DictionaryArray with a specified keys
/// (indexes into the dictionary) and values (dictionary)
/// array.
///
/// # Errors
///
/// Returns an error if any `keys[i] >= values.len() || keys[i] < 0`
pub fn try_new(keys: PrimitiveArray<K>, values: ArrayRef) -> Result<Self, ArrowError> {
let data_type = DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
);
let zero = K::Native::usize_as(0);
let values_len = values.len();
if let Some((idx, v)) =
keys.values().iter().enumerate().find(|(idx, v)| {
(v.is_lt(zero) || v.as_usize() >= values_len) && keys.is_valid(*idx)
})
{
return Err(ArrowError::InvalidArgumentError(format!(
"Invalid dictionary key {v:?} at index {idx}, expected 0 <= key < {values_len}",
)));
}
Ok(Self {
data_type,
keys,
values,
is_ordered: false,
})
}
/// Create a new [`Scalar`] from `value`
pub fn new_scalar<T: Array + 'static>(value: Scalar<T>) -> Scalar<Self> {
Scalar::new(Self::new(
PrimitiveArray::new(vec![K::Native::usize_as(0)].into(), None),
Arc::new(value.into_inner()),
))
}
/// Create a new [`DictionaryArray`] without performing validation
///
/// # Safety
///
/// Safe provided [`Self::try_new`] would not return an error
pub unsafe fn new_unchecked(keys: PrimitiveArray<K>, values: ArrayRef) -> Self {
let data_type = DataType::Dictionary(
Box::new(keys.data_type().clone()),
Box::new(values.data_type().clone()),
);
Self {
data_type,
keys,
values,
is_ordered: false,
}
}
/// Deconstruct this array into its constituent parts
pub fn into_parts(self) -> (PrimitiveArray<K>, ArrayRef) {
(self.keys, self.values)
}
/// Return an array view of the keys of this dictionary as a PrimitiveArray.
pub fn keys(&self) -> &PrimitiveArray<K> {
&self.keys
}
/// If `value` is present in `values` (aka the dictionary),
/// returns the corresponding key (index into the `values`
/// array). Otherwise returns `None`.
///
/// Panics if `values` is not a [`StringArray`].
pub fn lookup_key(&self, value: &str) -> Option<K::Native> {
let rd_buf: &StringArray = self.values.as_any().downcast_ref::<StringArray>().unwrap();
(0..rd_buf.len())
.position(|i| rd_buf.value(i) == value)
.and_then(K::Native::from_usize)
}
/// Returns a reference to the dictionary values array
pub fn values(&self) -> &ArrayRef {
&self.values
}
/// Returns a clone of the value type of this list.
pub fn value_type(&self) -> DataType {
self.values.data_type().clone()
}
/// The length of the dictionary is the length of the keys array.
pub fn len(&self) -> usize {
self.keys.len()
}
/// Whether this dictionary is empty
pub fn is_empty(&self) -> bool {
self.keys.is_empty()
}
/// Currently exists for compatibility purposes with Arrow IPC.
pub fn is_ordered(&self) -> bool {
self.is_ordered
}
/// Return an iterator over the keys (indexes into the dictionary)
pub fn keys_iter(&self) -> impl Iterator<Item = Option<usize>> + '_ {
self.keys.iter().map(|key| key.map(|k| k.as_usize()))
}
/// Return the value of `keys` (the dictionary key) at index `i`,
/// cast to `usize`, `None` if the value at `i` is `NULL`.
pub fn key(&self, i: usize) -> Option<usize> {
self.keys.is_valid(i).then(|| self.keys.value(i).as_usize())
}
/// Returns a zero-copy slice of this array with the indicated offset and length.
pub fn slice(&self, offset: usize, length: usize) -> Self {
Self {
data_type: self.data_type.clone(),
keys: self.keys.slice(offset, length),
values: self.values.clone(),
is_ordered: self.is_ordered,
}
}
/// Downcast this dictionary to a [`TypedDictionaryArray`]
///
/// ```
/// use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::Int32Type};
///
/// let orig = [Some("a"), Some("b"), None];
/// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
/// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
/// assert_eq!(typed.value(0), "a");
/// assert_eq!(typed.value(1), "b");
/// assert!(typed.is_null(2));
/// ```
///
pub fn downcast_dict<V: 'static>(&self) -> Option<TypedDictionaryArray<'_, K, V>> {
let values = self.values.as_any().downcast_ref()?;
Some(TypedDictionaryArray {
dictionary: self,
values,
})
}
/// Returns a new dictionary with the same keys as the current instance
/// but with a different set of dictionary values
///
/// This can be used to perform an operation on the values of a dictionary
///
/// # Panics
///
/// Panics if `values` has a length less than the current values
///
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
/// # use arrow_array::{Int8Array, Int64Array, ArrayAccessor};
/// # use arrow_array::types::{Int32Type, Int8Type};
///
/// // Construct a Dict(Int32, Int8)
/// let mut builder = PrimitiveDictionaryBuilder::<Int32Type, Int8Type>::with_capacity(2, 200);
/// for i in 0..100 {
/// builder.append(i % 2).unwrap();
/// }
///
/// let dictionary = builder.finish();
///
/// // Perform a widening cast of dictionary values
/// let typed_dictionary = dictionary.downcast_dict::<Int8Array>().unwrap();
/// let values: Int64Array = typed_dictionary.values().unary(|x| x as i64);
///
/// // Create a Dict(Int32,
/// let new = dictionary.with_values(Arc::new(values));
///
/// // Verify values are as expected
/// let new_typed = new.downcast_dict::<Int64Array>().unwrap();
/// for i in 0..100 {
/// assert_eq!(new_typed.value(i), (i % 2) as i64)
/// }
/// ```
///
pub fn with_values(&self, values: ArrayRef) -> Self {
assert!(values.len() >= self.values.len());
let data_type =
DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(values.data_type().clone()));
Self {
data_type,
keys: self.keys.clone(),
values,
is_ordered: false,
}
}
/// Returns `PrimitiveDictionaryBuilder` of this dictionary array for mutating
/// its keys and values if the underlying data buffer is not shared by others.
pub fn into_primitive_dict_builder<V>(self) -> Result<PrimitiveDictionaryBuilder<K, V>, Self>
where
V: ArrowPrimitiveType,
{
if !self.value_type().is_primitive() {
return Err(self);
}
let key_array = self.keys().clone();
let value_array = self.values().as_primitive::<V>().clone();
drop(self.keys);
drop(self.values);
let key_builder = key_array.into_builder();
let value_builder = value_array.into_builder();
match (key_builder, value_builder) {
(Ok(key_builder), Ok(value_builder)) => Ok(unsafe {
PrimitiveDictionaryBuilder::new_from_builders(key_builder, value_builder)
}),
(Err(key_array), Ok(mut value_builder)) => {
Err(Self::try_new(key_array, Arc::new(value_builder.finish())).unwrap())
}
(Ok(mut key_builder), Err(value_array)) => {
Err(Self::try_new(key_builder.finish(), Arc::new(value_array)).unwrap())
}
(Err(key_array), Err(value_array)) => {
Err(Self::try_new(key_array, Arc::new(value_array)).unwrap())
}
}
}
/// Applies an unary and infallible function to a mutable dictionary array.
/// Mutable dictionary array means that the buffers are not shared with other arrays.
/// As a result, this mutates the buffers directly without allocating new buffers.
///
/// # Implementation
///
/// This will apply the function for all dictionary values, including those on null slots.
/// This implies that the operation must be infallible for any value of the corresponding type
/// or this function may panic.
/// # Example
/// ```
/// # use std::sync::Arc;
/// # use arrow_array::{Array, ArrayAccessor, DictionaryArray, StringArray, types::{Int8Type, Int32Type}};
/// # use arrow_array::{Int8Array, Int32Array};
/// let values = Int32Array::from(vec![Some(10), Some(20), None]);
/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
/// let dictionary = DictionaryArray::<Int8Type>::try_new(keys, Arc::new(values)).unwrap();
/// let c = dictionary.unary_mut::<_, Int32Type>(|x| x + 1).unwrap();
/// let typed = c.downcast_dict::<Int32Array>().unwrap();
/// assert_eq!(typed.value(0), 11);
/// assert_eq!(typed.value(1), 11);
/// assert_eq!(typed.value(2), 21);
/// ```
pub fn unary_mut<F, V>(self, op: F) -> Result<DictionaryArray<K>, DictionaryArray<K>>
where
V: ArrowPrimitiveType,
F: Fn(V::Native) -> V::Native,
{
let mut builder: PrimitiveDictionaryBuilder<K, V> = self.into_primitive_dict_builder()?;
builder
.values_slice_mut()
.iter_mut()
.for_each(|v| *v = op(*v));
Ok(builder.finish())
}
/// Computes an occupancy mask for this dictionary's values
///
/// For each value in [`Self::values`] the corresponding bit will be set in the
/// returned mask if it is referenced by a key in this [`DictionaryArray`]
pub fn occupancy(&self) -> BooleanBuffer {
let len = self.values.len();
let mut builder = BooleanBufferBuilder::new(len);
builder.resize(len);
let slice = builder.as_slice_mut();
match self.keys.nulls().filter(|n| n.null_count() > 0) {
Some(n) => {
let v = self.keys.values();
n.valid_indices()
.for_each(|idx| set_bit(slice, v[idx].as_usize()))
}
None => {
let v = self.keys.values();
v.iter().for_each(|v| set_bit(slice, v.as_usize()))
}
}
builder.finish()
}
}
/// Constructs a `DictionaryArray` from an array data reference.
impl<T: ArrowDictionaryKeyType> From<ArrayData> for DictionaryArray<T> {
fn from(data: ArrayData) -> Self {
assert_eq!(
data.buffers().len(),
1,
"DictionaryArray data should contain a single buffer only (keys)."
);
assert_eq!(
data.child_data().len(),
1,
"DictionaryArray should contain a single child array (values)."
);
if let DataType::Dictionary(key_data_type, _) = data.data_type() {
assert_eq!(
&T::DATA_TYPE,
key_data_type.as_ref(),
"DictionaryArray's data type must match, expected {} got {}",
T::DATA_TYPE,
key_data_type
);
let values = make_array(data.child_data()[0].clone());
let data_type = data.data_type().clone();
// create a zero-copy of the keys' data
// SAFETY:
// ArrayData is valid and verified type above
let keys = PrimitiveArray::<T>::from(unsafe {
data.into_builder()
.data_type(T::DATA_TYPE)
.child_data(vec![])
.build_unchecked()
});
Self {
data_type,
keys,
values,
is_ordered: false,
}
} else {
panic!("DictionaryArray must have Dictionary data type.")
}
}
}
impl<T: ArrowDictionaryKeyType> From<DictionaryArray<T>> for ArrayData {
fn from(array: DictionaryArray<T>) -> Self {
let builder = array
.keys
.into_data()
.into_builder()
.data_type(array.data_type)
.child_data(vec![array.values.to_data()]);
unsafe { builder.build_unchecked() }
}
}
/// Constructs a `DictionaryArray` from an iterator of optional strings.
///
/// # Example:
/// ```
/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
///
/// let test = vec!["a", "a", "b", "c"];
/// let array: DictionaryArray<Int8Type> = test
/// .iter()
/// .map(|&x| if x == "b" { None } else { Some(x) })
/// .collect();
/// assert_eq!(
/// "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
/// format!("{:?}", array)
/// );
/// ```
impl<'a, T: ArrowDictionaryKeyType> FromIterator<Option<&'a str>> for DictionaryArray<T> {
fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
builder.extend(it);
builder.finish()
}
}
/// Constructs a `DictionaryArray` from an iterator of strings.
///
/// # Example:
///
/// ```
/// use arrow_array::{DictionaryArray, PrimitiveArray, StringArray, types::Int8Type};
///
/// let test = vec!["a", "a", "b", "c"];
/// let array: DictionaryArray<Int8Type> = test.into_iter().collect();
/// assert_eq!(
/// "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
/// format!("{:?}", array)
/// );
/// ```
impl<'a, T: ArrowDictionaryKeyType> FromIterator<&'a str> for DictionaryArray<T> {
fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self {
let it = iter.into_iter();
let (lower, _) = it.size_hint();
let mut builder = StringDictionaryBuilder::with_capacity(lower, 256, 1024);
it.for_each(|i| {
builder
.append(i)
.expect("Unable to append a value to a dictionary array.");
});
builder.finish()
}
}
impl<T: ArrowDictionaryKeyType> Array for DictionaryArray<T> {
fn as_any(&self) -> &dyn Any {
self
}
fn to_data(&self) -> ArrayData {
self.clone().into()
}
fn into_data(self) -> ArrayData {
self.into()
}
fn data_type(&self) -> &DataType {
&self.data_type
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.slice(offset, length))
}
fn len(&self) -> usize {
self.keys.len()
}
fn is_empty(&self) -> bool {
self.keys.is_empty()
}
fn offset(&self) -> usize {
self.keys.offset()
}
fn nulls(&self) -> Option<&NullBuffer> {
self.keys.nulls()
}
fn logical_nulls(&self) -> Option<NullBuffer> {
match self.values.nulls() {
None => self.nulls().cloned(),
Some(value_nulls) => {
let mut builder = BooleanBufferBuilder::new(self.len());
match self.keys.nulls() {
Some(n) => builder.append_buffer(n.inner()),
None => builder.append_n(self.len(), true),
}
for (idx, k) in self.keys.values().iter().enumerate() {
let k = k.as_usize();
// Check range to allow for nulls
if k < value_nulls.len() && value_nulls.is_null(k) {
builder.set_bit(idx, false);
}
}
Some(builder.finish().into())
}
}
}
fn is_nullable(&self) -> bool {
!self.is_empty() && (self.nulls().is_some() || self.values.is_nullable())
}
fn get_buffer_memory_size(&self) -> usize {
self.keys.get_buffer_memory_size() + self.values.get_buffer_memory_size()
}
fn get_array_memory_size(&self) -> usize {
std::mem::size_of::<Self>()
+ self.keys.get_buffer_memory_size()
+ self.values.get_array_memory_size()
}
}
impl<T: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryArray<T> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
writeln!(
f,
"DictionaryArray {{keys: {:?} values: {:?}}}",
self.keys, self.values
)
}
}
/// A [`DictionaryArray`] typed on its child values array
///
/// Implements [`ArrayAccessor`] allowing fast access to its elements
///
/// ```
/// use arrow_array::{DictionaryArray, StringArray, types::Int32Type};
///
/// let orig = ["a", "b", "a", "b"];
/// let dictionary = DictionaryArray::<Int32Type>::from_iter(orig);
///
/// // `TypedDictionaryArray` allows you to access the values directly
/// let typed = dictionary.downcast_dict::<StringArray>().unwrap();
///
/// for (maybe_val, orig) in typed.into_iter().zip(orig) {
/// assert_eq!(maybe_val.unwrap(), orig)
/// }
/// ```
pub struct TypedDictionaryArray<'a, K: ArrowDictionaryKeyType, V> {
/// The dictionary array
dictionary: &'a DictionaryArray<K>,
/// The values of the dictionary
values: &'a V,
}
// Manually implement `Clone` to avoid `V: Clone` type constraint
impl<'a, K: ArrowDictionaryKeyType, V> Clone for TypedDictionaryArray<'a, K, V> {
fn clone(&self) -> Self {
*self
}
}
impl<'a, K: ArrowDictionaryKeyType, V> Copy for TypedDictionaryArray<'a, K, V> {}
impl<'a, K: ArrowDictionaryKeyType, V> std::fmt::Debug for TypedDictionaryArray<'a, K, V> {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
writeln!(f, "TypedDictionaryArray({:?})", self.dictionary)
}
}
impl<'a, K: ArrowDictionaryKeyType, V> TypedDictionaryArray<'a, K, V> {
/// Returns the keys of this [`TypedDictionaryArray`]
pub fn keys(&self) -> &'a PrimitiveArray<K> {
self.dictionary.keys()
}
/// Returns the values of this [`TypedDictionaryArray`]
pub fn values(&self) -> &'a V {
self.values
}
}
impl<'a, K: ArrowDictionaryKeyType, V: Sync> Array for TypedDictionaryArray<'a, K, V> {
fn as_any(&self) -> &dyn Any {
self.dictionary
}
fn to_data(&self) -> ArrayData {
self.dictionary.to_data()
}
fn into_data(self) -> ArrayData {
self.dictionary.into_data()
}
fn data_type(&self) -> &DataType {
self.dictionary.data_type()
}
fn slice(&self, offset: usize, length: usize) -> ArrayRef {
Arc::new(self.dictionary.slice(offset, length))
}
fn len(&self) -> usize {
self.dictionary.len()
}
fn is_empty(&self) -> bool {
self.dictionary.is_empty()
}
fn offset(&self) -> usize {
self.dictionary.offset()
}
fn nulls(&self) -> Option<&NullBuffer> {
self.dictionary.nulls()
}
fn logical_nulls(&self) -> Option<NullBuffer> {
self.dictionary.logical_nulls()
}
fn is_nullable(&self) -> bool {
self.dictionary.is_nullable()
}
fn get_buffer_memory_size(&self) -> usize {
self.dictionary.get_buffer_memory_size()
}
fn get_array_memory_size(&self) -> usize {
self.dictionary.get_array_memory_size()
}
}
impl<'a, K, V> IntoIterator for TypedDictionaryArray<'a, K, V>
where
K: ArrowDictionaryKeyType,
Self: ArrayAccessor,
{
type Item = Option<<Self as ArrayAccessor>::Item>;
type IntoIter = ArrayIter<Self>;
fn into_iter(self) -> Self::IntoIter {
ArrayIter::new(self)
}
}
impl<'a, K, V> ArrayAccessor for TypedDictionaryArray<'a, K, V>
where
K: ArrowDictionaryKeyType,
V: Sync + Send,
&'a V: ArrayAccessor,
<&'a V as ArrayAccessor>::Item: Default,
{
type Item = <&'a V as ArrayAccessor>::Item;
fn value(&self, index: usize) -> Self::Item {
assert!(
index < self.len(),
"Trying to access an element at index {} from a TypedDictionaryArray of length {}",
index,
self.len()
);
unsafe { self.value_unchecked(index) }
}
unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
let val = self.dictionary.keys.value_unchecked(index);
let value_idx = val.as_usize();
// As dictionary keys are only verified for non-null indexes
// we must check the value is within bounds
match value_idx < self.values.len() {
true => self.values.value_unchecked(value_idx),
false => Default::default(),
}
}
}
/// A [`DictionaryArray`] with the key type erased
///
/// This can be used to efficiently implement kernels for all possible dictionary
/// keys without needing to create specialized implementations for each key type
///
/// For example
///
/// ```
/// # use arrow_array::*;
/// # use arrow_array::cast::AsArray;
/// # use arrow_array::builder::PrimitiveDictionaryBuilder;
/// # use arrow_array::types::*;
/// # use arrow_schema::ArrowError;
/// # use std::sync::Arc;
///
/// fn to_string(a: &dyn Array) -> Result<ArrayRef, ArrowError> {
/// if let Some(d) = a.as_any_dictionary_opt() {
/// // Recursively handle dictionary input
/// let r = to_string(d.values().as_ref())?;
/// return Ok(d.with_values(r));
/// }
/// downcast_primitive_array! {
/// a => Ok(Arc::new(a.iter().map(|x| x.map(|x| format!("{x:?}"))).collect::<StringArray>())),
/// d => Err(ArrowError::InvalidArgumentError(format!("{d:?} not supported")))
/// }
/// }
///
/// let result = to_string(&Int32Array::from(vec![1, 2, 3])).unwrap();
/// let actual = result.as_string::<i32>().iter().map(Option::unwrap).collect::<Vec<_>>();
/// assert_eq!(actual, &["1", "2", "3"]);
///
/// let mut dict = PrimitiveDictionaryBuilder::<Int32Type, UInt16Type>::new();
/// dict.extend([Some(1), Some(1), Some(2), Some(3), Some(2)]);
/// let dict = dict.finish();
///
/// let r = to_string(&dict).unwrap();
/// let r = r.as_dictionary::<Int32Type>().downcast_dict::<StringArray>().unwrap();
/// assert_eq!(r.keys(), dict.keys()); // Keys are the same
///
/// let actual = r.into_iter().map(Option::unwrap).collect::<Vec<_>>();
/// assert_eq!(actual, &["1", "1", "2", "3", "2"]);
/// ```
///
/// See [`AsArray::as_any_dictionary_opt`] and [`AsArray::as_any_dictionary`]
pub trait AnyDictionaryArray: Array {
/// Returns the primitive keys of this dictionary as an [`Array`]
fn keys(&self) -> &dyn Array;
/// Returns the values of this dictionary
fn values(&self) -> &ArrayRef;
/// Returns the keys of this dictionary as usize
///
/// The values for nulls will be arbitrary, but are guaranteed
/// to be in the range `0..self.values.len()`
///
/// # Panic
///
/// Panics if `values.len() == 0`
fn normalized_keys(&self) -> Vec<usize>;
/// Create a new [`DictionaryArray`] replacing `values` with the new values
///
/// See [`DictionaryArray::with_values`]
fn with_values(&self, values: ArrayRef) -> ArrayRef;
}
impl<K: ArrowDictionaryKeyType> AnyDictionaryArray for DictionaryArray<K> {
fn keys(&self) -> &dyn Array {
&self.keys
}
fn values(&self) -> &ArrayRef {
self.values()
}
fn normalized_keys(&self) -> Vec<usize> {
let v_len = self.values().len();
assert_ne!(v_len, 0);
let iter = self.keys().values().iter();
iter.map(|x| x.as_usize().min(v_len - 1)).collect()
}
fn with_values(&self, values: ArrayRef) -> ArrayRef {
Arc::new(self.with_values(values))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::cast::as_dictionary_array;
use crate::{Int16Array, Int32Array, Int8Array};
use arrow_buffer::{Buffer, ToByteSlice};
#[test]
fn test_dictionary_array() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int8)
.len(8)
.add_buffer(Buffer::from(
[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(),
))
.build()
.unwrap();
// Construct a buffer for value offsets, for the nested array:
let keys = Buffer::from([2_i16, 3, 4].to_byte_slice());
// Construct a dictionary array from the above two
let key_type = DataType::Int16;
let value_type = DataType::Int8;
let dict_data_type = DataType::Dictionary(Box::new(key_type), Box::new(value_type));
let dict_data = ArrayData::builder(dict_data_type.clone())
.len(3)
.add_buffer(keys.clone())
.add_child_data(value_data.clone())
.build()
.unwrap();
let dict_array = Int16DictionaryArray::from(dict_data);
let values = dict_array.values();
assert_eq!(value_data, values.to_data());
assert_eq!(DataType::Int8, dict_array.value_type());
assert_eq!(3, dict_array.len());
// Null count only makes sense in terms of the component arrays.
assert_eq!(0, dict_array.null_count());
assert_eq!(0, dict_array.values().null_count());
assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4]));
// Now test with a non-zero offset
let dict_data = ArrayData::builder(dict_data_type)
.len(2)
.offset(1)
.add_buffer(keys)
.add_child_data(value_data.clone())
.build()
.unwrap();
let dict_array = Int16DictionaryArray::from(dict_data);
let values = dict_array.values();
assert_eq!(value_data, values.to_data());
assert_eq!(DataType::Int8, dict_array.value_type());
assert_eq!(2, dict_array.len());
assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
}
#[test]
fn test_dictionary_array_fmt_debug() {
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(3, 2);
builder.append(12345678).unwrap();
builder.append_null();
builder.append(22345678).unwrap();
let array = builder.finish();
assert_eq!(
"DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray<UInt32>\n[\n 12345678,\n 22345678,\n]}\n",
format!("{array:?}")
);
let mut builder = PrimitiveDictionaryBuilder::<UInt8Type, UInt32Type>::with_capacity(20, 2);
for _ in 0..20 {
builder.append(1).unwrap();
}
let array = builder.finish();
assert_eq!(
"DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray<UInt32>\n[\n 1,\n]}\n",
format!("{array:?}")
);
}
#[test]
fn test_dictionary_array_from_iter() {
let test = vec!["a", "a", "b", "c"];
let array: DictionaryArray<Int8Type> = test
.iter()
.map(|&x| if x == "b" { None } else { Some(x) })
.collect();
assert_eq!(
"DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
format!("{array:?}")
);
let array: DictionaryArray<Int8Type> = test.into_iter().collect();
assert_eq!(
"DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
format!("{array:?}")
);
}
#[test]
fn test_dictionary_array_reverse_lookup_key() {
let test = vec!["a", "a", "b", "c"];
let array: DictionaryArray<Int8Type> = test.into_iter().collect();
assert_eq!(array.lookup_key("c"), Some(2));
// Direction of building a dictionary is the iterator direction
let test = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"];
let array: DictionaryArray<Int8Type> = test.into_iter().collect();
assert_eq!(array.lookup_key("t1"), Some(2));
assert_eq!(array.lookup_key("non-existent"), None);
}
#[test]
fn test_dictionary_keys_as_primitive_array() {
let test = vec!["a", "b", "c", "a"];
let array: DictionaryArray<Int8Type> = test.into_iter().collect();
let keys = array.keys();
assert_eq!(&DataType::Int8, keys.data_type());
assert_eq!(0, keys.null_count());
assert_eq!(&[0, 1, 2, 0], keys.values());
}
#[test]
fn test_dictionary_keys_as_primitive_array_with_null() {
let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
let array: DictionaryArray<Int32Type> = test.into_iter().collect();
let keys = array.keys();
assert_eq!(&DataType::Int32, keys.data_type());
assert_eq!(3, keys.null_count());
assert!(keys.is_valid(0));
assert!(!keys.is_valid(1));
assert!(keys.is_valid(2));
assert!(!keys.is_valid(3));
assert!(!keys.is_valid(4));
assert!(keys.is_valid(5));
assert_eq!(0, keys.value(0));
assert_eq!(1, keys.value(2));
assert_eq!(0, keys.value(5));
}
#[test]
fn test_dictionary_all_nulls() {
let test = vec![None, None, None];
let array: DictionaryArray<Int32Type> = test.into_iter().collect();
array
.into_data()
.validate_full()
.expect("All null array has valid array data");
}
#[test]
fn test_dictionary_iter() {
// Construct a value array
let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
let keys = Int16Array::from_iter_values([2_i16, 3, 4]);
// Construct a dictionary array from the above two
let dict_array = DictionaryArray::new(keys, Arc::new(values));
let mut key_iter = dict_array.keys_iter();
assert_eq!(2, key_iter.next().unwrap().unwrap());
assert_eq!(3, key_iter.next().unwrap().unwrap());
assert_eq!(4, key_iter.next().unwrap().unwrap());
assert!(key_iter.next().is_none());
let mut iter = dict_array
.values()
.as_any()
.downcast_ref::<Int8Array>()
.unwrap()
.take_iter(dict_array.keys_iter());
assert_eq!(12, iter.next().unwrap().unwrap());
assert_eq!(13, iter.next().unwrap().unwrap());
assert_eq!(14, iter.next().unwrap().unwrap());
assert!(iter.next().is_none());
}
#[test]
fn test_dictionary_iter_with_null() {
let test = vec![Some("a"), None, Some("b"), None, None, Some("a")];
let array: DictionaryArray<Int32Type> = test.into_iter().collect();
let mut iter = array
.values()
.as_any()
.downcast_ref::<StringArray>()
.unwrap()
.take_iter(array.keys_iter());
assert_eq!("a", iter.next().unwrap().unwrap());
assert!(iter.next().unwrap().is_none());
assert_eq!("b", iter.next().unwrap().unwrap());
assert!(iter.next().unwrap().is_none());
assert!(iter.next().unwrap().is_none());
assert_eq!("a", iter.next().unwrap().unwrap());
assert!(iter.next().is_none());
}
#[test]
fn test_dictionary_key() {
let keys = Int8Array::from(vec![Some(2), None, Some(1)]);
let values = StringArray::from(vec!["foo", "bar", "baz", "blarg"]);
let array = DictionaryArray::new(keys, Arc::new(values));
assert_eq!(array.key(0), Some(2));
assert_eq!(array.key(1), None);
assert_eq!(array.key(2), Some(1));
}
#[test]
fn test_try_new() {
let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
.into_iter()
.collect();
let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect();
let array = DictionaryArray::new(keys, Arc::new(values));
assert_eq!(array.keys().data_type(), &DataType::Int32);
assert_eq!(array.values().data_type(), &DataType::Utf8);
assert_eq!(array.null_count(), 1);
assert!(array.keys().is_valid(0));
assert!(array.keys().is_valid(1));
assert!(array.keys().is_null(2));
assert!(array.keys().is_valid(3));
assert_eq!(array.keys().value(0), 0);
assert_eq!(array.keys().value(1), 2);
assert_eq!(array.keys().value(3), 1);
assert_eq!(
"DictionaryArray {keys: PrimitiveArray<Int32>\n[\n 0,\n 2,\n null,\n 1,\n] values: StringArray\n[\n \"foo\",\n \"bar\",\n \"baz\",\n]}\n",
format!("{array:?}")
);
}
#[test]
#[should_panic(expected = "Invalid dictionary key 3 at index 1, expected 0 <= key < 2")]
fn test_try_new_index_too_large() {
let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
// dictionary only has 2 values, so offset 3 is out of bounds
let keys: Int32Array = [Some(0), Some(3)].into_iter().collect();
DictionaryArray::new(keys, Arc::new(values));
}
#[test]
#[should_panic(expected = "Invalid dictionary key -100 at index 0, expected 0 <= key < 2")]
fn test_try_new_index_too_small() {
let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
let keys: Int32Array = [Some(-100)].into_iter().collect();
DictionaryArray::new(keys, Arc::new(values));
}
#[test]
#[should_panic(expected = "DictionaryArray's data type must match, expected Int64 got Int32")]
fn test_from_array_data_validation() {
let a = DictionaryArray::<Int32Type>::from_iter(["32"]);
let _ = DictionaryArray::<Int64Type>::from(a.into_data());
}
#[test]
fn test_into_primitive_dict_builder() {
let values = Int32Array::from_iter_values([10_i32, 12, 15]);
let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
let dict_array = DictionaryArray::new(keys, Arc::new(values));
let boxed: ArrayRef = Arc::new(dict_array);
let col: DictionaryArray<Int8Type> = as_dictionary_array(&boxed).clone();
drop(boxed);
let mut builder = col.into_primitive_dict_builder::<Int32Type>().unwrap();
let slice = builder.values_slice_mut();
assert_eq!(slice, &[10, 12, 15]);
slice[0] = 4;
slice[1] = 2;
slice[2] = 1;
let values = Int32Array::from_iter_values([4_i32, 2, 1]);
let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
let expected = DictionaryArray::new(keys, Arc::new(values));
let new_array = builder.finish();
assert_eq!(expected, new_array);
}
#[test]
fn test_into_primitive_dict_builder_cloned_array() {
let values = Int32Array::from_iter_values([10_i32, 12, 15]);
let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
let dict_array = DictionaryArray::new(keys, Arc::new(values));
let boxed: ArrayRef = Arc::new(dict_array);
let col: DictionaryArray<Int8Type> = DictionaryArray::<Int8Type>::from(boxed.to_data());
let err = col.into_primitive_dict_builder::<Int32Type>();
let returned = err.unwrap_err();
let values = Int32Array::from_iter_values([10_i32, 12, 15]);
let keys = Int8Array::from_iter_values([1_i8, 0, 2, 0]);
let expected = DictionaryArray::new(keys, Arc::new(values));
assert_eq!(expected, returned);
}
#[test]
fn test_occupancy() {
let keys = Int32Array::new((100..200).collect(), None);
let values = Int32Array::from(vec![0; 1024]);
let dict = DictionaryArray::new(keys, Arc::new(values));
for (idx, v) in dict.occupancy().iter().enumerate() {
let expected = (100..200).contains(&idx);
assert_eq!(v, expected, "{idx}");
}
let keys = Int32Array::new(
(0..100).collect(),
Some((0..100).map(|x| x % 4 == 0).collect()),
);
let values = Int32Array::from(vec![0; 1024]);
let dict = DictionaryArray::new(keys, Arc::new(values));
for (idx, v) in dict.occupancy().iter().enumerate() {
let expected = idx % 4 == 0 && idx < 100;
assert_eq!(v, expected, "{idx}");
}
}
#[test]
fn test_iterator_nulls() {
let keys = Int32Array::new(
vec![0, 700, 1, 2].into(),
Some(NullBuffer::from(vec![true, false, true, true])),
);
let values = Int32Array::from(vec![Some(50), None, Some(2)]);
let dict = DictionaryArray::new(keys, Arc::new(values));
let values: Vec<_> = dict
.downcast_dict::<Int32Array>()
.unwrap()
.into_iter()
.collect();
assert_eq!(values, &[Some(50), None, None, Some(2)])
}
#[test]
fn test_normalized_keys() {
let values = vec![132, 0, 1].into();
let nulls = NullBuffer::from(vec![false, true, true]);
let keys = Int32Array::new(values, Some(nulls));
let dictionary = DictionaryArray::new(keys, Arc::new(Int32Array::new_null(2)));
assert_eq!(&dictionary.normalized_keys(), &[1, 0, 1])
}
}