blob: cfbda38c0b286f393476dcc464593039fd566e6b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use super::PrimitiveBuilder;
use crate::array::{
Array, ArrayBuilder, ArrayRef, DictionaryArray, StringArray, StringBuilder,
};
use crate::datatypes::{ArrowDictionaryKeyType, ArrowNativeType, DataType};
use crate::error::{ArrowError, Result};
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use std::any::Any;
use std::sync::Arc;
/// Array builder for `DictionaryArray` that stores Strings. For example to map a set of byte indices
/// to String values. Note that the use of a `HashMap` here will not scale to very large
/// arrays or result in an ordered dictionary.
///
/// ```
/// use arrow::{
/// array::{
/// Int8Array, StringArray,
/// PrimitiveBuilder, StringBuilder, StringDictionaryBuilder,
/// },
/// datatypes::Int8Type,
/// };
///
/// // Create a dictionary array indexed by bytes whose values are Strings.
/// // It can thus hold up to 256 distinct string values.
///
/// let key_builder = PrimitiveBuilder::<Int8Type>::new(100);
/// let value_builder = StringBuilder::new(100);
/// let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
///
/// // The builder builds the dictionary value by value
/// builder.append("abc").unwrap();
/// builder.append_null();
/// builder.append("def").unwrap();
/// builder.append("def").unwrap();
/// builder.append("abc").unwrap();
/// let array = builder.finish();
///
/// assert_eq!(
/// array.keys(),
/// &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
/// );
///
/// // Values are polymorphic and so require a downcast.
/// let av = array.values();
/// let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
///
/// assert_eq!(ava.value(0), "abc");
/// assert_eq!(ava.value(1), "def");
///
/// ```
#[derive(Debug)]
pub struct StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
state: ahash::RandomState,
/// Used to provide a lookup from string value to key type
///
/// Note: K's hash implementation is not used, instead the raw entry
/// API is used to store keys w.r.t the hash of the strings themselves
///
dedup: HashMap<K::Native, (), ()>,
keys_builder: PrimitiveBuilder<K>,
values_builder: StringBuilder,
}
impl<K> StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
/// Creates a new `StringDictionaryBuilder` from a keys builder and a value builder.
pub fn new(keys_builder: PrimitiveBuilder<K>, values_builder: StringBuilder) -> Self {
Self {
state: Default::default(),
dedup: HashMap::with_capacity_and_hasher(keys_builder.capacity(), ()),
keys_builder,
values_builder,
}
}
/// Creates a new `StringDictionaryBuilder` from a keys builder and a dictionary
/// which is initialized with the given values.
/// The indices of those dictionary values are used as keys.
///
/// # Example
///
/// ```
/// use arrow::datatypes::Int16Type;
/// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array};
/// use std::convert::TryFrom;
///
/// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
///
/// let mut builder = StringDictionaryBuilder::new_with_dictionary(PrimitiveBuilder::<Int16Type>::new(3), &dictionary_values).unwrap();
/// builder.append("def").unwrap();
/// builder.append_null();
/// builder.append("abc").unwrap();
///
/// let dictionary_array = builder.finish();
///
/// let keys = dictionary_array.keys();
///
/// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
/// ```
pub fn new_with_dictionary(
keys_builder: PrimitiveBuilder<K>,
dictionary_values: &StringArray,
) -> Result<Self> {
let state = ahash::RandomState::default();
let dict_len = dictionary_values.len();
let mut dedup = HashMap::with_capacity_and_hasher(dict_len, ());
let values_len = dictionary_values.value_data().len();
let mut values_builder = StringBuilder::with_capacity(dict_len, values_len);
for (idx, maybe_value) in dictionary_values.iter().enumerate() {
match maybe_value {
Some(value) => {
let hash = compute_hash(&state, value.as_bytes());
let key = K::Native::from_usize(idx)
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
let entry =
dedup.raw_entry_mut().from_hash(hash, |key: &K::Native| {
value.as_bytes() == get_bytes(&values_builder, key)
});
if let RawEntryMut::Vacant(v) = entry {
v.insert_with_hasher(hash, key, (), |key| {
compute_hash(&state, get_bytes(&values_builder, key))
});
}
values_builder.append_value(value);
}
None => values_builder.append_null(),
}
}
Ok(Self {
state,
dedup,
keys_builder,
values_builder,
})
}
}
impl<K> ArrayBuilder for StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
/// Returns the builder as an non-mutable `Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
/// Returns the builder as an mutable `Any` reference.
fn as_any_mut(&mut self) -> &mut dyn Any {
self
}
/// Returns the boxed builder as a box of `Any`.
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
self
}
/// Returns the number of array slots in the builder
fn len(&self) -> usize {
self.keys_builder.len()
}
/// Returns whether the number of array slots is zero
fn is_empty(&self) -> bool {
self.keys_builder.is_empty()
}
/// Builds the array and reset this builder.
fn finish(&mut self) -> ArrayRef {
Arc::new(self.finish())
}
}
impl<K> StringDictionaryBuilder<K>
where
K: ArrowDictionaryKeyType,
{
/// Append a primitive value to the array. Return an existing index
/// if already present in the values array or a new index if the
/// value is appended to the values array.
///
/// Returns an error if the new index would overflow the key type.
pub fn append(&mut self, value: impl AsRef<str>) -> Result<K::Native> {
let value = value.as_ref();
let state = &self.state;
let storage = &mut self.values_builder;
let hash = compute_hash(state, value.as_bytes());
let entry = self
.dedup
.raw_entry_mut()
.from_hash(hash, |key| value.as_bytes() == get_bytes(storage, key));
let key = match entry {
RawEntryMut::Occupied(entry) => *entry.into_key(),
RawEntryMut::Vacant(entry) => {
let index = storage.len();
storage.append_value(value);
let key = K::Native::from_usize(index)
.ok_or(ArrowError::DictionaryKeyOverflowError)?;
*entry
.insert_with_hasher(hash, key, (), |key| {
compute_hash(state, get_bytes(storage, key))
})
.0
}
};
self.keys_builder.append_value(key);
Ok(key)
}
#[inline]
pub fn append_null(&mut self) {
self.keys_builder.append_null()
}
/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
let values = self.values_builder.finish();
let keys = self.keys_builder.finish();
let data_type =
DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(DataType::Utf8));
let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);
DictionaryArray::from(unsafe { builder.build_unchecked() })
}
}
fn compute_hash(hasher: &ahash::RandomState, value: &[u8]) -> u64 {
use std::hash::{BuildHasher, Hash, Hasher};
let mut state = hasher.build_hasher();
value.hash(&mut state);
state.finish()
}
fn get_bytes<'a, K: ArrowNativeType>(values: &'a StringBuilder, key: &K) -> &'a [u8] {
let offsets = values.offsets_slice();
let values = values.values_slice();
let idx = key.to_usize().unwrap();
let end_offset = offsets[idx + 1].to_usize().unwrap();
let start_offset = offsets[idx].to_usize().unwrap();
&values[start_offset..end_offset]
}
#[cfg(test)]
mod tests {
use super::*;
use crate::array::Array;
use crate::array::Int8Array;
use crate::datatypes::Int16Type;
use crate::datatypes::Int8Type;
#[test]
fn test_string_dictionary_builder() {
let key_builder = PrimitiveBuilder::<Int8Type>::new(5);
let value_builder = StringBuilder::new(2);
let mut builder = StringDictionaryBuilder::new(key_builder, value_builder);
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
builder.append("def").unwrap();
builder.append("abc").unwrap();
let array = builder.finish();
assert_eq!(
array.keys(),
&Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
);
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
assert_eq!(ava.value(0), "abc");
assert_eq!(ava.value(1), "def");
}
#[test]
fn test_string_dictionary_builder_with_existing_dictionary() {
let dictionary = StringArray::from(vec![None, Some("def"), Some("abc")]);
let key_builder = PrimitiveBuilder::<Int8Type>::new(6);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
.unwrap();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
builder.append("def").unwrap();
builder.append("abc").unwrap();
builder.append("ghi").unwrap();
let array = builder.finish();
assert_eq!(
array.keys(),
&Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
);
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
assert!(!ava.is_valid(0));
assert_eq!(ava.value(1), "def");
assert_eq!(ava.value(2), "abc");
assert_eq!(ava.value(3), "ghi");
}
#[test]
fn test_string_dictionary_builder_with_reserved_null_value() {
let dictionary: Vec<Option<&str>> = vec![None];
let dictionary = StringArray::from(dictionary);
let key_builder = PrimitiveBuilder::<Int16Type>::new(4);
let mut builder =
StringDictionaryBuilder::new_with_dictionary(key_builder, &dictionary)
.unwrap();
builder.append("abc").unwrap();
builder.append_null();
builder.append("def").unwrap();
builder.append("abc").unwrap();
let array = builder.finish();
assert!(array.is_null(1));
assert!(!array.is_valid(1));
let keys = array.keys();
assert_eq!(keys.value(0), 1);
assert!(keys.is_null(1));
// zero initialization is currently guaranteed by Buffer allocation and resizing
assert_eq!(keys.value(1), 0);
assert_eq!(keys.value(2), 2);
assert_eq!(keys.value(3), 1);
}
}