| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| use std::any::Any; |
| use std::fmt; |
| use std::iter::IntoIterator; |
| use std::mem; |
| use std::{convert::From, iter::FromIterator}; |
| |
| use super::{ |
| make_array, Array, ArrayData, ArrayRef, PrimitiveArray, PrimitiveBuilder, |
| StringArray, StringBuilder, StringDictionaryBuilder, |
| }; |
| use crate::datatypes::ArrowNativeType; |
| use crate::datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType, DataType}; |
| |
| /// A dictionary array where each element is a single value indexed by an integer key. |
| /// This is mostly used to represent strings or a limited set of primitive types as integers, |
| /// for example when doing NLP analysis or representing chromosomes by name. |
| /// |
| /// Example **with nullable** data: |
| /// |
| /// ``` |
| /// use arrow::array::{DictionaryArray, Int8Array}; |
| /// use arrow::datatypes::Int8Type; |
| /// let test = vec!["a", "a", "b", "c"]; |
| /// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); |
| /// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)])); |
| /// ``` |
| /// |
| /// Example **without nullable** data: |
| /// |
| /// ``` |
| /// use arrow::array::{DictionaryArray, Int8Array}; |
| /// use arrow::datatypes::Int8Type; |
| /// let test = vec!["a", "a", "b", "c"]; |
| /// let array : DictionaryArray<Int8Type> = test.into_iter().collect(); |
| /// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2])); |
| /// ``` |
| pub struct DictionaryArray<K: ArrowPrimitiveType> { |
| /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface, |
| /// as, in the current implementation, `values` below are the first child of this struct. |
| data: ArrayData, |
| |
| /// The keys of this dictionary. These are constructed from the buffer and null bitmap |
| /// of `data`. |
| /// Also, note that these do not correspond to the true values of this array. Rather, they map |
| /// to the real values. |
| keys: PrimitiveArray<K>, |
| |
| /// Array of dictionary values (can by any DataType). |
| values: ArrayRef, |
| |
| /// Values are ordered. |
| is_ordered: bool, |
| } |
| |
| impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> { |
| /// Return an array view of the keys of this dictionary as a PrimitiveArray. |
| pub fn keys(&self) -> &PrimitiveArray<K> { |
| &self.keys |
| } |
| |
| /// Returns the lookup key by doing reverse dictionary lookup |
| pub fn lookup_key(&self, value: &str) -> Option<K::Native> { |
| let rd_buf: &StringArray = |
| self.values.as_any().downcast_ref::<StringArray>().unwrap(); |
| |
| (0..rd_buf.len()) |
| .position(|i| rd_buf.value(i) == value) |
| .map(K::Native::from_usize) |
| .flatten() |
| } |
| |
| /// Returns a reference to the dictionary values array |
| pub fn values(&self) -> &ArrayRef { |
| &self.values |
| } |
| |
| /// Returns a clone of the value type of this list. |
| pub fn value_type(&self) -> DataType { |
| self.values.data_ref().data_type().clone() |
| } |
| |
| /// The length of the dictionary is the length of the keys array. |
| pub fn len(&self) -> usize { |
| self.keys.len() |
| } |
| |
| /// Whether this dictionary is empty |
| pub fn is_empty(&self) -> bool { |
| self.keys.is_empty() |
| } |
| |
| // Currently exists for compatibility purposes with Arrow IPC. |
| pub fn is_ordered(&self) -> bool { |
| self.is_ordered |
| } |
| } |
| |
| /// Constructs a `DictionaryArray` from an array data reference. |
| impl<T: ArrowPrimitiveType> From<ArrayData> for DictionaryArray<T> { |
| fn from(data: ArrayData) -> Self { |
| assert_eq!( |
| data.buffers().len(), |
| 1, |
| "DictionaryArray data should contain a single buffer only (keys)." |
| ); |
| assert_eq!( |
| data.child_data().len(), |
| 1, |
| "DictionaryArray should contain a single child array (values)." |
| ); |
| |
| if let DataType::Dictionary(key_data_type, _) = data.data_type() { |
| if key_data_type.as_ref() != &T::DATA_TYPE { |
| panic!("DictionaryArray's data type must match.") |
| }; |
| // create a zero-copy of the keys' data |
| let keys = PrimitiveArray::<T>::from(ArrayData::new( |
| T::DATA_TYPE, |
| data.len(), |
| Some(data.null_count()), |
| data.null_buffer().cloned(), |
| data.offset(), |
| data.buffers().to_vec(), |
| vec![], |
| )); |
| let values = make_array(data.child_data()[0].clone()); |
| Self { |
| data, |
| keys, |
| values, |
| is_ordered: false, |
| } |
| } else { |
| panic!("DictionaryArray must have Dictionary data type.") |
| } |
| } |
| } |
| |
| /// Constructs a `DictionaryArray` from an iterator of optional strings. |
| impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<Option<&'a str>> |
| for DictionaryArray<T> |
| { |
| fn from_iter<I: IntoIterator<Item = Option<&'a str>>>(iter: I) -> Self { |
| let it = iter.into_iter(); |
| let (lower, _) = it.size_hint(); |
| let key_builder = PrimitiveBuilder::<T>::new(lower); |
| let value_builder = StringBuilder::new(256); |
| let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); |
| it.for_each(|i| { |
| if let Some(i) = i { |
| // Note: impl ... for Result<DictionaryArray<T>> fails with |
| // error[E0117]: only traits defined in the current crate can be implemented for arbitrary types |
| builder |
| .append(i) |
| .expect("Unable to append a value to a dictionary array."); |
| } else { |
| builder |
| .append_null() |
| .expect("Unable to append a null value to a dictionary array."); |
| } |
| }); |
| |
| builder.finish() |
| } |
| } |
| |
| /// Constructs a `DictionaryArray` from an iterator of strings. |
| impl<'a, T: ArrowPrimitiveType + ArrowDictionaryKeyType> FromIterator<&'a str> |
| for DictionaryArray<T> |
| { |
| fn from_iter<I: IntoIterator<Item = &'a str>>(iter: I) -> Self { |
| let it = iter.into_iter(); |
| let (lower, _) = it.size_hint(); |
| let key_builder = PrimitiveBuilder::<T>::new(lower); |
| let value_builder = StringBuilder::new(256); |
| let mut builder = StringDictionaryBuilder::new(key_builder, value_builder); |
| it.for_each(|i| { |
| builder |
| .append(i) |
| .expect("Unable to append a value to a dictionary array."); |
| }); |
| |
| builder.finish() |
| } |
| } |
| |
| impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> { |
| fn as_any(&self) -> &Any { |
| self |
| } |
| |
| fn data(&self) -> &ArrayData { |
| &self.data |
| } |
| |
| fn get_buffer_memory_size(&self) -> usize { |
| // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`. |
| self.data.get_buffer_memory_size() |
| } |
| |
| fn get_array_memory_size(&self) -> usize { |
| self.data.get_array_memory_size() |
| + self.keys.get_array_memory_size() |
| + self.values.get_array_memory_size() |
| + mem::size_of_val(self) |
| } |
| } |
| |
| impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> { |
| fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| writeln!( |
| f, |
| "DictionaryArray {{keys: {:?} values: {:?}}}", |
| self.keys, self.values |
| ) |
| } |
| } |
| |
| #[cfg(test)] |
| mod tests { |
| use super::*; |
| |
| use crate::{ |
| array::Int16Array, |
| datatypes::{Int32Type, Int8Type, UInt32Type, UInt8Type}, |
| }; |
| use crate::{ |
| array::Int16DictionaryArray, array::PrimitiveDictionaryBuilder, |
| datatypes::DataType, |
| }; |
| use crate::{buffer::Buffer, datatypes::ToByteSlice}; |
| |
| #[test] |
| fn test_dictionary_array() { |
| // Construct a value array |
| let value_data = ArrayData::builder(DataType::Int8) |
| .len(8) |
| .add_buffer(Buffer::from( |
| &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(), |
| )) |
| .build(); |
| |
| // Construct a buffer for value offsets, for the nested array: |
| let keys = Buffer::from(&[2_i16, 3, 4].to_byte_slice()); |
| |
| // Construct a dictionary array from the above two |
| let key_type = DataType::Int16; |
| let value_type = DataType::Int8; |
| let dict_data_type = |
| DataType::Dictionary(Box::new(key_type), Box::new(value_type)); |
| let dict_data = ArrayData::builder(dict_data_type.clone()) |
| .len(3) |
| .add_buffer(keys.clone()) |
| .add_child_data(value_data.clone()) |
| .build(); |
| let dict_array = Int16DictionaryArray::from(dict_data); |
| |
| let values = dict_array.values(); |
| assert_eq!(&value_data, values.data()); |
| assert_eq!(DataType::Int8, dict_array.value_type()); |
| assert_eq!(3, dict_array.len()); |
| |
| // Null count only makes sense in terms of the component arrays. |
| assert_eq!(0, dict_array.null_count()); |
| assert_eq!(0, dict_array.values().null_count()); |
| assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4])); |
| |
| // Now test with a non-zero offset |
| let dict_data = ArrayData::builder(dict_data_type) |
| .len(2) |
| .offset(1) |
| .add_buffer(keys) |
| .add_child_data(value_data.clone()) |
| .build(); |
| let dict_array = Int16DictionaryArray::from(dict_data); |
| |
| let values = dict_array.values(); |
| assert_eq!(&value_data, values.data()); |
| assert_eq!(DataType::Int8, dict_array.value_type()); |
| assert_eq!(2, dict_array.len()); |
| assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4])); |
| } |
| |
| #[test] |
| fn test_dictionary_array_fmt_debug() { |
| let key_builder = PrimitiveBuilder::<UInt8Type>::new(3); |
| let value_builder = PrimitiveBuilder::<UInt32Type>::new(2); |
| let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); |
| builder.append(12345678).unwrap(); |
| builder.append_null().unwrap(); |
| builder.append(22345678).unwrap(); |
| let array = builder.finish(); |
| assert_eq!( |
| "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray<UInt32>\n[\n 12345678,\n 22345678,\n]}\n", |
| format!("{:?}", array) |
| ); |
| |
| let key_builder = PrimitiveBuilder::<UInt8Type>::new(20); |
| let value_builder = PrimitiveBuilder::<UInt32Type>::new(2); |
| let mut builder = PrimitiveDictionaryBuilder::new(key_builder, value_builder); |
| for _ in 0..20 { |
| builder.append(1).unwrap(); |
| } |
| let array = builder.finish(); |
| assert_eq!( |
| "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray<UInt32>\n[\n 1,\n]}\n", |
| format!("{:?}", array) |
| ); |
| } |
| |
| #[test] |
| fn test_dictionary_array_from_iter() { |
| let test = vec!["a", "a", "b", "c"]; |
| let array: DictionaryArray<Int8Type> = test |
| .iter() |
| .map(|&x| if x == "b" { None } else { Some(x) }) |
| .collect(); |
| assert_eq!( |
| "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n", |
| format!("{:?}", array) |
| ); |
| |
| let array: DictionaryArray<Int8Type> = test.into_iter().collect(); |
| assert_eq!( |
| "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n", |
| format!("{:?}", array) |
| ); |
| } |
| |
| #[test] |
| fn test_dictionary_array_reverse_lookup_key() { |
| let test = vec!["a", "a", "b", "c"]; |
| let array: DictionaryArray<Int8Type> = test.into_iter().collect(); |
| |
| assert_eq!(array.lookup_key("c"), Some(2)); |
| |
| // Direction of building a dictionary is the iterator direction |
| let test = vec!["t3", "t3", "t2", "t2", "t1", "t3", "t4", "t1", "t0"]; |
| let array: DictionaryArray<Int8Type> = test.into_iter().collect(); |
| |
| assert_eq!(array.lookup_key("t1"), Some(2)); |
| assert_eq!(array.lookup_key("non-existent"), None); |
| } |
| |
| #[test] |
| fn test_dictionary_keys_as_primitive_array() { |
| let test = vec!["a", "b", "c", "a"]; |
| let array: DictionaryArray<Int8Type> = test.into_iter().collect(); |
| |
| let keys = array.keys(); |
| assert_eq!(&DataType::Int8, keys.data_type()); |
| assert_eq!(0, keys.null_count()); |
| assert_eq!(&[0, 1, 2, 0], keys.values()); |
| } |
| |
| #[test] |
| fn test_dictionary_keys_as_primitive_array_with_null() { |
| let test = vec![Some("a"), None, Some("b"), None, None, Some("a")]; |
| let array: DictionaryArray<Int32Type> = test.into_iter().collect(); |
| |
| let keys = array.keys(); |
| assert_eq!(&DataType::Int32, keys.data_type()); |
| assert_eq!(3, keys.null_count()); |
| |
| assert_eq!(true, keys.is_valid(0)); |
| assert_eq!(false, keys.is_valid(1)); |
| assert_eq!(true, keys.is_valid(2)); |
| assert_eq!(false, keys.is_valid(3)); |
| assert_eq!(false, keys.is_valid(4)); |
| assert_eq!(true, keys.is_valid(5)); |
| |
| assert_eq!(0, keys.value(0)); |
| assert_eq!(1, keys.value(2)); |
| assert_eq!(0, keys.value(5)); |
| } |
| } |