| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| //! Defines concat kernel for `ArrayRef` |
| //! |
| //! Example: |
| //! |
| //! ``` |
| //! use arrow::array::{ArrayRef, StringArray}; |
| //! use arrow::compute::concat; |
| //! |
| //! let arr = concat(&[ |
| //! &StringArray::from(vec!["hello", "world"]), |
| //! &StringArray::from(vec!["!"]), |
| //! ]).unwrap(); |
| //! assert_eq!(arr.len(), 3); |
| //! ``` |
| |
| use crate::array::*; |
| use crate::datatypes::DataType; |
| use crate::error::{ArrowError, Result}; |
| |
| fn compute_str_values_length<Offset: StringOffsetSizeTrait>( |
| arrays: &[&ArrayData], |
| ) -> usize { |
| arrays |
| .iter() |
| .map(|&data| { |
| // get the length of the value buffer |
| let buf_len = data.buffers()[1].len(); |
| // find the offset of the buffer |
| // this returns a slice of offsets, starting from the offset of the array |
| // so we can take the first value |
| let offset = data.buffer::<Offset>(0)[0]; |
| buf_len - offset.to_usize().unwrap() |
| }) |
| .sum() |
| } |
| |
| /// Concatenate multiple [Array] of the same type into a single [ArrayRef]. |
| pub fn concat(arrays: &[&Array]) -> Result<ArrayRef> { |
| if arrays.is_empty() { |
| return Err(ArrowError::ComputeError( |
| "concat requires input of at least one array".to_string(), |
| )); |
| } |
| |
| if arrays |
| .iter() |
| .any(|array| array.data_type() != arrays[0].data_type()) |
| { |
| return Err(ArrowError::InvalidArgumentError( |
| "It is not possible to concatenate arrays of different data types." |
| .to_string(), |
| )); |
| } |
| |
| let lengths = arrays.iter().map(|array| array.len()).collect::<Vec<_>>(); |
| let capacity = lengths.iter().sum(); |
| |
| let arrays = arrays.iter().map(|a| a.data()).collect::<Vec<_>>(); |
| |
| let mut mutable = match arrays[0].data_type() { |
| DataType::Utf8 => { |
| let str_values_size = compute_str_values_length::<i32>(&arrays); |
| MutableArrayData::with_capacities( |
| arrays, |
| false, |
| Capacities::Binary(capacity, Some(str_values_size)), |
| ) |
| } |
| DataType::LargeUtf8 => { |
| let str_values_size = compute_str_values_length::<i64>(&arrays); |
| MutableArrayData::with_capacities( |
| arrays, |
| false, |
| Capacities::Binary(capacity, Some(str_values_size)), |
| ) |
| } |
| _ => MutableArrayData::new(arrays, false, capacity), |
| }; |
| |
| for (i, len) in lengths.iter().enumerate() { |
| mutable.extend(i, 0, *len) |
| } |
| |
| Ok(make_array(mutable.freeze())) |
| } |
| |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::datatypes::*;
    use std::sync::Arc;

    /// Builds an `Int64` primitive array from optional values.
    fn int64_array(values: Vec<Option<i64>>) -> PrimitiveArray<Int64Type> {
        PrimitiveArray::<Int64Type>::from(values)
    }

    /// Builds a struct array with a single `Int64` column described by `field`.
    fn int64_struct(field: Field, values: Vec<Option<i64>>) -> StructArray {
        let column: ArrayRef = Arc::new(int64_array(values));
        StructArray::from(vec![(field, column)])
    }

    /// An empty input list must be rejected.
    #[test]
    fn test_concat_empty_vec() {
        assert!(concat(&[]).is_err());
    }

    /// Arrays of different data types must be rejected.
    #[test]
    fn test_concat_incompatible_datatypes() {
        let ints = int64_array(vec![Some(-1), Some(2), None]);
        let strings = StringArray::from(vec![Some("hello"), Some("bar"), Some("world")]);

        let outcome = concat(&[&ints, &strings]);
        assert!(outcome.is_err());
    }

    /// String arrays with and without nulls are appended in order.
    #[test]
    fn test_concat_string_arrays() -> Result<()> {
        let first = StringArray::from(vec!["hello", "world"]);
        let second = StringArray::from(vec!["2", "3", "4"]);
        let third = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);

        let actual = concat(&[&first, &second, &third])?;

        let expected: ArrayRef = Arc::new(StringArray::from(vec![
            Some("hello"),
            Some("world"),
            Some("2"),
            Some("3"),
            Some("4"),
            Some("foo"),
            Some("bar"),
            None,
            Some("baz"),
        ]));

        assert_eq!(&actual, &expected);
        Ok(())
    }

    /// Primitive arrays, including their nulls, are appended in order.
    #[test]
    fn test_concat_primitive_arrays() -> Result<()> {
        let first = int64_array(vec![Some(-1), Some(-1), Some(2), None, None]);
        let second = int64_array(vec![Some(101), Some(102), Some(103), None]);
        let third = int64_array(vec![Some(256), Some(512), Some(1024)]);

        let actual = concat(&[&first, &second, &third])?;

        let expected: ArrayRef = Arc::new(int64_array(vec![
            Some(-1),
            Some(-1),
            Some(2),
            None,
            None,
            Some(101),
            Some(102),
            Some(103),
            None,
            Some(256),
            Some(512),
            Some(1024),
        ]));

        assert_eq!(&actual, &expected);
        Ok(())
    }

    /// Only the sliced window of each primitive array is concatenated.
    #[test]
    fn test_concat_primitive_array_slices() -> Result<()> {
        let left = int64_array(vec![Some(-1), Some(-1), Some(2), None, None]).slice(1, 3);
        let right = int64_array(vec![Some(101), Some(102), Some(103), None]).slice(1, 3);

        let actual = concat(&[left.as_ref(), right.as_ref()])?;

        let expected: ArrayRef = Arc::new(int64_array(vec![
            Some(-1),
            Some(2),
            None,
            Some(102),
            Some(103),
            None,
        ]));

        assert_eq!(&actual, &expected);
        Ok(())
    }

    /// Boolean arrays, including their nulls, are appended in order.
    #[test]
    fn test_concat_boolean_primitive_arrays() -> Result<()> {
        let first = BooleanArray::from(vec![
            Some(true),
            Some(true),
            Some(false),
            None,
            None,
            Some(false),
        ]);
        let second = BooleanArray::from(vec![None, Some(false), Some(true), Some(false)]);

        let actual = concat(&[&first, &second])?;

        let expected: ArrayRef = Arc::new(BooleanArray::from(vec![
            Some(true),
            Some(true),
            Some(false),
            None,
            None,
            Some(false),
            None,
            Some(false),
            Some(true),
            Some(false),
        ]));

        assert_eq!(&actual, &expected);
        Ok(())
    }

    /// List arrays concatenate to the list built from the chained inputs.
    #[test]
    fn test_concat_primitive_list_arrays() -> Result<()> {
        let rows_a = vec![
            Some(vec![Some(-1), Some(-1), Some(2), None, None]),
            Some(vec![]),
            None,
            Some(vec![Some(10)]),
        ];
        let rows_b = vec![
            None,
            Some(vec![Some(100), None, Some(101)]),
            Some(vec![Some(102)]),
        ];
        let rows_c = vec![Some(vec![Some(1000), Some(1001)])];

        let array_a = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_a.clone());
        let array_b = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_b.clone());
        let array_c = ListArray::from_iter_primitive::<Int64Type, _, _>(rows_c.clone());

        let actual = concat(&[&array_a, &array_b, &array_c])?;

        let all_rows = rows_a
            .into_iter()
            .chain(rows_b.into_iter())
            .chain(rows_c.into_iter());
        let expected = ListArray::from_iter_primitive::<Int64Type, _, _>(all_rows);

        assert_eq!(actual.as_ref(), &expected as &dyn Array);
        Ok(())
    }

    /// The child column of concatenated struct arrays holds all child values.
    #[test]
    fn test_concat_struct_arrays() -> Result<()> {
        let field = Field::new("field", DataType::Int64, true);

        let struct_1 = int64_struct(
            field.clone(),
            vec![Some(-1), Some(-1), Some(2), None, None],
        );
        let struct_2 =
            int64_struct(field.clone(), vec![Some(101), Some(102), Some(103), None]);
        let struct_3 = int64_struct(field, vec![Some(256), Some(512), Some(1024)]);

        let actual = concat(&[&struct_1, &struct_2, &struct_3])?;

        let expected_column: ArrayRef = Arc::new(int64_array(vec![
            Some(-1),
            Some(-1),
            Some(2),
            None,
            None,
            Some(101),
            Some(102),
            Some(103),
            None,
            Some(256),
            Some(512),
            Some(1024),
        ]));

        let actual_column = actual
            .as_any()
            .downcast_ref::<StructArray>()
            .unwrap()
            .column(0);
        assert_eq!(actual_column, &expected_column);
        Ok(())
    }

    /// Only the sliced window of each struct array is concatenated.
    #[test]
    fn test_concat_struct_array_slices() -> Result<()> {
        let field = Field::new("field", DataType::Int64, true);

        let struct_1 = int64_struct(
            field.clone(),
            vec![Some(-1), Some(-1), Some(2), None, None],
        );
        let struct_2 = int64_struct(field, vec![Some(101), Some(102), Some(103), None]);

        let actual = concat(&[
            struct_1.slice(1, 3).as_ref(),
            struct_2.slice(1, 2).as_ref(),
        ])?;

        let expected_column: ArrayRef = Arc::new(int64_array(vec![
            Some(-1),
            Some(2),
            None,
            Some(102),
            Some(103),
        ]));

        let actual_column = actual
            .as_any()
            .downcast_ref::<StructArray>()
            .unwrap()
            .column(0);
        assert_eq!(actual_column, &expected_column);
        Ok(())
    }

    /// Only the sliced window of each string array is concatenated.
    #[test]
    fn test_string_array_slices() -> Result<()> {
        let left = StringArray::from(vec!["hello", "A", "B", "C"]);
        let right = StringArray::from(vec!["world", "D", "E", "Z"]);

        let merged = concat(&[left.slice(1, 3).as_ref(), right.slice(1, 2).as_ref()])?;

        let expected = StringArray::from(vec!["A", "B", "C", "D", "E"]);

        let merged_strings = merged.as_any().downcast_ref::<StringArray>().unwrap();
        assert_eq!(merged_strings, &expected);
        Ok(())
    }

    /// Nulls inside sliced string arrays are carried over to the output.
    #[test]
    fn test_string_array_with_null_slices() -> Result<()> {
        let left = StringArray::from(vec![Some("hello"), None, Some("A"), Some("C")]);
        let right = StringArray::from(vec![None, Some("world"), Some("D"), None]);

        let merged = concat(&[left.slice(1, 3).as_ref(), right.slice(1, 2).as_ref()])?;

        let expected =
            StringArray::from(vec![None, Some("A"), Some("C"), Some("world"), Some("D")]);

        let merged_strings = merged.as_any().downcast_ref::<StringArray>().unwrap();
        assert_eq!(merged_strings, &expected);
        Ok(())
    }

    /// Resolves every key of a string dictionary to the string it refers to,
    /// keeping null keys as `None`.
    fn collect_string_dictionary(
        dictionary: &DictionaryArray<Int32Type>,
    ) -> Vec<Option<String>> {
        let values = dictionary.values();
        let strings = values.as_any().downcast_ref::<StringArray>().unwrap();

        dictionary
            .keys()
            .iter()
            .map(|key| key.map(|index| strings.value(index as usize).to_string()))
            .collect()
    }

    /// Concatenates two string dictionaries and returns the resolved strings
    /// of the result.
    fn concat_dictionary(
        input_1: DictionaryArray<Int32Type>,
        input_2: DictionaryArray<Int32Type>,
    ) -> Vec<Option<String>> {
        let merged = concat(&[&input_1 as _, &input_2 as _]).unwrap();
        let merged_dictionary = merged
            .as_any()
            .downcast_ref::<DictionaryArray<Int32Type>>()
            .unwrap();

        collect_string_dictionary(merged_dictionary)
    }

    /// Dictionary arrays with overlapping values resolve to the chained input.
    #[test]
    fn test_string_dictionary_array() {
        let input_1: DictionaryArray<Int32Type> =
            vec!["hello", "A", "B", "hello", "hello", "C"]
                .into_iter()
                .collect();
        let input_2: DictionaryArray<Int32Type> =
            vec!["hello", "E", "E", "hello", "F", "E"]
                .into_iter()
                .collect();

        let expected: Vec<_> = vec![
            "hello", "A", "B", "hello", "hello", "C", "hello", "E", "E", "hello", "F",
            "E",
        ]
        .into_iter()
        .map(|text| Some(text.to_string()))
        .collect();

        let actual = concat_dictionary(input_1, input_2);
        assert_eq!(actual, expected);
    }

    /// Null keys of dictionary arrays stay null after concatenation.
    #[test]
    fn test_string_dictionary_array_nulls() {
        let input_1: DictionaryArray<Int32Type> =
            vec![Some("foo"), Some("bar"), None, Some("fiz")]
                .into_iter()
                .collect();
        let input_2: DictionaryArray<Int32Type> = vec![None].into_iter().collect();

        let expected = vec![
            Some("foo".to_string()),
            Some("bar".to_string()),
            None,
            Some("fiz".to_string()),
            None,
        ];

        let actual = concat_dictionary(input_1, input_2);
        assert_eq!(actual, expected);
    }

    /// The values buffer of concatenated large string arrays is sized from
    /// the precomputed value lengths.
    #[test]
    fn test_concat_string_sizes() -> Result<()> {
        let a: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
        let b: LargeStringArray = ((0..150).map(|_| Some("foo"))).collect();
        let c = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
        // a: 150 * 3 = 450
        // b: 150 * 3 = 450
        // c:   3 * 3 =   9
        // total      = 909
        // closest 64 byte aligned cap = 960

        let merged = concat(&[&a, &b, &c])?;
        // this would have been 1280 if we did not precompute the value lengths.
        assert_eq!(merged.data().buffers()[1].capacity(), 960);

        Ok(())
    }
}