// blob: d9956b89687602af5d41bff97ce6e10873bbc080
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//! Defines a kernel that extracts a substring from each element of a \[Large\]StringArray
use crate::{array::*, buffer::Buffer};
use crate::{
datatypes::DataType,
error::{ArrowError, Result},
};
/// Builds a new string array that holds, for every entry of `array`, the byte range
/// selected by `start` and `length`.
///
/// * `start` - position at which each slice begins. A non-negative value counts from
///   the beginning of the entry, a negative value counts back from its end. The
///   position is clamped to the bounds of the entry.
/// * `length` - maximum number of bytes to keep; `None` keeps everything up to the end
///   of the entry. Callers must pass a non-negative value.
///
/// Positions are byte offsets into the values buffer; whether a cut lands on a UTF-8
/// character boundary is not checked here.
///
/// The validity bitmap of the input is carried over, so null entries stay null.
// This helper never returns `Err`; it yields a `Result` so that `substring` can forward
// its value directly next to the error arm for unsupported types.
#[allow(clippy::unnecessary_wraps)]
fn generic_substring<OffsetSize: StringOffsetSizeTrait>(
    array: &GenericStringArray<OffsetSize>,
    start: OffsetSize,
    length: &Option<OffsetSize>,
) -> Result<ArrayRef> {
    let zero = OffsetSize::zero();
    // entries of this array begin at `array.offset()` inside its buffers
    let array_offset = array.offset();
    let len = array.len();

    // current offsets, borrowed directly from the array (no copy needed)
    let offsets_buffer = &array.data_ref().buffers()[0];
    // SAFETY: buffer 0 of a string array stores its offsets as `OffsetSize` values.
    let offsets: &[OffsetSize] = unsafe { offsets_buffer.typed_data::<OffsetSize>() };

    // validity bitmap, re-aligned so that bit 0 belongs to the first visible entry
    let null_bit_buffer = array.data_ref().null_buffer().map(|bitmap| {
        if array_offset == 0 {
            bitmap.clone()
        } else {
            bitmap.bit_slice(array_offset, len)
        }
    });

    // raw bytes of all the strings
    let data = array.data_ref().buffers()[1].as_slice();

    let mut new_values = Vec::new(); // we have no way to estimate how much this will be.
    let mut new_offsets: Vec<OffsetSize> = Vec::with_capacity(len + 1);
    let mut length_so_far = zero;
    new_offsets.push(length_so_far);

    for i in array_offset..array_offset + len {
        let entry_start = offsets[i];
        let entry_end = offsets[i + 1];
        // the length of this entry
        let entry_len = entry_end - entry_start;

        // Resolve `start` relative to the entry and clamp it to `[0, entry_len]`
        // *before* adding the entry's offset, so that the sum can never overflow.
        let relative_start = if start >= zero {
            start.min(entry_len)
        } else {
            (entry_len + start).max(zero)
        };
        let slice_start = entry_start + relative_start;

        // compute the length of the slice
        let slice_len = length
            .unwrap_or(entry_len)
            // .max(0) is not needed as the caller guarantees a non-negative length
            .min(entry_end - slice_start); // so we do not go beyond this entry

        length_so_far += slice_len;
        new_offsets.push(length_so_far);

        // we need usize for ranges
        let begin = slice_start
            .to_usize()
            .expect("string offsets are non-negative");
        let count = slice_len
            .to_usize()
            .expect("substring length is non-negative");
        new_values.extend_from_slice(&data[begin..begin + count]);
    }

    let data = ArrayData::new(
        <OffsetSize as StringOffsetSizeTrait>::DATA_TYPE,
        len,
        None,
        null_bit_buffer,
        0,
        vec![
            Buffer::from_slice_ref(&new_offsets),
            Buffer::from_slice_ref(&new_values),
        ],
        vec![],
    );
    Ok(make_array(data))
}

/// Returns an ArrayRef with a substring starting from `start` and with optional length
/// `length` of each of the elements in `array`.
///
/// `start` can be negative, in which case the start counts from the end of the string.
/// `start` and `length` are measured in bytes of the underlying string data. Values that
/// do not fit the offset type of the array are saturated, which selects the same bytes
/// as the unbounded value would because no entry can be longer than the largest offset.
///
/// # Errors
///
/// This function errors when the passed array is not a \[Large\]String array.
pub fn substring(array: &Array, start: i64, length: &Option<u64>) -> Result<ArrayRef> {
    match array.data_type() {
        DataType::LargeUtf8 => generic_substring(
            array
                .as_any()
                .downcast_ref::<LargeStringArray>()
                .expect("A large string is expected"),
            start,
            // saturate instead of wrapping into a negative length
            &length.map(|e| e.min(i64::MAX as u64) as i64),
        ),
        DataType::Utf8 => generic_substring(
            array
                .as_any()
                .downcast_ref::<StringArray>()
                .expect("A string is expected"),
            // saturate instead of truncating to an unrelated 32-bit value
            start.max(i32::MIN as i64).min(i32::MAX as i64) as i32,
            &length.map(|e| e.min(i32::MAX as u64) as i32),
        ),
        _ => Err(ArrowError::ComputeError(format!(
            "substring does not support type {:?}",
            array.data_type()
        ))),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `substring` over arrays that contain a null entry and checks that the
    /// result has the expected values, for the string array type `T`.
    fn with_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        // each case is (input, start, length, expected output)
        let cases = vec![
            // identity
            (
                vec![Some("hello"), None, Some("word")],
                0,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // 0 length -> Nothing
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // high start -> Nothing
            (
                vec![Some("hello"), None, Some("word")],
                1000,
                Some(0),
                vec![Some(""), None, Some("")],
            ),
            // high negative start -> identity
            (
                vec![Some("hello"), None, Some("word")],
                -1000,
                None,
                vec![Some("hello"), None, Some("word")],
            ),
            // high length -> identity
            (
                vec![Some("hello"), None, Some("word")],
                0,
                Some(1000),
                vec![Some("hello"), None, Some("word")],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                let array = T::from(array);
                let result: ArrayRef = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());

                let result = result.as_any().downcast_ref::<T>().unwrap();
                let expected = T::from(expected);
                assert_eq!(&expected, result);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn with_nulls_string() -> Result<()> {
        with_nulls::<StringArray>()
    }

    #[test]
    fn with_nulls_large_string() -> Result<()> {
        with_nulls::<LargeStringArray>()
    }

    /// Runs `substring` over arrays without null entries for the string array type `T`.
    ///
    /// The arrays are built and downcast through `T`, so that instantiating this helper
    /// with `LargeStringArray` really exercises the large-offset code path.
    fn without_nulls<T: 'static + Array + PartialEq + From<Vec<Option<&'static str>>>>(
    ) -> Result<()> {
        // each case is (input, start, length, expected output)
        let cases = vec![
            // increase start
            (
                vec!["hello", "", "word"],
                0,
                None,
                vec!["hello", "", "word"],
            ),
            (vec!["hello", "", "word"], 1, None, vec!["ello", "", "ord"]),
            (vec!["hello", "", "word"], 2, None, vec!["llo", "", "rd"]),
            (vec!["hello", "", "word"], 3, None, vec!["lo", "", "d"]),
            (vec!["hello", "", "word"], 10, None, vec!["", "", ""]),
            // increase start negatively
            (vec!["hello", "", "word"], -1, None, vec!["o", "", "d"]),
            (vec!["hello", "", "word"], -2, None, vec!["lo", "", "rd"]),
            (vec!["hello", "", "word"], -3, None, vec!["llo", "", "ord"]),
            (
                vec!["hello", "", "word"],
                -10,
                None,
                vec!["hello", "", "word"],
            ),
            // increase length
            (vec!["hello", "", "word"], 1, Some(1), vec!["e", "", "o"]),
            (vec!["hello", "", "word"], 1, Some(2), vec!["el", "", "or"]),
            (
                vec!["hello", "", "word"],
                1,
                Some(3),
                vec!["ell", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                1,
                Some(4),
                vec!["ello", "", "ord"],
            ),
            (vec!["hello", "", "word"], -3, Some(1), vec!["l", "", "o"]),
            (vec!["hello", "", "word"], -3, Some(2), vec!["ll", "", "or"]),
            (
                vec!["hello", "", "word"],
                -3,
                Some(3),
                vec!["llo", "", "ord"],
            ),
            (
                vec!["hello", "", "word"],
                -3,
                Some(4),
                vec!["llo", "", "ord"],
            ),
        ];

        cases.into_iter().try_for_each::<_, Result<()>>(
            |(array, start, length, expected)| {
                // every value is wrapped in `Some`, so the arrays hold no null entries
                let array = T::from(array.into_iter().map(Some).collect::<Vec<_>>());
                let result = substring(&array, start, &length)?;
                assert_eq!(array.len(), result.len());

                let result = result.as_any().downcast_ref::<T>().unwrap();
                let expected =
                    T::from(expected.into_iter().map(Some).collect::<Vec<_>>());
                assert_eq!(&expected, result);
                Ok(())
            },
        )?;

        Ok(())
    }

    #[test]
    fn without_nulls_string() -> Result<()> {
        without_nulls::<StringArray>()
    }

    #[test]
    fn without_nulls_large_string() -> Result<()> {
        without_nulls::<LargeStringArray>()
    }
}