feat: support `Utf8View` type in `starts_with` function (#11787)
* feat: support `Utf8View` for `starts_with`
* style: clippy
* simplify string view handling
* fix: allow utf8 and largeutf8 to be cast into utf8view
* fix: fix test
* Apply suggestions from code review
Co-authored-by: Yongting You <2010youy01@gmail.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
* style: fix format
* feat: add addiontal tests
* tests: improve tests
* fix: fix null case
* tests: one more null test
* Test comments and execution tests
---------
Co-authored-by: Yongting You <2010youy01@gmail.com>
Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs
index 676903d..9faeb8a 100644
--- a/datafusion/expr/src/expr_schema.rs
+++ b/datafusion/expr/src/expr_schema.rs
@@ -148,6 +148,7 @@
.iter()
.map(|e| e.get_type(schema))
.collect::<Result<Vec<_>>>()?;
+
// verify that function is invoked with correct number and type of arguments as defined in `TypeSignature`
data_types_with_scalar_udf(&arg_data_types, func).map_err(|err| {
plan_datafusion_err!(
diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs
index 66807c3..4f27765 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -583,6 +583,10 @@
(Interval(_), _) if matches!(type_from, Utf8 | LargeUtf8) => {
Some(type_into.clone())
}
+ // We can go into a Utf8View from a Utf8 or LargeUtf8
+ (Utf8View, _) if matches!(type_from, Utf8 | LargeUtf8 | Null) => {
+ Some(type_into.clone())
+ }
// Any type can be coerced into strings
(Utf8 | LargeUtf8, _) => Some(type_into.clone()),
(Null, _) if can_cast_types(type_from, type_into) => Some(type_into.clone()),
@@ -647,6 +651,18 @@
use arrow::datatypes::Field;
#[test]
+ fn test_string_conversion() {
+ let cases = vec![
+ (DataType::Utf8View, DataType::Utf8, true),
+ (DataType::Utf8View, DataType::LargeUtf8, true),
+ ];
+
+ for case in cases {
+ assert_eq!(can_coerce_from(&case.0, &case.1), case.2);
+ }
+ }
+
+ #[test]
fn test_maybe_data_types() {
// this vec contains: arg1, arg2, expected result
let cases = vec![
diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs
index 05bd960..8450697 100644
--- a/datafusion/functions/src/string/starts_with.rs
+++ b/datafusion/functions/src/string/starts_with.rs
@@ -18,10 +18,10 @@
use std::any::Any;
use std::sync::Arc;
-use arrow::array::{ArrayRef, OffsetSizeTrait};
+use arrow::array::ArrayRef;
use arrow::datatypes::DataType;
-use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
+use datafusion_common::{internal_err, Result};
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
@@ -30,12 +30,8 @@
/// Returns true if string starts with prefix.
/// starts_with('alphabet', 'alph') = 't'
-pub fn starts_with<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let left = as_generic_string_array::<T>(&args[0])?;
- let right = as_generic_string_array::<T>(&args[1])?;
-
- let result = arrow::compute::kernels::comparison::starts_with(left, right)?;
-
+pub fn starts_with(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let result = arrow::compute::kernels::comparison::starts_with(&args[0], &args[1])?;
Ok(Arc::new(result) as ArrayRef)
}
@@ -52,14 +48,15 @@
impl StartsWithFunc {
pub fn new() -> Self {
- use DataType::*;
Self {
signature: Signature::one_of(
vec![
- Exact(vec![Utf8, Utf8]),
- Exact(vec![Utf8, LargeUtf8]),
- Exact(vec![LargeUtf8, Utf8]),
- Exact(vec![LargeUtf8, LargeUtf8]),
+ // Planner attempts coercion to the target type starting with the most preferred candidate.
+ // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
+ // If that fails, it proceeds to `(Utf8, Utf8)`.
+ Exact(vec![DataType::Utf8View, DataType::Utf8View]),
+ Exact(vec![DataType::Utf8, DataType::Utf8]),
+ Exact(vec![DataType::LargeUtf8, DataType::LargeUtf8]),
],
Volatility::Immutable,
),
@@ -81,18 +78,73 @@
}
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
- use DataType::*;
-
- Ok(Boolean)
+ Ok(DataType::Boolean)
}
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(starts_with::<i32>, vec![])(args),
- DataType::LargeUtf8 => {
- return make_scalar_function(starts_with::<i64>, vec![])(args);
+ DataType::Utf8View | DataType::Utf8 | DataType::LargeUtf8 => {
+ make_scalar_function(starts_with, vec![])(args)
}
- _ => internal_err!("Unsupported data type"),
+ _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?,
}
}
}
+
+#[cfg(test)]
+mod tests {
+ use crate::utils::test::test_function;
+ use arrow::array::{Array, BooleanArray};
+ use arrow::datatypes::DataType::Boolean;
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ use super::*;
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ // Generate test cases for starts_with
+ let test_cases = vec![
+ (Some("alphabet"), Some("alph"), Some(true)),
+ (Some("alphabet"), Some("bet"), Some(false)),
+ (
+ Some("somewhat large string"),
+ Some("somewhat large"),
+ Some(true),
+ ),
+ (Some("somewhat large string"), Some("large"), Some(false)),
+ ]
+ .into_iter()
+ .flat_map(|(a, b, c)| {
+ let utf_8_args = vec![
+ ColumnarValue::Scalar(ScalarValue::Utf8(a.map(|s| s.to_string()))),
+ ColumnarValue::Scalar(ScalarValue::Utf8(b.map(|s| s.to_string()))),
+ ];
+
+ let large_utf_8_args = vec![
+ ColumnarValue::Scalar(ScalarValue::LargeUtf8(a.map(|s| s.to_string()))),
+ ColumnarValue::Scalar(ScalarValue::LargeUtf8(b.map(|s| s.to_string()))),
+ ];
+
+ let utf_8_view_args = vec![
+ ColumnarValue::Scalar(ScalarValue::Utf8View(a.map(|s| s.to_string()))),
+ ColumnarValue::Scalar(ScalarValue::Utf8View(b.map(|s| s.to_string()))),
+ ];
+
+ vec![(utf_8_args, c), (large_utf_8_args, c), (utf_8_view_args, c)]
+ });
+
+ for (args, expected) in test_cases {
+ test_function!(
+ StartsWithFunc::new(),
+ &args,
+ Ok(expected),
+ bool,
+ Boolean,
+ BooleanArray
+ );
+ }
+
+ Ok(())
+ }
+}
diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt
index 763b4e9..584d3b3 100644
--- a/datafusion/sqllogictest/test_files/string_view.slt
+++ b/datafusion/sqllogictest/test_files/string_view.slt
@@ -355,6 +355,75 @@
01)Aggregate: groupBy=[[]], aggr=[[count(DISTINCT test.column1_utf8), count(DISTINCT test.column1_utf8view), count(DISTINCT test.column1_dict)]]
02)--TableScan: test projection=[column1_utf8, column1_utf8view, column1_dict]
+### `STARTS_WITH`
+
+# Test STARTS_WITH with utf8view against utf8view, utf8, and largeutf8
+# (should be no casts)
+query TT
+EXPLAIN SELECT
+ STARTS_WITH(column1_utf8view, column2_utf8view) as c1,
+ STARTS_WITH(column1_utf8view, column2_utf8) as c2,
+ STARTS_WITH(column1_utf8view, column2_large_utf8) as c3
+FROM test;
+----
+logical_plan
+01)Projection: starts_with(test.column1_utf8view, test.column2_utf8view) AS c1, starts_with(test.column1_utf8view, CAST(test.column2_utf8 AS Utf8View)) AS c2, starts_with(test.column1_utf8view, CAST(test.column2_large_utf8 AS Utf8View)) AS c3
+02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view, column2_utf8view]
+
+query BBB
+SELECT
+ STARTS_WITH(column1_utf8view, column2_utf8view) as c1,
+ STARTS_WITH(column1_utf8view, column2_utf8) as c2,
+ STARTS_WITH(column1_utf8view, column2_large_utf8) as c3
+FROM test;
+----
+false false false
+true true true
+true true true
+NULL NULL NULL
+
+# Test STARTS_WITH with utf8 against utf8view, utf8, and largeutf8
+# Should work, but will have to cast to common types
+# should cast utf8 -> utf8view and largeutf8 -> utf8view
+query TT
+EXPLAIN SELECT
+ STARTS_WITH(column1_utf8, column2_utf8view) as c1,
+ STARTS_WITH(column1_utf8, column2_utf8) as c3,
+ STARTS_WITH(column1_utf8, column2_large_utf8) as c4
+FROM test;
+----
+logical_plan
+01)Projection: starts_with(__common_expr_1, test.column2_utf8view) AS c1, starts_with(test.column1_utf8, test.column2_utf8) AS c3, starts_with(__common_expr_1, CAST(test.column2_large_utf8 AS Utf8View)) AS c4
+02)--Projection: CAST(test.column1_utf8 AS Utf8View) AS __common_expr_1, test.column1_utf8, test.column2_utf8, test.column2_large_utf8, test.column2_utf8view
+03)----TableScan: test projection=[column1_utf8, column2_utf8, column2_large_utf8, column2_utf8view]
+
+query BBB
+ SELECT
+ STARTS_WITH(column1_utf8, column2_utf8view) as c1,
+ STARTS_WITH(column1_utf8, column2_utf8) as c3,
+ STARTS_WITH(column1_utf8, column2_large_utf8) as c4
+FROM test;
+----
+false false false
+true true true
+true true true
+NULL NULL NULL
+
+
+# Test STARTS_WITH with utf8view against literals
+# In this case, the literals should be cast to utf8view. The columns
+# should not be cast to utf8.
+query TT
+EXPLAIN SELECT
+ STARTS_WITH(column1_utf8view, 'äöüß') as c1,
+ STARTS_WITH(column1_utf8view, '') as c2,
+ STARTS_WITH(column1_utf8view, NULL) as c3,
+ STARTS_WITH(NULL, column1_utf8view) as c4
+FROM test;
+----
+logical_plan
+01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
+02)--TableScan: test projection=[column1_utf8view]
statement ok
drop table test;
@@ -376,6 +445,5 @@
----
2024-01-23
-
statement ok
drop table dates;