// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::any::Any;
use std::sync::Arc;
use arrow::array::{
new_null_array, Array, ArrayRef, AsArray, Float32Array, Float64Array,
};
use arrow::compute;
use arrow::datatypes::{DataType, Float64Type};
use arrow::record_batch::RecordBatch;
use datafusion::common::{exec_err, internal_err, ScalarValue};
use datafusion::error::Result;
use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties};
use datafusion::logical_expr::Volatility;
use datafusion::logical_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
};
use datafusion::prelude::*;
/// This example shows how to use the full ScalarUDFImpl API to implement a user
/// defined function. As in the `simple_udf.rs` example, this struct implements
/// a function that takes two arguments and returns the first argument raised to
/// the power of the second argument `a^b`.
///
/// To do so, we must implement the `ScalarUDFImpl` trait.
#[derive(Debug, PartialEq, Eq, Hash)]
struct PowUdf {
signature: Signature,
aliases: Vec<String>,
}
impl PowUdf {
/// Create a new instance of the `PowUdf` struct
fn new() -> Self {
Self {
signature: Signature::exact(
// this function will always take two arguments of type f64
vec![DataType::Float64, DataType::Float64],
// this function is deterministic and will always return the same
// result for the same input
Volatility::Immutable,
),
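            // Hedged note: assuming the same DataFusion `Signature` API, an
            // equivalent signature could also be written as
            // `Signature::uniform(2, vec![DataType::Float64], Volatility::Immutable)`,
            // which requires two arguments, each drawn from the listed types.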
// we will also add an alias of "my_pow"
aliases: vec!["my_pow".to_string()],
}
}
}
impl ScalarUDFImpl for PowUdf {
/// We implement as_any so that we can downcast the ScalarUDFImpl trait object
fn as_any(&self) -> &dyn Any {
self
}
/// Return the name of this function
fn name(&self) -> &str {
"pow"
}
/// Return the "signature" of this function -- namely what types of arguments it will take
fn signature(&self) -> &Signature {
&self.signature
}
/// What is the type of value that will be returned by this function? In
/// this case it will always be a constant value, but it could also be a
/// function of the input types.
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
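        // A hedged sketch of an input-dependent return type (hypothetical,
        // not used here): returning `Ok(arg_types[0].clone())` would make the
        // output type follow whatever type the first argument was coerced to.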
Ok(DataType::Float64)
}
/// This function actually calculates the results of the scalar function.
///
/// This is the same way that functions provided with DataFusion are invoked,
/// which permits important special cases:
///
    /// 1. When one or both of the arguments are single values (constants).
    ///    For example `pow(a, 2)`
/// 2. When the input arrays can be reused (avoid allocating a new output array)
///
/// However, it also means the implementation is more complex than when
/// using `create_udf`.
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        // The other fields of the `args` struct are used for more specialized
        // cases and are not needed in this example
let ScalarFunctionArgs { mut args, .. } = args;
// DataFusion has arranged for the correct inputs to be passed to this
// function, but we check again to make sure
assert_eq!(args.len(), 2);
// take ownership of arguments by popping in reverse order
let exp = args.pop().unwrap();
let base = args.pop().unwrap();
assert_eq!(base.data_type(), DataType::Float64);
assert_eq!(exp.data_type(), DataType::Float64);
match (base, exp) {
// For demonstration purposes we also implement the scalar / scalar
// case here, but it is not typically required for high performance.
//
// For performance it is most important to optimize cases where at
// least one argument is an array. If all arguments are constants,
// the DataFusion expression simplification logic will often invoke
// this path once during planning, and simply use the result during
// execution.
(
ColumnarValue::Scalar(ScalarValue::Float64(base)),
ColumnarValue::Scalar(ScalarValue::Float64(exp)),
) => {
// compute the output. Note DataFusion treats `None` as NULL.
let res = match (base, exp) {
(Some(base), Some(exp)) => Some(base.powf(exp)),
// one or both arguments were NULL
_ => None,
};
Ok(ColumnarValue::Scalar(ScalarValue::from(res)))
}
// special case if the exponent is a constant
(
ColumnarValue::Array(base_array),
ColumnarValue::Scalar(ScalarValue::Float64(exp)),
) => {
let result_array = match exp {
// a ^ null = null
None => new_null_array(base_array.data_type(), base_array.len()),
// a ^ exp
Some(exp) => {
// DataFusion has ensured both arguments are Float64:
let base_array = base_array.as_primitive::<Float64Type>();
// calculate the result for every row. The `unary`
// kernel creates very fast "vectorized" code and
// handles things like null values for us.
let res: Float64Array =
compute::unary(base_array, |base| base.powf(exp));
Arc::new(res)
}
};
Ok(ColumnarValue::Array(result_array))
}
// special case if the base is a constant.
//
// Note this case is very similar to the previous case, so we could
// use the same pattern. However, for this case we demonstrate an
// even more advanced pattern to potentially avoid allocating a new array
(
ColumnarValue::Scalar(ScalarValue::Float64(base)),
ColumnarValue::Array(exp_array),
) => {
let res = match base {
None => new_null_array(exp_array.data_type(), exp_array.len()),
Some(base) => maybe_pow_in_place(base, exp_array)?,
};
Ok(ColumnarValue::Array(res))
}
// Both arguments are arrays so we have to perform the calculation
// for every row
//
            // Note this could also be done in place using `binary_mut` as
            // is done in `maybe_pow_in_place`, but here we use `binary` for simplicity
(ColumnarValue::Array(base_array), ColumnarValue::Array(exp_array)) => {
let res: Float64Array = compute::binary(
base_array.as_primitive::<Float64Type>(),
exp_array.as_primitive::<Float64Type>(),
|base, exp| base.powf(exp),
)?;
Ok(ColumnarValue::Array(Arc::new(res)))
}
// if the types were not float, it is a bug in DataFusion
_ => {
internal_err!("Invalid argument types to pow function")
}
}
}
/// We will also add an alias of "my_pow"
fn aliases(&self) -> &[String] {
&self.aliases
}
fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
        // The `pow` function preserves the ordering of its first argument (the base).
Ok(input[0].sort_properties)
}
}
/// Evaluate `base ^ exp` *without* allocating a new array, if possible
fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result<ArrayRef> {
// Calling `unary` creates a new array for the results. Avoiding
// allocations is a common optimization in performance critical code.
// arrow-rs allows this optimization via the `unary_mut`
// and `binary_mut` kernels in certain cases
//
// These kernels can only be used if there are no other references to
// the arrays (exp_array has to be the last remaining reference).
let owned_array = exp_array
// as in the previous example, we first downcast to &Float64Array
.as_primitive::<Float64Type>()
        // Non-obviously, we call clone here to get an owned `Float64Array`.
        // Calling clone() is relatively inexpensive as it increments
        // some ref counts but doesn't copy the underlying data.
//
// Once we have the owned Float64Array we can drop the original
// exp_array (untyped) reference
.clone();
// We *MUST* drop the reference to `exp_array` explicitly so that
// owned_array is the only reference remaining in this function.
//
    // Note that depending on the query there may still be other references
    // to the underlying buffers, which would prevent reuse. The only way to
    // know for sure is to check the result of `compute::unary_mut`.
drop(exp_array);
// If we have the only reference, compute the result directly into the same
// allocation as was used for the input array
match compute::unary_mut(owned_array, |exp| base.powf(exp)) {
Err(_orig_array) => {
            // unary_mut will return the original array if there are other
            // references into the underlying buffer (and thus reuse is
            // impossible)
//
            // In a real implementation, this case should fall back to
            // calling `unary` and allocating a new array. In this example
            // we return an error for demonstration purposes, as sketched below.
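            //
            // A hedged sketch of that fallback (hypothetical, and left
            // commented out so this example can still demonstrate the error
            // path; it assumes the binding is renamed to `orig_array`):
            //
            //   let res: Float64Array =
            //       compute::unary(&orig_array, |exp| base.powf(exp));
            //   return Ok(Arc::new(res));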
exec_err!("Could not reuse array for maybe_pow_in_place")
}
        // a result of `Ok` means the operation ran successfully and the input
        // allocation was reused for the output
Ok(res) => Ok(Arc::new(res)),
}
}
/// create local execution context with an in-memory table:
///
/// ```text
/// +-----+-----+
/// | a | b |
/// +-----+-----+
/// | 2.1 | 1.0 |
/// | 3.1 | 2.0 |
/// | 4.1 | 3.0 |
/// | 5.1 | 4.0 |
/// +-----+-----+
/// ```
fn create_context() -> Result<SessionContext> {
// define data.
let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1]));
let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;
// declare a new context. In Spark API, this corresponds to a new SparkSession
let ctx = SessionContext::new();
// declare a table in memory. In Spark API, this corresponds to createDataFrame(...).
ctx.register_batch("t", batch)?;
Ok(ctx)
}
/// In this example we register `PowUdf` as a user defined function
/// and invoke it via the DataFrame API and SQL
pub async fn advanced_udf() -> Result<()> {
let ctx = create_context()?;
// create the UDF
let pow = ScalarUDF::from(PowUdf::new());
// register the UDF with the context so it can be invoked by name and from SQL
ctx.register_udf(pow.clone());
// get a DataFrame from the context for scanning the "t" table
let df = ctx.table("t").await?;
// Call pow(a, 10) using the DataFrame API
let df = df.select(vec![pow.call(vec![col("a"), lit(10i32)])])?;
// note that the second argument is passed as an i32, not f64. DataFusion
// automatically coerces the types to match the UDF's defined signature.
// print the results
df.show().await?;
// You can also invoke both pow(2, 10) and its alias my_pow(a, b) using SQL
ctx.sql("SELECT pow(2, 10), my_pow(a, b) FROM t")
.await?
.show()
.await?;
    // You can also exercise `maybe_pow_in_place` by passing a constant base
    // and the column `a` as the exponent. If there is only a single
    // reference to `a` the code works well
ctx.sql("SELECT pow(2, a) FROM t").await?.show().await?;
    // However, if there are multiple references to `a` during evaluation,
    // the array storage cannot be reused
let err = ctx
.sql("SELECT pow(2, a), pow(3, a) FROM t")
.await?
.show()
.await
.unwrap_err();
assert_eq!(
err.to_string(),
"Execution error: Could not reuse array for maybe_pow_in_place"
);
Ok(())
}
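// A minimal driver sketch (hypothetical; the surrounding example crate may
// wire this function up differently):
//
//   #[tokio::main]
//   async fn main() -> datafusion::error::Result<()> {
//       advanced_udf().await
//   }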