// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
use std::any::Any;
use std::sync::Arc;
use arrow::array::{
new_null_array, Array, ArrayRef, AsArray, Float32Array, Float64Array,
};
use arrow::compute;
use arrow::datatypes::{DataType, Float64Type};
use arrow::record_batch::RecordBatch;
use datafusion::common::{exec_err, internal_err, ScalarValue};
use datafusion::error::Result;
use datafusion::logical_expr::sort_properties::{ExprProperties, SortProperties};
use datafusion::logical_expr::Volatility;
use datafusion::logical_expr::{
ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature,
};
use datafusion::prelude::*;
/// This example shows how to use the full ScalarUDFImpl API to implement a user
/// defined function. As in the `simple_udf.rs` example, this struct implements
/// a function that takes two arguments and returns the first argument raised to
/// the power of the second argument `a^b`.
///
/// To do so, we must implement the `ScalarUDFImpl` trait.
#[derive(Debug, PartialEq, Eq, Hash)]
struct PowUdf {
signature: Signature,
aliases: Vec<String>,
}
impl PowUdf {
/// Create a new instance of the `PowUdf` struct
fn new() -> Self {
Self {
signature: Signature::exact(
// this function will always take two arguments of type f64
vec![DataType::Float64, DataType::Float64],
// this function is deterministic and will always return the same
// result for the same input
Volatility::Immutable,
),
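            // Hedged note: assuming the same DataFusion `Signature` API, an
            // equivalent signature could also be written as
            // `Signature::uniform(2, vec![DataType::Float64], Volatility::Immutable)`,
            // which requires two arguments, each drawn from the listed types.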
// we will also add an alias of "my_pow"
aliases: vec!["my_pow".to_string()],
}
}
}
impl ScalarUDFImpl for PowUdf {
/// We implement as_any so that we can downcast the ScalarUDFImpl trait object
fn as_any(&self) -> &dyn Any {
self
}
/// Return the name of this function
fn name(&self) -> &str {
"pow"
}
/// Return the "signature" of this function -- namely what types of arguments it will take
fn signature(&self) -> &Signature {
&self.signature
}
/// What is the type of value that will be returned by this function? In
/// this case it will always be a constant value, but it could also be a
/// function of the input types.
fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
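        // A hedged sketch of an input-dependent return type (hypothetical,
        // not used here): returning `Ok(arg_types[0].clone())` would make the
        // output type follow whatever type the first argument was coerced to.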
Ok(DataType::Float64)
}
/// This function actually calculates the results of the scalar function.
///
/// This is the same way that functions provided with DataFusion are invoked,
/// which permits important special cases:
///
    /// 1. When one or both of the arguments are single values (constants).
    ///    For example `pow(a, 2)`
/// 2. When the input arrays can be reused (avoid allocating a new output array)
///
/// However, it also means the implementation is more complex than when
/// using `create_udf`.
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        // The other fields of the `args` struct are used for more specialized
        // cases and are not needed in this example
let ScalarFunctionArgs { mut args, .. } = args;
// DataFusion has arranged for the correct inputs to be passed to this
// function, but we check again to make sure
assert_eq!(args.len(), 2);
// take ownership of arguments by popping in reverse order
let exp = args.pop().unwrap();
let base = args.pop().unwrap();
assert_eq!(base.data_type(), DataType::Float64);
assert_eq!(exp.data_type(), DataType::Float64);
match (base, exp) {
// For demonstration purposes we also implement the scalar / scalar
// case here, but it is not typically required for high performance.
//
// For performance it is most important to optimize cases where at
// least one argument is an array. If all arguments are constants,
// the DataFusion expression simplification logic will often invoke
// this path once during planning, and simply use the result during
// execution.
(
ColumnarValue::Scalar(ScalarValue::Float64(base)),
ColumnarValue::Scalar(ScalarValue::Float64(exp)),
) => {
// compute the output. Note DataFusion treats `None` as NULL.
let res = match (base, exp) {
(Some(base), Some(exp)) => Some(base.powf(exp)),
// one or both arguments were NULL
_ => None,
};
Ok(ColumnarValue::Scalar(ScalarValue::from(res)))
}
// special case if the exponent is a constant
(
ColumnarValue::Array(base_array),
ColumnarValue::Scalar(ScalarValue::Float64(exp)),
) => {
let result_array = match exp {
// a ^ null = null
None => new_null_array(base_array.data_type(), base_array.len()),
// a ^ exp
Some(exp) => {
// DataFusion has ensured both arguments are Float64:
let base_array = base_array.as_primitive::<Float64Type>();
// calculate the result for every row. The `unary`
// kernel creates very fast "vectorized" code and
// handles things like null values for us.
let res: Float64Array =
compute::unary(base_array, |base| base.powf(exp));
Arc::new(res)
}
};
Ok(ColumnarValue::Array(result_array))
}
// special case if the base is a constant.
//
// Note this case is very similar to the previous case, so we could
// use the same pattern. However, for this case we demonstrate an
// even more advanced pattern to potentially avoid allocating a new array
(
ColumnarValue::Scalar(ScalarValue::Float64(base)),
ColumnarValue::Array(exp_array),
) => {
let res = match base {
None => new_null_array(exp_array.data_type(), exp_array.len()),
Some(base) => maybe_pow_in_place(base, exp_array)?,
};
Ok(ColumnarValue::Array(res))
}
// Both arguments are arrays so we have to perform the calculation
// for every row
//
            // Note this could also be done in place using `binary_mut` as
            // is done in `maybe_pow_in_place`, but here we use `binary` for simplicity
(ColumnarValue::Array(base_array), ColumnarValue::Array(exp_array)) => {
let res: Float64Array = compute::binary(
base_array.as_primitive::<Float64Type>(),
exp_array.as_primitive::<Float64Type>(),
|base, exp| base.powf(exp),
)?;
Ok(ColumnarValue::Array(Arc::new(res)))
}
// if the types were not float, it is a bug in DataFusion
_ => {
internal_err!("Invalid argument types to pow function")
}
}
}
/// We will also add an alias of "my_pow"
fn aliases(&self) -> &[String] {
&self.aliases
}
fn output_ordering(&self, input: &[ExprProperties]) -> Result<SortProperties> {
        // The `pow` function preserves the ordering of its first argument (the base).
Ok(input[0].sort_properties)
}
}
/// Evaluate `base ^ exp` *without* allocating a new array, if possible
fn maybe_pow_in_place(base: f64, exp_array: ArrayRef) -> Result<ArrayRef> {
// Calling `unary` creates a new array for the results. Avoiding
// allocations is a common optimization in performance critical code.
// arrow-rs allows this optimization via the `unary_mut`
// and `binary_mut` kernels in certain cases
//
// These kernels can only be used if there are no other references to
// the arrays (exp_array has to be the last remaining reference).
let owned_array = exp_array
// as in the previous example, we first downcast to &Float64Array
.as_primitive::<Float64Type>()
        // Non-obviously, we call clone here to get an owned `Float64Array`.
        // Calling clone() is relatively inexpensive as it increments
        // some ref counts but doesn't copy the underlying data.
//
// Once we have the owned Float64Array we can drop the original
// exp_array (untyped) reference
.clone();
// We *MUST* drop the reference to `exp_array` explicitly so that
// owned_array is the only reference remaining in this function.
//
    // Note that depending on the query there may still be other references
    // to the underlying buffers, which would prevent reuse. The only way to
    // know for sure is to check the result of `compute::unary_mut`.
drop(exp_array);
// If we have the only reference, compute the result directly into the same
// allocation as was used for the input array
match compute::unary_mut(owned_array, |exp| base.powf(exp)) {
Err(_orig_array) => {
            // unary_mut will return the original array if there are other
            // references into the underlying buffer (and thus reuse is
            // impossible)
//
            // In a real implementation, this case should fall back to
            // calling `unary` and allocating a new array. In this example
            // we return an error for demonstration purposes, as sketched below.
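            //
            // A hedged sketch of that fallback (hypothetical, and left
            // commented out so this example can still demonstrate the error
            // path; it assumes the binding is renamed to `orig_array`):
            //
            //   let res: Float64Array =
            //       compute::unary(&orig_array, |exp| base.powf(exp));
            //   return Ok(Arc::new(res));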
exec_err!("Could not reuse array for maybe_pow_in_place")
}
        // a result of `Ok` means the operation ran successfully and the input
        // allocation was reused for the output
Ok(res) => Ok(Arc::new(res)),
}
}
/// create local execution context with an in-memory table:
///
/// ```text
/// +-----+-----+
/// | a | b |
/// +-----+-----+
/// | 2.1 | 1.0 |
/// | 3.1 | 2.0 |
/// | 4.1 | 3.0 |
/// | 5.1 | 4.0 |
/// +-----+-----+
/// ```
fn create_context() -> Result<SessionContext> {
// define data.
let a: ArrayRef = Arc::new(Float32Array::from(vec![2.1, 3.1, 4.1, 5.1]));
let b: ArrayRef = Arc::new(Float64Array::from(vec![1.0, 2.0, 3.0, 4.0]));
let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)])?;
// declare a new context. In Spark API, this corresponds to a new SparkSession
let ctx = SessionContext::new();
// declare a table in memory. In Spark API, this corresponds to createDataFrame(...).
ctx.register_batch("t", batch)?;
Ok(ctx)
}
/// In this example we register `PowUdf` as a user defined function
/// and invoke it via the DataFrame API and SQL
pub async fn advanced_udf() -> Result<()> {
let ctx = create_context()?;
// create the UDF
let pow = ScalarUDF::from(PowUdf::new());
// register the UDF with the context so it can be invoked by name and from SQL
ctx.register_udf(pow.clone());
// get a DataFrame from the context for scanning the "t" table
let df = ctx.table("t").await?;
// Call pow(a, 10) using the DataFrame API
let df = df.select(vec![pow.call(vec![col("a"), lit(10i32)])])?;
// note that the second argument is passed as an i32, not f64. DataFusion
// automatically coerces the types to match the UDF's defined signature.
// print the results
df.show().await?;
// You can also invoke both pow(2, 10) and its alias my_pow(a, b) using SQL
ctx.sql("SELECT pow(2, 10), my_pow(a, b) FROM t")
.await?
.show()
.await?;
    // You can also exercise `maybe_pow_in_place` by passing a constant base
    // and the column `a` as the exponent. If there is only a single
    // reference to `a` the code works well
ctx.sql("SELECT pow(2, a) FROM t").await?.show().await?;
    // However, if there are multiple references to `a` during evaluation,
    // the array storage cannot be reused
let err = ctx
.sql("SELECT pow(2, a), pow(3, a) FROM t")
.await?
.show()
.await
.unwrap_err();
assert_eq!(
err.to_string(),
"Execution error: Could not reuse array for maybe_pow_in_place"
);
Ok(())
}
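// A minimal driver sketch (hypothetical; the surrounding example crate may
// wire this function up differently):
//
//   #[tokio::main]
//   async fn main() -> datafusion::error::Result<()> {
//       advanced_udf().await
//   }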