"""Pandas UDFs.
Has to only contain map operations! Aggregations, filters, etc. are not supported.
Notes:
1. Functions that are annotated with pd.Series for all inputs and output, will be converted to pandas UDFs.
Note - we could broaden support for more function type signatures, but this is a good start. Please make an
issue/discussion if this is something you'd like to see.
2. You need to use the `h_typing.column` annotation for the output type. This is because we need to know what
underlying primitive type the pandas series is. This is a limitation of the pandas UDFs on pyspark.
3. If a function is deemed to be a pandas_udf one, Hamilton will try to satisfy running these UDFs with columns from
the dataframe ONLY. This is different from how the vanilla UDFs behave. This is partially a limitation of pandas UDFs.
4. Pandas_udfs operate over chunks of data, and can thus operate in a vectorized manner. This is a big performance gain
over vanilla UDFs.
5. You can have non-pandas_udf functions in the same file, and will be run as row based UDFs.
"""
import pandas as pd

from hamilton.htypes import column
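
# For reference: a hand-written pyspark pandas UDF equivalent of `spend_per_signup` below would look roughly like
# the sketch that follows. This is illustration only -- Hamilton builds something along these lines for you from the
# plain function definitions in this module, and the "double" return type here is an assumption about the column type.
#
#   import pyspark.sql.functions as F
#
#   @F.pandas_udf("double")
#   def spend_per_signup(spend: pd.Series, signups: pd.Series) -> pd.Series:
#       return spend / signups
#
#   df = df.withColumn("spend_per_signup", spend_per_signup("spend", "signups"))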


def spend_per_signup(spend: pd.Series, signups: pd.Series) -> column[pd.Series, float]:
    """The cost per signup in relation to spend."""
    return spend / signups


def augmented_mean(foo: float, bar: float) -> float:
    """Shows that you can include functions that don't depend on columns in the dataframe, if you want to
    compute other things with Hamilton at the same time. If Hamilton does not find a match in the
    dataframe, it'll look for a match in the inputs dictionary.
    """
    return foo + bar


def spend_zero_mean(spend: pd.Series, spend_mean: pd.Series) -> column[pd.Series, float]:
    """Computes zero mean spend.

    Note:
        `spend_mean` here HAS TO come from the dataframe or the input dictionary.
    """
    return spend - spend_mean


def spend_zero_mean_unit_variance(
    spend_zero_mean: pd.Series, spend_std_dev: pd.Series
) -> column[pd.Series, float]:
    """Function showing one way to make spend have zero mean and unit variance.

    Note:
        `spend_std_dev` here HAS TO come from the pyspark dataframe.
    """
    return spend_zero_mean / spend_std_dev
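

if __name__ == "__main__":
    # A minimal, hedged sketch of how the functions above might be run against a pyspark dataframe.
    # Assumptions to verify against your Hamilton version: the pyspark UDF adapter is
    # hamilton.plugins.h_spark.PySparkUDFGraphAdapter (older releases exposed it under
    # hamilton.experimental.h_spark), column inputs are passed by mapping each column name to the
    # dataframe, and the result of `execute` is the dataframe with the requested columns appended.
    import sys

    from pyspark.sql import SparkSession

    from hamilton import driver
    from hamilton.plugins import h_spark

    spark = SparkSession.builder.master("local[1]").getOrCreate()
    spark_df = spark.createDataFrame(
        pd.DataFrame(
            {
                "spend": [10.0, 20.0, 40.0],
                "signups": [1.0, 4.0, 8.0],
                "spend_mean": [23.3, 23.3, 23.3],
                "spend_std_dev": [12.5, 12.5, 12.5],
            }
        )
    )
    # Normally this file would be imported as a module from a separate run.py; here we pass the
    # currently running module itself.
    this_module = sys.modules[__name__]
    dr = driver.Driver({}, this_module, adapter=h_spark.PySparkUDFGraphAdapter())
    # Columns are satisfied from the dataframe; scalars like `foo`/`bar` come from the inputs dictionary.
    inputs = {col: spark_df for col in spark_df.columns}
    inputs.update({"foo": 1.0, "bar": 2.0})
    result = dr.execute(
        ["spend_per_signup", "augmented_mean", "spend_zero_mean_unit_variance"],
        inputs=inputs,
    )
    result.show()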