"""Pyspark UDFs
Has to only contain map operations!
Notes:
1. Hamilton will first try to satisfy running these UDFs with columns from the dataframe, else it will take from the
input dictionary that are not part of the pyspark dataframe.
2. UDFs defined this way operate in a row-by-row fashion, so they are not vectorized.
"""
def spend_per_signup(spend: float, signups: float) -> float:
"""The cost per signup in relation to spend."""
    return spend / signups


def augmented_mean(foo: float, bar: float) -> float:
    """Shows that you can include functions whose inputs do not come from dataframe columns, e.g. if you want
    Hamilton to compute other things alongside the dataframe. If Hamilton does not find a match in the dataframe,
    it looks for a match in the inputs dictionary.
    """
    return foo + bar


def spend_zero_mean(spend: float, spend_mean: float) -> float:
"""Computes zero mean spend.
Note:
`spend_mean` here COULD come from the dataframe OR the input dictionary.
"""
return spend - spend_mean
def spend_zero_mean_unit_variance(spend_zero_mean: float, spend_std_dev: float) -> float:
"""Function showing one way to make spend have zero mean and unit variance.
Note:
`spend_std_dev` here COULD come from the pyspark dataframe OR the input dictionary.
"""
return spend_zero_mean / spend_std_dev
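

# Usage sketch: one way to run the UDFs above through Hamilton's PySparkUDFGraphAdapter.
# The toy data, the `local[1]` Spark master, and the scalar values chosen for `spend_mean` and
# `spend_std_dev` are illustrative assumptions only; exact wiring may differ across Hamilton versions.
if __name__ == "__main__":
    import sys

    import pandas as pd
    from pyspark.sql import SparkSession

    from hamilton import driver
    from hamilton.plugins import h_spark

    spark = SparkSession.builder.master("local[1]").appName("pyspark_udfs_sketch").getOrCreate()

    # Toy data: `spend` and `signups` become columns of the pyspark dataframe.
    spark_df = spark.createDataFrame(
        pd.DataFrame({"spend": [10.0, 20.0, 30.0], "signups": [1.0, 2.0, 4.0]})
    )

    # The adapter runs each requested Hamilton function as a row-by-row pyspark UDF.
    dr = driver.Driver({}, sys.modules[__name__], adapter=h_spark.PySparkUDFGraphAdapter())

    # Map each dataframe column name to the dataframe itself; anything Hamilton cannot find there
    # (e.g. spend_mean, spend_std_dev) is supplied as a plain scalar input.
    execute_inputs = {col: spark_df for col in spark_df.columns}
    execute_inputs.update({"spend_mean": 20.0, "spend_std_dev": 10.0})

    result = dr.execute(
        ["spend_per_signup", "spend_zero_mean_unit_variance"],
        inputs=execute_inputs,
    )
    result.show()
    spark.stop()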