blob: 829d5b8f14ba29832277fd063fb9810c4d80699c [file] [log] [blame]
"""
Module to transform iris data into features.
"""
import numpy as np
import pandas as pd
from hamilton.function_modifiers import parameterize_sources
# Names of the raw measurement columns. The `@parameterize_sources` decorators in this module
# iterate over this list to generate one `_mean`, `_std` and `_normalized` function per column.
RAW_FEATURES = [
    "sepal_length_cm",
    "sepal_width_cm",
    "petal_length_cm",
    "petal_width_cm",
]
# Here is terser code that does the same thing as the *_log functions below.
# Any `@parameterize*` decorator is just a less verbose way of defining functions that differ
# slightly. We don't see anything wrong with verbose code, so we recommend erring on the side of
# verbosity, but the rest of this example uses the terser form.
# @parameterize_sources(**{f"{col}_log": {"col": col} for col in RAW_FEATURES})
# def log_value(col: pd.Series) -> pd.Series:
#     """Log value of {col}."""
#     return np.log(col)
def sepal_length_cm_log(sepal_length_cm: pd.Series) -> pd.Series:
    """Natural logarithm of the sepal_length_cm feature.

    :param sepal_length_cm: the series to transform.
    :return: the element-wise `np.log` of the input series.
    """
    logged = np.log(sepal_length_cm)
    return logged
def sepal_width_cm_log(sepal_width_cm: pd.Series) -> pd.Series:
    """Natural logarithm of the sepal_width_cm feature.

    :param sepal_width_cm: the series to transform.
    :return: the element-wise `np.log` of the input series.
    """
    logged = np.log(sepal_width_cm)
    return logged
def petal_length_cm_log(petal_length_cm: pd.Series) -> pd.Series:
    """Natural logarithm of the petal_length_cm feature.

    :param petal_length_cm: the series to transform.
    :return: the element-wise `np.log` of the input series.
    """
    logged = np.log(petal_length_cm)
    return logged
def petal_width_cm_log(petal_width_cm: pd.Series) -> pd.Series:
    """Natural logarithm of the petal_width_cm feature.

    :param petal_width_cm: the series to transform.
    :return: the element-wise `np.log` of the input series.
    """
    logged = np.log(petal_width_cm)
    return logged
@parameterize_sources(
    **{f"{col}_mean": {"col": col} for col in RAW_FEATURES}
)
def mean_value(col: pd.Series) -> float:
    """Mean of {col}.

    :param col: the series to average.
    :return: the result of calling `mean()` on the series.
    """
    # NOTE(review): the braces in the docstring's first line look like a placeholder that the
    # decorator fills in for each generated function -- keep them intact; confirm with Hamilton docs.
    average = col.mean()
    return average
@parameterize_sources(
    **{f"{col}_std": {"col": col} for col in RAW_FEATURES}
)
def std_value(col: pd.Series) -> float:
    """Standard deviation of {col}.

    :param col: the series to measure.
    :return: the result of calling `std()` on the series.
    """
    # NOTE(review): the braces in the docstring's first line look like a placeholder that the
    # decorator fills in for each generated function -- keep them intact; confirm with Hamilton docs.
    spread = col.std()
    return spread
@parameterize_sources(
    **{
        f"{col}_normalized": {"col": col, "col_mean": f"{col}_mean", "col_std": f"{col}_std"}
        for col in RAW_FEATURES
    }
)
def normalized_value(col: pd.Series, col_mean: float, col_std: float) -> pd.Series:
    """Normalized column of {col}.

    :param col: the series to normalize.
    :param col_mean: value subtracted from every element of the series.
    :param col_std: value the centered series is divided by.
    :return: the series shifted by `col_mean` and scaled by `col_std`.
    """
    # NOTE(review): the braces in the docstring's first line look like a placeholder that the
    # decorator fills in for each generated function -- keep them intact; confirm with Hamilton docs.
    centered = col - col_mean
    return centered / col_std
def data_set_v1(
    sepal_length_cm_normalized: pd.Series,
    sepal_width_cm_normalized: pd.Series,
    petal_length_cm_normalized: pd.Series,
    petal_width_cm_normalized: pd.Series,
    target_class: pd.Series,
) -> pd.DataFrame:
    """Assemble version 1 of the data set: the four normalized features plus the target class.

    The feature set is spelled out explicitly so it is obvious which columns are included.

    :return: a DataFrame with one column per input series, each named after its parameter.
    """
    # Insertion order of this mapping determines the column order of the resulting frame.
    feature_columns = {
        "sepal_length_cm_normalized": sepal_length_cm_normalized,
        "sepal_width_cm_normalized": sepal_width_cm_normalized,
        "petal_length_cm_normalized": petal_length_cm_normalized,
        "petal_width_cm_normalized": petal_width_cm_normalized,
        "target_class": target_class,
    }
    return pd.DataFrame(feature_columns)
def data_set_v2(
    sepal_length_cm_normalized: pd.Series,
    sepal_width_cm_normalized: pd.Series,
    petal_length_cm_normalized: pd.Series,
    petal_width_cm_normalized: pd.Series,
    sepal_length_cm_log: pd.Series,
    sepal_width_cm_log: pd.Series,
    petal_length_cm_log: pd.Series,
    petal_width_cm_log: pd.Series,
    target_class: pd.Series,
) -> pd.DataFrame:
    """Assemble version 2 of the data set: normalized features, `log` features and the target class.

    The feature set is spelled out explicitly so it is obvious which columns are included.

    :return: a DataFrame with one column per input series, each named after its parameter.
    """
    # Insertion order of this mapping determines the column order of the resulting frame.
    feature_columns = {
        "sepal_length_cm_normalized": sepal_length_cm_normalized,
        "sepal_width_cm_normalized": sepal_width_cm_normalized,
        "petal_length_cm_normalized": petal_length_cm_normalized,
        "petal_width_cm_normalized": petal_width_cm_normalized,
        "sepal_length_cm_log": sepal_length_cm_log,
        "sepal_width_cm_log": sepal_width_cm_log,
        "petal_length_cm_log": petal_length_cm_log,
        "petal_width_cm_log": petal_width_cm_log,
        "target_class": target_class,
    }
    return pd.DataFrame(feature_columns)