blob: f1646e4f02e9f6aad7b0d2b8915fbc309c9134e8 [file] [log] [blame]
from typing import List
import numpy as np
import pandas as pd
from hamilton.function_modifiers import config, extract_columns
"""Module level constants are convenient when you want your Hamilton driver to
produce a list of outputs. For example, you can do:
```
from hamilton import driver
import prepare_data
dr = driver.Driver({}, prepare_data)
outputs = dr.execute(final_vars=prepare_data.ALL_FEATURES)
```
"""
# Column names (already in snake_case) that the sanitized dataframe is split
# into; each one is exposed as its own node through
# `@extract_columns(*SOURCE_COLUMNS)` on the `sanitized_df` functions.
SOURCE_COLUMNS = [
    # record identifier and absence details
    "id",
    "reason_for_absence",
    "month_of_absence",
    "day_of_the_week",
    "seasons",
    # commute and job related columns
    "transportation_expense",
    "distance_from_residence_to_work",
    "service_time",
    "age",
    "work_load_average_per_day",
    "hit_target",
    "disciplinary_failure",
    # personal attributes
    "education",
    "son",
    "social_drinker",
    "social_smoker",
    "pet",
    "weight",
    "height",
    "body_mass_index",
    # absence duration
    "absenteeism_time_in_hours",
]
# Names of the nodes to request from the driver as final outputs,
# e.g. `dr.execute(final_vars=ALL_FEATURES)`.
ALL_FEATURES = [
    # columns passed through from the source data
    "id",
    "reason_for_absence",
    "distance_from_residence_to_work",
    "service_time",
    "social_drinker",
    "social_smoker",
    # features derived by the functions in this module
    "age_zero_mean_unit_variance",
    "has_children",
    "has_pet",
    "is_summer",
    # one-hot encoded day of the week
    "day_of_the_week_2",
    "day_of_the_week_3",
    "day_of_the_week_4",
    "day_of_the_week_5",
    "day_of_the_week_6",
]
def _rename_columns(columns: List[str]) -> List[str]:
"""convert raw data column names to snakecase and make them compatible
with Hamilton's naming convention (need to be a valid Python function name)
NOTE functions prefixed with "_" are not loaded as Hamilton nodes
"""
return [c.strip().replace("/", "_per_").replace(" ", "_").lower() for c in columns]
@config.when(development_flag=False)
@extract_columns(*SOURCE_COLUMNS)
def sanitized_df__prod(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Return the raw data with column names converted to snake_case.

    The new names are compatible with Hamilton's naming convention (they need
    to be valid Python function names). The caller's `raw_df` is left
    untouched: the rename is applied to a copy of the frame.

    NOTE config when is used. Functions with a suffix of "__condition" describe
    alternative of the same Hamilton node

    :param raw_df: dataframe with the original, un-sanitized column labels.
    :return: a dataframe with the same rows and sanitized column labels.
    """
    # A shallow copy gets its own column index without duplicating the
    # underlying data, so assigning new labels does not leak back to `raw_df`.
    sanitized = raw_df.copy(deep=False)
    sanitized.columns = _rename_columns(raw_df.columns)
    return sanitized
@config.when_not(development_flag=False)
@extract_columns(*SOURCE_COLUMNS)
def sanitized_df__dev(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Identical to sanitized_df__prod(), but loads only first 20 rows.

    The caller's `raw_df` is left untouched: only the 20-row copy gets the
    sanitized column labels.

    :param raw_df: dataframe with the original, un-sanitized column labels.
    :return: the first 20 rows of `raw_df` with sanitized column labels.
    """
    # Copy the small preview first so that relabelling it cannot affect
    # `raw_df` (copying 20 rows is cheap).
    preview = raw_df.head(20).copy()
    preview.columns = _rename_columns(raw_df.columns)
    return preview
def age_mean(age: pd.Series) -> np.float64:
    """Mean of the `age` column, as computed by `Series.mean()`."""
    mean_value = age.mean()
    return mean_value
def age_zero_mean(age: pd.Series, age_mean: np.float64) -> pd.Series:
    """Age shifted by its mean, i.e. `age - age_mean` element-wise."""
    centered = age - age_mean
    return centered
def age_std_dev(age: pd.Series) -> np.float64:
    """Standard deviation of age, from `Series.std()` with default arguments."""
    spread = age.std()
    return spread
def age_zero_mean_unit_variance(age_zero_mean: pd.Series, age_std_dev: np.float64) -> pd.Series:
    """Standardized age: the mean-centered age divided by its standard deviation."""
    standardized = age_zero_mean / age_std_dev
    return standardized
def has_children(son: pd.Series) -> pd.Series:
    """Binary indicator: 1 where `son` is greater than zero, otherwise 0."""
    is_parent = son.gt(0)
    return is_parent.astype(int)
def has_pet(pet: pd.Series) -> pd.Series:
    """Binary indicator: 1 where `pet` is greater than zero, otherwise 0."""
    owns_pet = pet.gt(0)
    return owns_pet.astype(int)
def is_summer(month_of_absence: pd.Series) -> pd.Series:
    """Flag absences in the Brazilian summer (December, January, February).

    Returns 1 where the month is 12, 1 or 2, and 0 otherwise.
    """
    summer_months = [1, 2, 12]
    in_summer = month_of_absence.isin(summer_months)
    return in_summer.astype(int)
@extract_columns(
    "day_of_the_week_2",
    "day_of_the_week_3",
    "day_of_the_week_4",
    "day_of_the_week_5",
    "day_of_the_week_6",
)
def day_of_week_encoded(day_of_the_week: pd.Series) -> pd.DataFrame:
    """One hot encodes day of week into five dimensions -- Saturday & Sunday weren't present.

    1 - Sunday, 2 - Monday, 3 - Tuesday, 4 - Wednesday, 5 - Thursday, 6 - Friday, 7 - Saturday.
    NOTE it is not equivalent to pandas dt.dayofweek which has Monday as 0 and Sunday as 6

    The five columns named in `@extract_columns` are always present in the
    result: a weekday that does not occur in `day_of_the_week` (e.g. in a
    small sample of the data) gets an all-zero column instead of being absent.

    :param day_of_the_week: day-of-week codes, using the numbering above.
    :return: indicator columns named "day_of_the_week_<code>", one per code
        found in the input plus any of the codes 2-6 that were not found.
    """
    expected_columns = [f"day_of_the_week_{code}" for code in range(2, 7)]
    encoded = pd.get_dummies(day_of_the_week, prefix="day_of_the_week")

    # get_dummies only creates columns for values it actually sees, so add
    # the ones that extract_columns expects but that are missing here.
    absent = [name for name in expected_columns if name not in encoded.columns]
    if absent:
        # Reuse the indicator dtype pandas picked so all columns stay uniform.
        zero_dtype = encoded.dtypes.iloc[0] if len(encoded.columns) else int
        for name in absent:
            encoded[name] = np.zeros(len(encoded), dtype=zero_dtype)
    return encoded