# examples/prefect/prepare_data.py - hamilton - Git at Google

 from typing import List

 import numpy as np
 import pandas as pd

 from hamilton.function_modifiers import config, extract_columns

 """Module level constants are convenient when you want your Hamilton driver to
 produce a list of outputs. For example, you can do:

 ```
 from hamilton import driver
 import prepare_data

 dr = driver.Driver({}, prepare_data)
 outputs = dr.execute(final_vars=prepare_data.ALL_FEATURES)
 ```
 """
 # Column names that `@extract_columns(*SOURCE_COLUMNS)` exposes as individual
 # nodes from the sanitized DataFrame (see the `sanitized_df__*` functions).
 # They are written in the snake_case form that `_rename_columns` produces.
 SOURCE_COLUMNS = [
     "id",
     "reason_for_absence",
     "month_of_absence",
     "day_of_the_week",
     "seasons",
     "transportation_expense",
     "distance_from_residence_to_work",
     "service_time",
     "age",
     "work_load_average_per_day",
     "hit_target",
     "disciplinary_failure",
     "education",
     "son",
     "social_drinker",
     "social_smoker",
     "pet",
     "weight",
     "height",
     "body_mass_index",
     "absenteeism_time_in_hours",
 ]

 # Node names intended to be requested together as driver outputs
 # (`final_vars`). Each entry is either one of SOURCE_COLUMNS, a function defined
 # in this module, or a `day_of_the_week_*` column extracted from
 # `day_of_week_encoded`.
 ALL_FEATURES = [
     "id",
     "reason_for_absence",
     "distance_from_residence_to_work",
     "service_time",
     "social_drinker",
     "social_smoker",
     "age_zero_mean_unit_variance",
     "has_children",
     "has_pet",
     "is_summer",
     "day_of_the_week_2",
     "day_of_the_week_3",
     "day_of_the_week_4",
     "day_of_the_week_5",
     "day_of_the_week_6",
 ]


 def _rename_columns(columns: List[str]) -> List[str]:
     """Normalise raw column labels into snake_case identifiers.

     For every label, in this order: surrounding whitespace is stripped, "/" is
     replaced by "_per_", remaining spaces are replaced by "_", and the result
     is lower-cased. This makes the labels usable as Hamilton node names, which
     need to be valid Python function names.

     NOTE functions prefixed with "_" are not loaded as Hamilton nodes
     """
     renamed = []
     for label in columns:
         cleaned = label.strip()
         cleaned = cleaned.replace("/", "_per_")
         cleaned = cleaned.replace(" ", "_")
         renamed.append(cleaned.lower())
     return renamed


 @config.when(development_flag=False)
 @extract_columns(*SOURCE_COLUMNS)
 def sanitized_df__prod(raw_df: pd.DataFrame) -> pd.DataFrame:
     """Return `raw_df` with its column names sanitized by `_rename_columns`.

     Column names are converted to snakecase so they are compatible with
     Hamilton's naming convention (need to be a valid Python function name).

     The renaming is applied to a shallow copy, so the caller's `raw_df` keeps
     its original column labels; the underlying data is not duplicated.

     NOTE config when is used. Functions with a suffix of "__condition" describe
     alternative of the same Hamilton node

     :param raw_df: input DataFrame with the raw column labels.
     :return: DataFrame with the same rows and sanitized column labels.
     """
     # Shallow copy: new axes, shared data. Assigning `.columns` on it does not
     # touch the labels of the DataFrame that was passed in.
     sanitized = raw_df.copy(deep=False)
     sanitized.columns = _rename_columns(raw_df.columns)

     return sanitized


 @config.when_not(development_flag=False)
 @extract_columns(*SOURCE_COLUMNS)
 def sanitized_df__dev(raw_df: pd.DataFrame) -> pd.DataFrame:
     """Development alternative of sanitized_df__prod(): first 20 rows only.

     Column names are sanitized with `_rename_columns` exactly as in the prod
     variant. The renaming is applied to a copy of the 20-row sample, so the
     caller's `raw_df` keeps its original column labels.

     :param raw_df: input DataFrame with the raw column labels.
     :return: at most the first 20 rows of `raw_df`, with sanitized column labels.
     """
     # Copying only the small sample is cheap and decouples it from `raw_df`.
     sample = raw_df.head(20).copy()
     sample.columns = _rename_columns(sample.columns)

     return sample


 def age_mean(age: pd.Series) -> np.float64:
     """Compute the arithmetic mean of the `age` column."""
     mean_age = age.mean()
     return mean_age


 def age_zero_mean(age: pd.Series, age_mean: np.float64) -> pd.Series:
     """Center `age` by subtracting `age_mean` from every value."""
     centered = age.sub(age_mean)
     return centered


 def age_std_dev(age: pd.Series) -> np.float64:
     """Compute the standard deviation of `age` (via `Series.std`)."""
     spread = age.std()
     return spread


 def age_zero_mean_unit_variance(age_zero_mean: pd.Series, age_std_dev: np.float64) -> pd.Series:
     """Scale the centered age values by the standard deviation of age."""
     standardized = age_zero_mean.div(age_std_dev)
     return standardized


 def has_children(son: pd.Series) -> pd.Series:
     """Flag (1/0) rows whose `son` value is greater than zero, i.e. any children."""
     any_child = son.gt(0)
     return any_child.astype(int)


 def has_pet(pet: pd.Series) -> pd.Series:
     """Flag (1/0) rows whose `pet` value is greater than zero, i.e. any pets."""
     any_pet = pet.gt(0)
     return any_pet.astype(int)


 def is_summer(month_of_absence: pd.Series) -> pd.Series:
     """Flag (1/0) absences in the Brazilian summer: December, January, February."""
     summer_months = [1, 2, 12]
     in_summer = month_of_absence.isin(summer_months)
     return in_summer.astype(int)


 @extract_columns(
     "day_of_the_week_2",
     "day_of_the_week_3",
     "day_of_the_week_4",
     "day_of_the_week_5",
     "day_of_the_week_6",
 )
 def day_of_week_encoded(day_of_the_week: pd.Series) -> pd.DataFrame:
     """One-hot encode the day of the week with `pd.get_dummies`.

     The encoding is expected to have five dimensions because Saturday & Sunday
     weren't present in the data. Day codes are:
     1 - Sunday, 2 - Monday, 3 - Tuesday, 4 - Wednesday, 5 - Thursday, 6 - Friday, 7 - Saturday.

     NOTE this numbering differs from pandas dt.dayofweek, where Monday is 0 and
     Sunday is 6
     """
     # The prefix yields columns named "day_of_the_week_<code>", which is what
     # the @extract_columns list above refers to.
     one_hot = pd.get_dummies(data=day_of_the_week, prefix="day_of_the_week")
     return one_hot
	from typing import List

	import numpy as np
	import pandas as pd

	from hamilton.function_modifiers import config, extract_columns

	"""Module level constants are convenient when you want your Hamilton driver to
	produce a list of outputs. For example, you can do:

	```
	from hamilton import driver
	import prepare_data

	dr = driver.Driver({}, prepare_data)
	outputs = dr.execute(final_vars=prepare_data.ALL_FEATURES)
	```
	"""
	# Column names that `@extract_columns(*SOURCE_COLUMNS)` exposes as individual
	# nodes from the sanitized DataFrame (see the `sanitized_df__*` functions).
	# They are written in the snake_case form that `_rename_columns` produces.
	SOURCE_COLUMNS = [
	"id",
	"reason_for_absence",
	"month_of_absence",
	"day_of_the_week",
	"seasons",
	"transportation_expense",
	"distance_from_residence_to_work",
	"service_time",
	"age",
	"work_load_average_per_day",
	"hit_target",
	"disciplinary_failure",
	"education",
	"son",
	"social_drinker",
	"social_smoker",
	"pet",
	"weight",
	"height",
	"body_mass_index",
	"absenteeism_time_in_hours",
	]

	# Node names intended to be requested together as driver outputs
	# (`final_vars`). Each entry is either one of SOURCE_COLUMNS, a function defined
	# in this module, or a `day_of_the_week_*` column extracted from
	# `day_of_week_encoded`.
	ALL_FEATURES = [
	"id",
	"reason_for_absence",
	"distance_from_residence_to_work",
	"service_time",
	"social_drinker",
	"social_smoker",
	"age_zero_mean_unit_variance",
	"has_children",
	"has_pet",
	"is_summer",
	"day_of_the_week_2",
	"day_of_the_week_3",
	"day_of_the_week_4",
	"day_of_the_week_5",
	"day_of_the_week_6",
	]


	def _rename_columns(columns: List[str]) -> List[str]:
	    """Convert raw data column names to snakecase.

	    Each name is stripped of surrounding whitespace, "/" is replaced by
	    "_per_", spaces are replaced by "_", and the result is lower-cased, so the
	    names are compatible with Hamilton's naming convention (need to be a valid
	    Python function name).

	    NOTE functions prefixed with "_" are not loaded as Hamilton nodes

	    :param columns: raw column names.
	    :return: sanitized names, in the same order as `columns`.
	    """
	    return [c.strip().replace("/", "_per_").replace(" ", "_").lower() for c in columns]


	@config.when(development_flag=False)
	@extract_columns(*SOURCE_COLUMNS)
	def sanitized_df__prod(raw_df: pd.DataFrame) -> pd.DataFrame:
	    """Return `raw_df` with its column names sanitized by `_rename_columns`.

	    Column names are converted to snakecase so they are compatible with
	    Hamilton's naming convention (need to be a valid Python function name).

	    The renaming is applied to a shallow copy, so the caller's `raw_df` keeps
	    its original column labels; the underlying data is not duplicated.

	    NOTE config when is used. Functions with a suffix of "__condition" describe
	    alternative of the same Hamilton node

	    :param raw_df: input DataFrame with the raw column labels.
	    :return: DataFrame with the same rows and sanitized column labels.
	    """
	    # Shallow copy: new axes, shared data. Assigning `.columns` on it does not
	    # touch the labels of the DataFrame that was passed in.
	    sanitized = raw_df.copy(deep=False)
	    sanitized.columns = _rename_columns(raw_df.columns)

	    return sanitized


	@config.when_not(development_flag=False)
	@extract_columns(*SOURCE_COLUMNS)
	def sanitized_df__dev(raw_df: pd.DataFrame) -> pd.DataFrame:
	    """Development alternative of sanitized_df__prod(): first 20 rows only.

	    Column names are sanitized with `_rename_columns` exactly as in the prod
	    variant. The renaming is applied to a copy of the 20-row sample, so the
	    caller's `raw_df` keeps its original column labels.

	    :param raw_df: input DataFrame with the raw column labels.
	    :return: at most the first 20 rows of `raw_df`, with sanitized column labels.
	    """
	    # Copying only the small sample is cheap and decouples it from `raw_df`.
	    sample = raw_df.head(20).copy()
	    sample.columns = _rename_columns(sample.columns)

	    return sample


	def age_mean(age: pd.Series) -> np.float64:
	    """Average of age.

	    :param age: the `age` column.
	    :return: result of `age.mean()`.
	    """
	    return age.mean()


	def age_zero_mean(age: pd.Series, age_mean: np.float64) -> pd.Series:
	    """Zero mean of age: every value of `age` minus `age_mean`.

	    :param age: the `age` column.
	    :param age_mean: the mean to subtract.
	    :return: the centered series.
	    """
	    return age - age_mean


	def age_std_dev(age: pd.Series) -> np.float64:
	    """Standard deviation of age.

	    :param age: the `age` column.
	    :return: result of `age.std()`.
	    """
	    return age.std()


	def age_zero_mean_unit_variance(age_zero_mean: pd.Series, age_std_dev: np.float64) -> pd.Series:
	    """Zero mean unit variance value of age.

	    :param age_zero_mean: the centered age values.
	    :param age_std_dev: the standard deviation to divide by.
	    :return: `age_zero_mean` divided element-wise by `age_std_dev`.
	    """
	    return age_zero_mean / age_std_dev


	def has_children(son: pd.Series) -> pd.Series:
	    """Single variable that says whether someone has any children or not.

	    :param son: the `son` column.
	    :return: 1 where `son` is greater than 0, otherwise 0.
	    """
	    return (son > 0).astype(int)


	def has_pet(pet: pd.Series) -> pd.Series:
	    """Single variable that says whether someone has any pets or not.

	    :param pet: the `pet` column.
	    :return: 1 where `pet` is greater than 0, otherwise 0.
	    """
	    return (pet > 0).astype(int)


	def is_summer(month_of_absence: pd.Series) -> pd.Series:
	    """Is it summer in Brazil? i.e. months of December, January, February.

	    :param month_of_absence: the `month_of_absence` column.
	    :return: 1 where the month is 1, 2 or 12, otherwise 0.
	    """
	    return month_of_absence.isin([1, 2, 12]).astype(int)


	@extract_columns(
	    "day_of_the_week_2",
	    "day_of_the_week_3",
	    "day_of_the_week_4",
	    "day_of_the_week_5",
	    "day_of_the_week_6",
	)
	def day_of_week_encoded(day_of_the_week: pd.Series) -> pd.DataFrame:
	    """One hot encodes day of week into five dimensions -- Saturday & Sunday weren't present.
	    1 - Sunday, 2 - Monday, 3 - Tuesday, 4 - Wednesday, 5 - Thursday, 6 - Friday, 7 - Saturday.

	    NOTE it is not equivalent to pandas dt.dayofweek which has Monday as 0 and Sunday as 6

	    :param day_of_the_week: the `day_of_the_week` column.
	    :return: `pd.get_dummies` output with columns named "day_of_the_week_<code>".
	    """
	    return pd.get_dummies(day_of_the_week, prefix="day_of_the_week")