| """ |
| This is a module that contains our feature transforms. |
| """ |
| import pickle |
| from typing import Set |
| |
| import numpy as np |
| import pandas as pd |
| |
| # from sklearn.preprocessing import OneHotEncoder |
| from sklearn import impute # import KNNImputer |
| from sklearn import preprocessing |
| |
| from hamilton.function_modifiers import check_output, config |
| |
| |
def rare_titles() -> Set[str]:
    """Curated set of honorifics that occur rarely in the data.

    :return: set of title strings considered "rare".
    """
    curated = (
        "Capt",
        "Col",
        "Don",
        "Dona",
        "Dr",
        "Jonkheer",
        "Lady",
        "Major",
        "Mlle",
        "Mme",
        "Ms",
        "Rev",
        "Sir",
        "the Countess",
    )
    return set(curated)
| |
| |
# np.int was deprecated in NumPy 1.20 and removed in 1.24; np.int64 matches
# the integer dtype pandas produces for this sum.
@check_output(data_type=np.int64)
def family_size(parch: pd.Series, sibsp: pd.Series) -> pd.Series:
    """Number of family members aboard: parents/children + siblings/spouses.

    NOTE(review): this sum is 0 for a passenger travelling alone, yet
    ``is_alone`` below tests ``family_size == 1`` — confirm whether a ``+ 1``
    (counting the passenger themselves) was intended.

    :param parch: number of parents/children aboard.
    :param sibsp: number of siblings/spouses aboard.
    :return: element-wise sum of the two series.
    """
    return parch + sibsp
| |
| |
def normalized_name(name: pd.Series) -> pd.Series:
    """Extract the honorific from each full name.

    Despite the function's name, this pulls out the title (e.g. "Mr"),
    assuming names are formatted as "<surname>, <title>. <given names>".

    :param name: series of full passenger names.
    :return: series of honorific strings.
    """

    def _honorific(full_name: str) -> str:
        after_comma = full_name.split(",")[1]
        return after_comma.split(".")[0].strip()

    return name.apply(_honorific)
| |
| |
def title(normalized_name: pd.Series, rare_titles: Set[str]) -> pd.Series:
    """Collapse infrequent honorifics into the single label "rare".

    :param normalized_name: series of honorific strings.
    :param rare_titles: honorifics to collapse.
    :return: series where rare honorifics are replaced by "rare".
    """

    def _collapse(honorific: str) -> str:
        return "rare" if honorific in rare_titles else honorific

    return normalized_name.apply(_collapse)
| |
| |
def is_alone(family_size: pd.Series) -> pd.Series:
    """0/1 indicator of whether a passenger's family size equals one.

    NOTE(review): ``family_size`` is ``parch + sibsp``, which is 0 (not 1)
    for a solo passenger — confirm this comparison is against the intended
    value.

    :param family_size: series of family-size counts.
    :return: integer series of 1 where family_size == 1, else 0.
    """
    alone_mask = family_size == 1
    return alone_mask.astype(int)
| |
| |
def one_hot_encoder() -> preprocessing.OneHotEncoder:
    """Create an (unfitted) one-hot encoder producing dense output.

    The ``sparse`` keyword was deprecated in scikit-learn 1.2 and removed in
    1.4 in favor of ``sparse_output``; try the modern name first and fall
    back so this works across sklearn versions.

    :return: an unfitted OneHotEncoder that ignores unknown categories.
    """
    try:
        return preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 only knows the old keyword.
        return preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
| |
| |
@config.when(model_to_use="create_new")
def fit_categorical_encoder__create_new(
    one_hot_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> preprocessing.OneHotEncoder:
    """Fit the one-hot encoder on the categorical columns.

    Active when the dataflow config has ``model_to_use == "create_new"``.

    :param one_hot_encoder: unfitted encoder to fit.
    :param embarked: categorical feature column.
    :param sex: categorical feature column.
    :param pclass: categorical feature column.
    :param title: categorical feature column.
    :param is_alone: categorical feature column.
    :return: the encoder, fitted in place.
    """
    categorical_columns = [embarked, sex, pclass, title, is_alone]
    training_frame = pd.concat(categorical_columns, axis=1)
    one_hot_encoder.fit(training_frame)
    return one_hot_encoder
| |
| |
@config.when(model_to_use="use_existing")
def fit_categorical_encoder__use_existing(
    categorical_encoder_path: str,
) -> preprocessing.OneHotEncoder:
    """Load a previously fitted categorical encoder from disk.

    Active when the dataflow config has ``model_to_use == "use_existing"``.

    NOTE(review): ``pickle.load`` executes arbitrary code — only point this
    at trusted artifact paths.

    :param categorical_encoder_path: filesystem path to the pickled encoder.
    :return: the unpickled, fitted encoder.
    """
    with open(categorical_encoder_path, "rb") as encoder_file:
        encoder = pickle.load(encoder_file)
    return encoder
| |
| |
def categorical_df(
    fit_categorical_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of one-hot-encoded categorical features.

    The number of output columns depends on how many categories the fitted
    encoder saw.

    :param fit_categorical_encoder: a fitted one-hot encoder.
    :param embarked: categorical feature column.
    :param sex: categorical feature column.
    :param pclass: categorical feature column.
    :param title: categorical feature column.
    :param is_alone: categorical feature column.
    :return: dataframe with columns named ``categorical_<i>``, indexed like
        the inputs.
    """
    raw_frame = pd.concat([embarked, sex, pclass, title, is_alone], axis=1)
    encoded = fit_categorical_encoder.transform(raw_frame)
    result = pd.DataFrame(encoded, index=embarked.index)
    result.columns = [f"categorical_{col}" for col in result.columns]
    return result
| |
| |
def knn_imputer(n_neighbors: int = 5) -> impute.KNNImputer:
    """Create an (unfitted) KNN imputer for the numeric columns.

    :param n_neighbors: number of neighbors to use for imputation.
    :return: an unfitted KNNImputer.
    """
    imputer = impute.KNNImputer(n_neighbors=n_neighbors)
    return imputer
| |
| |
@config.when(model_to_use="create_new")
def fit_knn_imputer__create_new(
    knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> impute.KNNImputer:
    """Fit the KNN imputer on the numeric columns.

    Active when the dataflow config has ``model_to_use == "create_new"``.

    :param knn_imputer: unfitted imputer to fit.
    :param age: numeric feature column.
    :param fare: numeric feature column.
    :param family_size: numeric feature column.
    :return: the imputer, fitted in place.
    """
    numeric_frame = pd.concat([age, fare, family_size], axis=1)
    knn_imputer.fit(numeric_frame)
    return knn_imputer
| |
| |
@config.when(model_to_use="use_existing")
def fit_knn_imputer__use_existing(knn_imputer_path: str) -> impute.KNNImputer:
    """Load a previously fitted KNN imputer from disk.

    Active when the dataflow config has ``model_to_use == "use_existing"``.

    NOTE(review): ``pickle.load`` executes arbitrary code — only point this
    at trusted artifact paths.

    :param knn_imputer_path: filesystem path to the pickled imputer.
    :return: the unpickled, fitted imputer.
    """
    with open(knn_imputer_path, "rb") as imputer_file:
        imputer = pickle.load(imputer_file)
    return imputer
| |
| |
def knn_imputed_df(
    fit_knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of KNN-imputed numeric features.

    :param fit_knn_imputer: a fitted KNN imputer.
    :param age: numeric feature column.
    :param fare: numeric feature column.
    :param family_size: numeric feature column.
    :return: dataframe with columns named ``knn_imputed_<i>``, indexed like
        the inputs.
    """
    numeric_frame = pd.concat([age, fare, family_size], axis=1)
    imputed_values = fit_knn_imputer.transform(numeric_frame)
    result = pd.DataFrame(imputed_values, index=age.index)
    result.columns = [f"knn_imputed_{col}" for col in result.columns]
    return result
| |
| |
def robust_scaler() -> preprocessing.RobustScaler:
    """Create an (unfitted) robust scaler for the numeric features.

    :return: an unfitted RobustScaler with default settings.
    """
    scaler = preprocessing.RobustScaler()
    return scaler
| |
| |
@config.when(model_to_use="create_new")
def fit_scaler__create_new(
    robust_scaler: preprocessing.RobustScaler, knn_imputed_df: pd.DataFrame
) -> preprocessing.RobustScaler:
    """Fit the robust scaler on the imputed numeric features.

    Active when the dataflow config has ``model_to_use == "create_new"``.

    :param robust_scaler: unfitted scaler to fit.
    :param knn_imputed_df: imputed numeric feature dataframe.
    :return: the scaler, fitted in place (``fit`` returns ``self``).
    """
    return robust_scaler.fit(knn_imputed_df)
| |
| |
@config.when(model_to_use="use_existing")
def fit_scaler__use_existing(scaler_path: str) -> preprocessing.RobustScaler:
    """Load a previously fitted scaler from disk.

    Active when the dataflow config has ``model_to_use == "use_existing"``.

    NOTE(review): ``pickle.load`` executes arbitrary code — only point this
    at trusted artifact paths.

    :param scaler_path: filesystem path to the pickled scaler.
    :return: the unpickled, fitted scaler.
    """
    with open(scaler_path, "rb") as scaler_file:
        scaler = pickle.load(scaler_file)
    return scaler
| |
| |
def scaled_numeric_df(
    fit_scaler: preprocessing.RobustScaler, knn_imputed_df: pd.DataFrame
) -> pd.DataFrame:
    """Build the dataframe of scaled numeric features.

    :param fit_scaler: a fitted robust scaler.
    :param knn_imputed_df: imputed numeric feature dataframe.
    :return: dataframe with columns named ``scaled_numeric_<i>``, indexed
        like ``knn_imputed_df``.
    """
    scaled_values = fit_scaler.transform(knn_imputed_df)
    result = pd.DataFrame(scaled_values, index=knn_imputed_df.index)
    result.columns = [f"scaled_numeric_{col}" for col in result.columns]
    return result
| |
| |
def data_set(
    scaled_numeric_df: pd.DataFrame, categorical_df: pd.DataFrame, target: pd.Series
) -> pd.DataFrame:
    """Assemble the training dataset: features plus target, column-wise.

    Following what was in the original code, this is how the features are
    stuck together.

    :param scaled_numeric_df: scaled numeric feature dataframe.
    :param categorical_df: one-hot categorical feature dataframe.
    :param target: the target/label series.
    :return: single dataframe with all feature columns followed by the target.
    """
    pieces = [scaled_numeric_df, categorical_df, target]
    return pd.concat(pieces, axis=1)
| |
| |
def inference_set(scaled_numeric_df: pd.DataFrame, categorical_df: pd.DataFrame) -> pd.DataFrame:
    """Assemble the inference dataset: all features, no target column.

    :param scaled_numeric_df: scaled numeric feature dataframe.
    :param categorical_df: one-hot categorical feature dataframe.
    :return: single dataframe with all feature columns, column-wise.
    """
    pieces = [scaled_numeric_df, categorical_df]
    return pd.concat(pieces, axis=1)