blob: 278b8af1c08155106395e91c0a9012f07b3f9fc0 [file] [log] [blame]
"""
This is a module that contains our feature transforms.
"""
import pickle
from typing import Set
import numpy as np
import pandas as pd
# from sklearn.preprocessing import OneHotEncoder
from sklearn import impute # import KNNImputer
from sklearn import preprocessing
from hamilton.function_modifiers import check_output, config
def rare_titles() -> Set[str]:
    """Curated set of honorifics considered rare in the data set."""
    curated = (
        "Capt", "Col", "Don", "Dona", "Dr", "Jonkheer", "Lady",
        "Major", "Mlle", "Mme", "Ms", "Rev", "Sir", "the Countess",
    )
    return set(curated)
@check_output(data_type=np.int64)
def family_size(parch: pd.Series, sibsp: pd.Series) -> pd.Series:
    """Count of relatives aboard: parents/children plus siblings/spouses.

    Fix: ``np.int`` was deprecated in NumPy 1.20 and removed in 1.24;
    ``np.int64`` is the dtype pandas produces for integer Series addition.

    NOTE(review): ``is_alone`` below tests ``family_size == 1``, which implies
    the passenger themselves should be counted (i.e. ``parch + sibsp + 1``) —
    confirm the intended definition before changing model behavior.

    :param parch: number of parents/children aboard per passenger.
    :param sibsp: number of siblings/spouses aboard per passenger.
    :return: element-wise sum, aligned on the shared index.
    """
    return parch + sibsp
def normalized_name(name: pd.Series) -> pd.Series:
    """Extract the honorific from names shaped like ``Last, Title. First``.

    (Despite the function's name, this yields the title, e.g. ``Mr``.)
    """
    def _honorific(full_name: str) -> str:
        # Take the text between the first comma and the following period.
        after_comma = full_name.split(",")[1]
        return after_comma.split(".")[0].strip()

    return name.apply(_honorific)
def title(normalized_name: pd.Series, rare_titles: Set[str]) -> pd.Series:
    """Collapse any honorific found in ``rare_titles`` into the bucket ``"rare"``."""
    return normalized_name.mask(normalized_name.isin(rare_titles), "rare")
def is_alone(family_size: pd.Series) -> pd.Series:
    """Indicator (1/0) for passengers whose family_size equals exactly 1."""
    return family_size.eq(1).astype(int)
def one_hot_encoder() -> preprocessing.OneHotEncoder:
    """Unfit one-hot encoder for the categorical features.

    Fix: the ``sparse`` keyword was renamed to ``sparse_output`` in
    scikit-learn 1.2 and removed in 1.4; dense output is required because
    downstream code wraps the transform result in ``pd.DataFrame``.
    ``handle_unknown="ignore"`` yields all-zero rows for categories unseen
    at fit time instead of raising.
    """
    return preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False)
@config.when(model_to_use="create_new")
def fit_categorical_encoder__create_new(
    one_hot_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> preprocessing.OneHotEncoder:
    """Fit the one-hot encoder on all categorical columns (create_new path)."""
    categorical_features = pd.concat([embarked, sex, pclass, title, is_alone], axis=1)
    # sklearn's fit() returns the estimator itself.
    return one_hot_encoder.fit(categorical_features)
@config.when(model_to_use="use_existing")
def fit_categorical_encoder__use_existing(
    categorical_encoder_path: str,
) -> preprocessing.OneHotEncoder:
    """Load a previously fit one-hot encoder from disk (use_existing path).

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load artifacts from trusted sources.
    """
    with open(categorical_encoder_path, "rb") as encoder_file:
        return pickle.load(encoder_file)
def categorical_df(
    fit_categorical_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of one-hot encoded categorical features.

    The number of output columns depends on how many categories the fit
    encoder learned; columns are named ``categorical_0 .. categorical_{k-1}``.

    :param fit_categorical_encoder: an already-fit one-hot encoder.
    :param embarked: port of embarkation.
    :param sex: passenger sex.
    :param pclass: passenger class.
    :param title: (possibly bucketed) honorific.
    :param is_alone: 1/0 alone indicator.
    :return: dense encoded dataframe, indexed like the inputs.
    """
    features = pd.concat([embarked, sex, pclass, title, is_alone], axis=1)
    encoded = fit_categorical_encoder.transform(features)
    return pd.DataFrame(
        encoded,
        index=embarked.index,
        columns=[f"categorical_{i}" for i in range(encoded.shape[1])],
    )
def knn_imputer(n_neighbors: int = 5) -> impute.KNNImputer:
    """Unfit KNN imputer for filling missing numeric values.

    :param n_neighbors: number of neighbors used for imputation (default 5).
    """
    imputer = impute.KNNImputer(n_neighbors=n_neighbors)
    return imputer
@config.when(model_to_use="create_new")
def fit_knn_imputer__create_new(
    knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> impute.KNNImputer:
    """Fit the KNN imputer on the numeric columns (create_new path)."""
    numeric_features = pd.concat([age, fare, family_size], axis=1)
    # sklearn's fit() returns the estimator itself.
    return knn_imputer.fit(numeric_features)
@config.when(model_to_use="use_existing")
def fit_knn_imputer__use_existing(knn_imputer_path: str) -> impute.KNNImputer:
    """Load a previously fit KNN imputer from disk (use_existing path).

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load artifacts from trusted sources.
    """
    with open(knn_imputer_path, "rb") as imputer_file:
        return pickle.load(imputer_file)
def knn_imputed_df(
    fit_knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of KNN-imputed numeric features.

    Columns are named ``knn_imputed_0 .. knn_imputed_{k-1}``.

    :param fit_knn_imputer: an already-fit KNN imputer.
    :param age: passenger age (may contain missing values).
    :param fare: ticket fare.
    :param family_size: family-size feature.
    :return: imputed dataframe, indexed like the inputs.
    """
    numeric_features = pd.concat([age, fare, family_size], axis=1)
    imputed = fit_knn_imputer.transform(numeric_features)
    return pd.DataFrame(
        imputed,
        index=age.index,
        columns=[f"knn_imputed_{i}" for i in range(imputed.shape[1])],
    )
def robust_scaler() -> preprocessing.RobustScaler:
    """Unfit robust scaler for the numeric features."""
    scaler = preprocessing.RobustScaler()
    return scaler
@config.when(model_to_use="create_new")
def fit_scaler__create_new(
    robust_scaler: preprocessing.RobustScaler, knn_imputed_df: pd.DataFrame
) -> preprocessing.RobustScaler:
    """Fit the robust scaler on the imputed numeric features (create_new path)."""
    # sklearn's fit() returns the estimator itself.
    return robust_scaler.fit(knn_imputed_df)
@config.when(model_to_use="use_existing")
def fit_scaler__use_existing(scaler_path: str) -> preprocessing.RobustScaler:
    """Load a previously fit scaler from disk (use_existing path).

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load artifacts from trusted sources.
    """
    with open(scaler_path, "rb") as scaler_file:
        return pickle.load(scaler_file)
def scaled_numeric_df(
    fit_scaler: preprocessing.RobustScaler, knn_imputed_df: pd.DataFrame
) -> pd.DataFrame:
    """Build the dataframe of scaled numeric features.

    Columns are named ``scaled_numeric_0 .. scaled_numeric_{k-1}``.

    :param fit_scaler: an already-fit robust scaler.
    :param knn_imputed_df: imputed numeric features to scale.
    :return: scaled dataframe, indexed like ``knn_imputed_df``.
    """
    scaled = fit_scaler.transform(knn_imputed_df)
    return pd.DataFrame(
        scaled,
        index=knn_imputed_df.index,
        columns=[f"scaled_numeric_{i}" for i in range(scaled.shape[1])],
    )
def data_set(
    scaled_numeric_df: pd.DataFrame, categorical_df: pd.DataFrame, target: pd.Series
) -> pd.DataFrame:
    """Assemble the training dataset: numeric features, categorical features, target.

    :param scaled_numeric_df: scaled numeric feature columns.
    :param categorical_df: one-hot encoded categorical feature columns.
    :param target: the label series.
    :return: one dataframe with all of the above, concatenated column-wise.
    """
    frames = [scaled_numeric_df, categorical_df, target]
    return pd.concat(frames, axis=1)
def inference_set(scaled_numeric_df: pd.DataFrame, categorical_df: pd.DataFrame) -> pd.DataFrame:
    """Assemble the inference dataset (features only, no target).

    :param scaled_numeric_df: scaled numeric feature columns.
    :param categorical_df: one-hot encoded categorical feature columns.
    :return: one dataframe with both feature groups, concatenated column-wise.
    """
    frames = [scaled_numeric_df, categorical_df]
    return pd.concat(frames, axis=1)