blob: 788c875f47bbc5765c3c29a91f0f8c387e17e8ad [file] [log] [blame]
"""
This is a module that contains our feature transforms.
"""
from typing import Dict, List, Optional, Set

import pandas as pd
from hamilton.function_modifiers import extract_columns, extract_fields
from sklearn import impute, model_selection, preprocessing  # import KNNImputer
def _sanitize_columns(
    df_columns: List[str],
) -> List[str]:
    """Normalise raw column labels into lowercase, underscore-separated names.

    Each label has surrounding whitespace stripped, every "/" replaced with
    "_per_", every space replaced with "_", and is finally lower-cased.

    :param df_columns: the current column names
    :return: the sanitized column names, in the same order as the input
    """
    sanitized = []
    for raw_label in df_columns:
        label = raw_label.strip()
        label = label.replace("/", "_per_")
        label = label.replace(" ", "_")
        sanitized.append(label.lower())
    return sanitized
@extract_columns("pclass", "sex", "age", "parch", "sibsp", "fare", "embarked", "name", "survived")
def passengers_df(titanic_data: pd.DataFrame) -> pd.DataFrame:
    """Prepare the raw passenger data: index it by "pid" and sanitize column names.

    The incoming frame is copied first, so the caller's dataframe is not modified.
    The columns named in the ``extract_columns`` decorator are expected to be
    present in the result after sanitization.

    :param titanic_data: the raw dataset we want to bring in.
    :return: a new dataframe indexed by "pid" (the values of the original index)
        with sanitized column names.
    """
    working_copy = titanic_data.copy()
    # Materialise the existing index as a column so it can become a named index.
    working_copy["pid"] = working_copy.index
    indexed_df = working_copy.set_index("pid")  # set_index returns a new DF.
    indexed_df.columns = _sanitize_columns(indexed_df.columns)
    return indexed_df
def rare_titles() -> Set[str]:
    """Return the curated set of honorifics that are treated as rare.

    :return: a new set containing the rare title strings.
    """
    curated = (
        "Capt",
        "Col",
        "Don",
        "Dona",
        "Dr",
        "Jonkheer",
        "Lady",
        "Major",
        "Mlle",
        "Mme",
        "Ms",
        "Rev",
        "Sir",
        "the Countess",
    )
    return set(curated)
def family_size(parch: pd.Series, sibsp: pd.Series) -> pd.Series:
    """Compute the size of each passenger's family group, including the passenger.

    The passenger is counted (the ``+ 1``) so that a passenger travelling with no
    relatives has a family size of 1. The ``is_alone`` feature in this module
    flags ``family_size == 1``; without counting the passenger, solo travellers
    would have size 0 and would never be flagged as alone.

    :param parch: the "parch" column of the passenger data.
    :param sibsp: the "sibsp" column of the passenger data.
    :return: element-wise ``parch + sibsp + 1``.
    """
    return parch + sibsp + 1
def normalized_name(name: pd.Series) -> pd.Series:
    """Extract the honorific (e.g. "Mr", "Mrs") from each full name.

    For each value, the text between the first comma and the following period
    is taken and stripped of surrounding whitespace. Despite the function's
    name, this yields the honorific rather than the person's name.

    :param name: series of full-name strings.
    :return: series holding the extracted piece of each name.
    """

    def _honorific(full_name):
        after_surname = full_name.split(",")[1]
        before_period = after_surname.split(".")[0]
        return before_period.strip()

    return name.apply(_honorific)
def title(normalized_name: pd.Series, rare_titles: Set[str]) -> pd.Series:
    """Collapse every rare honorific into the single bucket "rare".

    :param normalized_name: series of honorifics.
    :param rare_titles: the honorifics that should be replaced by "rare".
    :return: series where members of ``rare_titles`` are replaced by "rare" and
        all other values are passed through unchanged.
    """

    def _bucket(honorific):
        if honorific in rare_titles:
            return "rare"
        return honorific

    return normalized_name.apply(_bucket)
def is_alone(family_size: pd.Series) -> pd.Series:
    """Flag passengers whose family size is exactly 1.

    NOTE(review): this assumes ``family_size`` counts the passenger themself,
    so that a value of 1 means "no relatives aboard" -- confirm against the
    definition of ``family_size``.

    :param family_size: series of family sizes.
    :return: integer series holding 1 where ``family_size == 1`` and 0 elsewhere.
    """
    size_is_one = family_size == 1
    return size_is_one.astype(int)
def one_hot_encoder() -> preprocessing.OneHotEncoder:
    """Create the unfitted one-hot encoder used for the categorical features.

    The encoder ignores categories unseen during fitting and produces dense
    (non-sparse) output.

    scikit-learn 1.2 renamed the ``sparse`` constructor argument to
    ``sparse_output`` and 1.4 removed the old name, so the new keyword is tried
    first and the legacy one is used only when the installed version rejects it.

    :return: a new, unfitted ``OneHotEncoder``.
    """
    try:
        return preprocessing.OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        # scikit-learn < 1.2 does not know ``sparse_output``; use the legacy name.
        return preprocessing.OneHotEncoder(handle_unknown="ignore", sparse=False)
def fit_categorical_encoder(
    one_hot_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> preprocessing.OneHotEncoder:
    """Fit the one-hot encoder on the categorical feature columns.

    The encoder passed in is fitted in place and the same object is returned.

    :param one_hot_encoder: the encoder to fit.
    :param embarked: the "embarked" feature.
    :param sex: the "sex" feature.
    :param pclass: the "pclass" feature.
    :param title: the "title" feature.
    :param is_alone: the "is_alone" feature.
    :return: the fitted encoder.
    """
    feature_columns = [embarked, sex, pclass, title, is_alone]
    training_frame = pd.concat(feature_columns, axis="columns")
    # Cast labels to str so every column label has the same type
    # (presumably some input series are unnamed -- TODO confirm).
    training_frame.columns = training_frame.columns.astype(str)
    one_hot_encoder.fit(training_frame)
    return one_hot_encoder
def categorical_df(
    fit_categorical_encoder: preprocessing.OneHotEncoder,
    embarked: pd.Series,
    sex: pd.Series,
    pclass: pd.Series,
    title: pd.Series,
    is_alone: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of one-hot encoded categorical features.

    The number of output columns depends on the number of categories the
    encoder produces. Output columns are named ``categorical_<label>`` and the
    frame is given the index of ``embarked``.

    :param fit_categorical_encoder: the already fitted encoder.
    :param embarked: the "embarked" feature.
    :param sex: the "sex" feature.
    :param pclass: the "pclass" feature.
    :param title: the "title" feature.
    :param is_alone: the "is_alone" feature.
    :return: dataframe of encoded categorical features.
    """
    feature_columns = [embarked, sex, pclass, title, is_alone]
    raw_frame = pd.concat(feature_columns, axis="columns")
    # Same label normalisation as is applied before fitting the encoder.
    raw_frame.columns = raw_frame.columns.astype(str)
    encoded_values = fit_categorical_encoder.transform(raw_frame)
    encoded_frame = pd.DataFrame(encoded_values)
    encoded_frame.columns = [f"categorical_{label}" for label in encoded_frame.columns]
    # Restore the passenger index, taken from one of the input series.
    encoded_frame.index = embarked.index
    return encoded_frame
def knn_imputer(n_neighbors: int = 5) -> impute.KNNImputer:
    """Create an unfitted KNN imputer.

    :param n_neighbors: forwarded to ``KNNImputer`` as ``n_neighbors``.
    :return: a new, unfitted ``KNNImputer``.
    """
    imputer = impute.KNNImputer(n_neighbors=n_neighbors)
    return imputer
def fit_knn_imputer(
    knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> impute.KNNImputer:
    """Fit the KNN imputer on the numeric feature columns.

    The imputer passed in is fitted in place and the same object is returned.

    :param knn_imputer: the imputer to fit.
    :param age: the "age" feature.
    :param fare: the "fare" feature.
    :param family_size: the "family_size" feature.
    :return: the fitted imputer.
    """
    numeric_columns = [age, fare, family_size]
    training_frame = pd.concat(numeric_columns, axis="columns")
    # Cast labels to str so every column label has the same type
    # (presumably some input series are unnamed -- TODO confirm).
    training_frame.columns = training_frame.columns.astype(str)
    knn_imputer.fit(training_frame)
    return knn_imputer
def knn_imputed_df(
    fit_knn_imputer: impute.KNNImputer,
    age: pd.Series,
    fare: pd.Series,
    family_size: pd.Series,
) -> pd.DataFrame:
    """Build the dataframe of KNN-imputed numeric features.

    Output columns are named ``knn_imputed_<label>`` and the frame is given the
    index of ``age``.

    :param fit_knn_imputer: the already fitted imputer.
    :param age: the "age" feature.
    :param fare: the "fare" feature.
    :param family_size: the "family_size" feature.
    :return: dataframe of imputed numeric features.
    """
    numeric_columns = [age, fare, family_size]
    raw_frame = pd.concat(numeric_columns, axis="columns")
    # Same label normalisation as is applied before fitting the imputer.
    raw_frame.columns = raw_frame.columns.astype(str)
    imputed_values = fit_knn_imputer.transform(raw_frame)
    imputed_frame = pd.DataFrame(imputed_values)
    imputed_frame.columns = [f"knn_imputed_{label}" for label in imputed_frame.columns]
    # Restore the passenger index, taken from one of the input series.
    imputed_frame.index = age.index
    return imputed_frame
def robust_scaler() -> preprocessing.RobustScaler:
    """Create an unfitted ``RobustScaler`` with default settings.

    :return: a new, unfitted scaler.
    """
    scaler = preprocessing.RobustScaler()
    return scaler
def fit_scaler(
    robust_scaler: preprocessing.RobustScaler,
    knn_imputed_df: pd.DataFrame,
) -> preprocessing.RobustScaler:
    """Fit the scaler on the imputed numeric features.

    The scaler passed in is fitted in place and the same object is returned.

    :param robust_scaler: the scaler to fit.
    :param knn_imputed_df: the imputed numeric features to fit on.
    :return: the fitted scaler.
    """
    training_frame = knn_imputed_df
    robust_scaler.fit(training_frame)
    return robust_scaler
def scaled_numeric_df(
    fit_scaler: preprocessing.RobustScaler,
    knn_imputed_df: pd.DataFrame,
) -> pd.DataFrame:
    """Build the dataframe of scaled numeric features.

    Output columns are named ``scaled_numeric_<label>`` and the frame keeps the
    index of ``knn_imputed_df``.

    :param fit_scaler: the already fitted scaler.
    :param knn_imputed_df: the imputed numeric features to scale.
    :return: dataframe of scaled numeric features.
    """
    scaled_values = fit_scaler.transform(knn_imputed_df)
    scaled_frame = pd.DataFrame(scaled_values)
    scaled_frame.columns = [f"scaled_numeric_{label}" for label in scaled_frame.columns]
    # Carry the input's index over to the scaled output.
    scaled_frame.index = knn_imputed_df.index
    return scaled_frame
def data_set(
    scaled_numeric_df: pd.DataFrame,
    categorical_df: pd.DataFrame,
    target: pd.Series,
) -> pd.DataFrame:
    """Assemble the full dataset: numeric features, categorical features, target.

    The three inputs are concatenated column-wise in that order.

    :param scaled_numeric_df: the scaled numeric features.
    :param categorical_df: the encoded categorical features.
    :param target: the target column.
    :return: a single dataframe holding all features followed by the target.
    """
    pieces = [scaled_numeric_df, categorical_df, target]
    return pd.concat(pieces, axis="columns")
def inference_set(scaled_numeric_df: pd.DataFrame, categorical_df: pd.DataFrame) -> pd.DataFrame:
    """Assemble the inference set: numeric then categorical features, no target.

    The two inputs are concatenated column-wise in that order.

    :param scaled_numeric_df: the scaled numeric features.
    :param categorical_df: the encoded categorical features.
    :return: a single dataframe holding all features.
    """
    pieces = [scaled_numeric_df, categorical_df]
    return pd.concat(pieces, axis="columns")
def target(survived: pd.Series) -> pd.Series:
    """Expose the ``survived`` column as the modelling target.

    The input series is left untouched; a renamed copy is returned.

    :param survived: the "survived" column.
    :return: a series with the same values and index, named "target".
    """
    return survived.rename("target")
@extract_fields({"train_set": pd.DataFrame, "test_set": pd.DataFrame})
def train_test_split(
    data_set: pd.DataFrame,
    target: pd.Series,
    test_size: float,
    random_state: Optional[int] = None,
) -> Dict[str, pd.DataFrame]:
    """Splits the dataset into train & test.

    :param data_set: the dataset with all features already computed
    :param target: the target column. Used to stratify the training & test sets.
    :param test_size: the size of the test set to produce.
    :param random_state: optional seed forwarded to scikit-learn so the split can
        be reproduced across runs. The default of ``None`` keeps the previous
        unseeded behaviour.
    :return: a dict with the training rows under "train_set" and the held-out
        rows under "test_set".
    """
    train, test = model_selection.train_test_split(
        data_set,
        stratify=target,
        test_size=test_size,
        random_state=random_state,
    )
    return {"train_set": train, "test_set": test}
def target_column_name() -> str:
    """Return the hard-coded name of the target column.

    :return: the string "target".
    """
    column_name = "target"
    return column_name