blob: 69de47762d7d6ad93dd082d6e9615c6d47fb538d [file] [log] [blame]
"""
This is a module that contains our "model fitting and related" transforms.
"""
import pickle
from typing import Dict
import numpy as np
import pandas as pd
from sklearn import base, linear_model, metrics, model_selection
from hamilton.function_modifiers import config, extract_fields
def target_column_name() -> str:
"""What column do we assume in the data set to be the target?"""
return "target"
def model_classifier(random_state: int) -> base.ClassifierMixin:
"""Creates an unfitted LR model object.
:param random_state:
:return:
"""
lr = linear_model.LogisticRegression(random_state=random_state)
return lr
@extract_fields({"train_set": pd.DataFrame, "test_set": pd.DataFrame})
def train_test_split(
data_set: pd.DataFrame, target: pd.Series, test_size: float
) -> Dict[str, pd.DataFrame]:
"""Splits the dataset into train & test.
:param data_set: the dataset with all features already computed
:param target: the target column. Used to stratify the training & test sets.
:param test_size: the size of the test set to produce.
:return:
"""
train, test = model_selection.train_test_split(data_set, stratify=target, test_size=test_size)
return {"train_set": train, "test_set": test}
@config.when(model_to_use="create_new")
def fit_model__create_new(
model_classifier: base.ClassifierMixin, train_set: pd.DataFrame, target_column_name: str
) -> base.ClassifierMixin:
"""Fits a new model.
:param model_classifier:
:param train_set:
:return:
"""
feature_cols = [c for c in train_set.columns if c != target_column_name]
model_classifier.fit(train_set[feature_cols], train_set[target_column_name])
return model_classifier
@config.when(model_to_use="use_existing")
def fit_model__use_existing(model_path: str) -> base.ClassifierMixin:
with open(model_path, "rb") as f:
return pickle.load(f)
def y_train_estimation(
fit_model: base.ClassifierMixin, train_set: pd.DataFrame, target_column_name: str
) -> np.ndarray:
feature_cols = [c for c in train_set.columns if c != target_column_name]
return fit_model.predict(train_set[feature_cols])
def y_train(train_set: pd.DataFrame, target_column_name: str) -> pd.Series:
return train_set[target_column_name]
def cm_train(y_train: pd.Series, y_train_estimation: np.ndarray) -> np.ndarray:
return metrics.confusion_matrix(y_train, y_train_estimation)
def y_test_estimation(
fit_model: base.ClassifierMixin, test_set: pd.DataFrame, target_column_name: str
) -> np.ndarray:
feature_cols = [c for c in test_set.columns if c != target_column_name]
return fit_model.predict(test_set[feature_cols])
def y_test(test_set: pd.DataFrame, target_column_name: str) -> pd.Series:
return test_set[target_column_name]
def cm_test(y_test: pd.Series, y_test_estimation: np.ndarray) -> np.ndarray:
return metrics.confusion_matrix(y_test, y_test_estimation)
def model_predict(fit_model: base.ClassifierMixin, inference_set: pd.DataFrame) -> np.ndarray:
return fit_model.predict(inference_set)