# blob: 77843631236a75bb40464314d488f2cbab2ad3bd [file] [log] [blame]
"""This module contains basic code for model fitting."""
from typing import Dict
import numpy as np
import pandas as pd
from sklearn import base, linear_model, metrics, svm
from sklearn.model_selection import train_test_split
from hamilton import function_modifiers
@function_modifiers.config.when(clf="svm")
def prefit_clf__svm(gamma: float = 0.001) -> base.ClassifierMixin:
    """Builds the SVM classifier before any fitting has happened.

    Registered under ``config.when(clf="svm")``.

    :param gamma: value forwarded to ``svm.SVC`` as its ``gamma`` argument.
    :return: a newly constructed ``svm.SVC`` instance; ``fit`` is not called here.
    """
    unfitted_svc = svm.SVC(gamma=gamma)
    return unfitted_svc
@function_modifiers.config.when(clf="logistic")
def prefit_clf__logreg(penalty: str) -> base.ClassifierMixin:
    """Returns an unfitted Logistic Regression classifier object.

    Registered under ``config.when(clf="logistic")``.

    :param penalty: One of {'l1', 'l2', 'elasticnet', None}; forwarded to
        ``linear_model.LogisticRegression`` as its ``penalty`` argument.
    :return: a newly constructed ``linear_model.LogisticRegression`` instance;
        ``fit`` is not called here.
    """
    # Pass by keyword (as the SVM variant does with ``gamma``) so the value is
    # bound to ``penalty`` by name rather than by constructor argument position.
    return linear_model.LogisticRegression(penalty=penalty)
@function_modifiers.extract_fields(
    {"X_train": pd.DataFrame, "X_test": pd.DataFrame, "y_train": pd.Series, "y_test": pd.Series}
)
def train_test_split_func(
    data_set: pd.DataFrame,
    test_size_fraction: float,
    shuffle_train_test_split: bool,
) -> Dict[str, np.ndarray]:
    """Function that creates the training & test splits.

    It is then extracted out into constituent components and used downstream.

    :param data_set: data frame holding the feature columns plus a ``target_class`` column.
    :param test_size_fraction: forwarded to ``train_test_split`` as ``test_size``.
    :param shuffle_train_test_split: forwarded to ``train_test_split`` as ``shuffle``.
    :return: dict with the keys ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    :raises AssertionError: if ``data_set`` has no ``target_class`` column.
    """
    # NOTE(review): the return annotation says ``np.ndarray`` values while
    # ``extract_fields`` declares DataFrame/Series fields -- confirm which is intended.

    # Explicit raise instead of ``assert`` so the check still runs under ``python -O``;
    # the exception type and message are the same as before.
    if "target_class" not in data_set.columns:
        raise AssertionError("target_class column must be present in the data set")

    # Every column except the label column is treated as a feature.
    feature_columns = [col for col in data_set.columns if col != "target_class"]
    feature_set = data_set[feature_columns]
    target_class = data_set["target_class"]

    X_train, X_test, y_train, y_test = train_test_split(
        feature_set,
        target_class,
        test_size=test_size_fraction,
        shuffle=shuffle_train_test_split,
    )
    return {"X_train": X_train, "X_test": X_test, "y_train": y_train, "y_test": y_test}
def fit_clf(
    prefit_clf: base.ClassifierMixin, X_train: pd.DataFrame, y_train: pd.Series
) -> base.ClassifierMixin:
    """Fits the given classifier on the training data.

    ``fit`` is called on the object that was passed in, and that same object
    is handed back.

    :param prefit_clf: classifier object to call ``fit`` on.
    :param X_train: training features passed to ``fit``.
    :param y_train: training labels passed to ``fit``.
    :return: the ``prefit_clf`` object after ``fit`` has been called on it.
    """
    classifier = prefit_clf
    classifier.fit(X_train, y_train)
    return classifier
def training_accuracy(
    fit_clf: base.ClassifierMixin, X_train: pd.DataFrame, y_train: pd.Series
) -> float:
    """Returns accuracy on the training set.

    :param fit_clf: classifier whose ``predict`` method is called on ``X_train``.
    :param X_train: training features to predict on.
    :param y_train: training labels the predictions are compared against.
    :return: the value computed by ``metrics.accuracy_score``.
    """
    predictions = fit_clf.predict(X_train)
    # accuracy_score is documented as (y_true, y_pred): labels first, predictions second.
    return metrics.accuracy_score(y_train, predictions)
def testing_accuracy(
    fit_clf: base.ClassifierMixin, X_test: pd.DataFrame, y_test: pd.Series
) -> float:
    """Returns accuracy on the test set.

    :param fit_clf: classifier whose ``predict`` method is called on ``X_test``.
    :param X_test: test features to predict on.
    :param y_test: test labels the predictions are compared against.
    :return: the value computed by ``metrics.accuracy_score``.
    """
    predictions = fit_clf.predict(X_test)
    # accuracy_score is documented as (y_true, y_pred): labels first, predictions second.
    return metrics.accuracy_score(y_test, predictions)