| import numpy as np |
| import scipy.stats as stats |
| from sklearn.metrics import get_scorer |
| |
| from hamilton.function_modifiers import config |
| |
| |
def val_score(scorer_name: str, y_validation: np.ndarray, val_pred: np.ndarray) -> float:
    """Compute the scikit-learn metric `scorer_name` for the validation predictions."""
    scorer = get_scorer(scorer_name)
    # `_score_func` is a private scikit-learn attribute that exposes the underlying
    # metric function, bypassing the scorer's sign convention (e.g. the negation
    # applied by "neg_*" scorers); `np.abs` keeps the reported score positive
    score = np.abs(scorer._score_func(y_validation, val_pred))
    return score
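
# A minimal usage sketch with hypothetical toy arrays (illustrative only):
#
#   val_score("accuracy", np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]))
#   # -> 0.75, since 3 of the 4 predictions match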
| |
| |
| def bootstrap_metric_sample( |
| scorer_name: str, y_validation: np.ndarray, bootstrap_iter: int = 1000 |
| ) -> np.ndarray: |
| """Bootstrap the `scorer_name` metric for a number of `bootstrap_iter` iterations |
| based on the empirical label distribution of `y_validation` |
| """ |
| scorer = get_scorer(scorer_name) |
| |
| n_examples = y_validation.shape[0] |
| unique_val, count_val = np.unique(y_validation, return_counts=True) |
| |
| scores = [] |
| for _ in range(bootstrap_iter): |
        # draw `n_examples` fake predictions, weighted by the observed label frequencies
        random_draw = np.random.choice(unique_val, n_examples, p=count_val / count_val.sum())

        score = np.abs(scorer._score_func(y_validation, random_draw))
        scores.append(score)
| |
| return np.asarray(scores) |
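
# A hedged usage sketch (`y_val` is an illustrative array, not pipeline data):
#
#   null_scores = bootstrap_metric_sample("accuracy", y_val, bootstrap_iter=500)
#   null_scores.shape  # -> (500,): one score per random draw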
| |
| |
| def statistical_ttest_one_sample( |
| bootstrap_metric_sample: np.ndarray, |
| val_score: float, |
| higher_is_better: bool = True, |
| ) -> dict: |
| """Since we are dealing with scores, the model metric is always 'higher is better'""" |
| |
| if higher_is_better: |
| sample_hypothesis = "less" |
| else: |
| sample_hypothesis = "greater" |
| |
| statistic, pvalue = stats.ttest_1samp( |
| bootstrap_metric_sample, val_score, alternative=sample_hypothesis |
| ) |
| return dict(test="one_sample_ttest", stat=statistic, pvalue=pvalue) |
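
# Interpretation sketch, assuming higher_is_better=True and a 0.05 significance
# threshold (the names and the threshold below are illustrative):
#
#   result = statistical_ttest_one_sample(null_scores, observed_score)
#   significant = result["pvalue"] < 0.05  # model beats the random baseline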
| |
| |
| @config.when_in(task=["continuous_regression", "binary_classification"]) |
| def statistical_association__pearsonr(y_validation: np.ndarray, val_pred: np.ndarray) -> dict: |
| """measure the statistical association between true labels and predicted labels""" |
| |
| # for the binary case, Cramer's V is equivalent to Pearson's R (phi coefficient estimation) |
| # ref: https://blog.zenggyu.com/en/posts/2019-06-11-a-brief-introduction-to-various-correlation-coefficients-and-when-to-use-which/index.html |
| statistic, pvalue = stats.pearsonr(y_validation, val_pred) |
| |
| return dict( |
| test="pearsonr", |
| stat=statistic, |
| pvalue=pvalue, |
| ) |
| |
| |
| @config.when(task="ordinal_regression") |
def statistical_association__kendalltau(y_validation: np.ndarray, val_pred: np.ndarray) -> dict:
    """Measure the statistical association between the true and predicted labels."""
| |
| statistic, pvalue = stats.kendalltau(y_validation, val_pred) |
| |
| return dict( |
| test="kendalltau", |
| stat=statistic, |
| pvalue=pvalue, |
| ) |
| |
| |
| def model_results( |
| task: str, |
| label: str, |
| scorer_name: str, |
| bootstrap_iter: int, |
| val_score: float, |
| statistical_ttest_one_sample: dict, |
| statistical_association: dict, |
| ) -> dict: |
| """Collect key metrics and results into a single JSON serializable file""" |
| return { |
| "task": task, |
| "label": label, |
| "scorer_name": scorer_name, |
| "validation_score": val_score, |
| "significance": statistical_ttest_one_sample, |
| "association": statistical_association, |
| "bootstrap_iter": bootstrap_iter, |
| } |