import numpy as np
import scipy.stats as stats
from sklearn.metrics import get_scorer

from hamilton.function_modifiers import config


def val_score(scorer_name: str, y_validation: np.ndarray, val_pred: np.ndarray) -> float:
    """Compute the scikit-learn metric `scorer_name` on the validation predictions."""
    scorer = get_scorer(scorer_name)
    # `_score_func` is the scorer's underlying metric function (a private sklearn attribute);
    # `np.abs` guards against "neg_*" sign conventions so the reported score is non-negative.
    score = np.abs(scorer._score_func(y_validation, val_pred))
    return score
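
# Illustrative example (not executed): val_score("accuracy", y_val, preds) evaluates
# sklearn.metrics.accuracy_score(y_val, preds), since that is the underlying
# `_score_func` of the "accuracy" scorer.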


def bootstrap_metric_sample(
    scorer_name: str, y_validation: np.ndarray, bootstrap_iter: int = 1000
) -> np.ndarray:
    """Bootstrap the `scorer_name` metric over `bootstrap_iter` iterations,
    drawing random predictions from the empirical label distribution of `y_validation`.
    """
    scorer = get_scorer(scorer_name)

    n_examples = y_validation.shape[0]
    unique_val, count_val = np.unique(y_validation, return_counts=True)

    scores = []
    for _ in range(bootstrap_iter):
        # draw random "predictions" with the same label frequencies as the validation set
        random_draw = np.random.choice(unique_val, n_examples, p=count_val / count_val.sum())

        score = np.abs(scorer._score_func(y_validation, random_draw))
        scores.append(score)

    return np.asarray(scores)


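# The bootstrap sample above acts as a null distribution of the score: what a random
# predictor matching the validation label frequencies would achieve. The one-sample
# t-test below checks whether the model's validation score sits significantly outside
# that distribution.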
def statistical_ttest_one_sample(
    bootstrap_metric_sample: np.ndarray,
    val_score: float,
    higher_is_better: bool = True,
) -> dict:
    """One-sample t-test of the bootstrap score distribution against the model's
    validation score. With `higher_is_better=True` (the default, since scikit-learn
    scores follow a 'higher is better' convention), the alternative hypothesis is
    that the bootstrap mean is *less* than the model's score.
    """

    if higher_is_better:
        sample_hypothesis = "less"
    else:
        sample_hypothesis = "greater"

    statistic, pvalue = stats.ttest_1samp(
        bootstrap_metric_sample, val_score, alternative=sample_hypothesis
    )
    return dict(test="one_sample_ttest", stat=statistic, pvalue=pvalue)


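# Hamilton resolves the `__pearsonr` / `__kendalltau` suffixes below to a single node
# named `statistical_association`; which implementation is active depends on the `task`
# value passed in the driver config (see the usage sketch at the bottom of this module).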
@config.when_in(task=["continuous_regression", "binary_classification"])
def statistical_association__pearsonr(y_validation: np.ndarray, val_pred: np.ndarray) -> dict:
    """Measure the statistical association between true labels and predicted labels."""

    # for the binary case, Cramér's V is equivalent to Pearson's r (the phi coefficient)
    # ref: https://blog.zenggyu.com/en/posts/2019-06-11-a-brief-introduction-to-various-correlation-coefficients-and-when-to-use-which/index.html
    statistic, pvalue = stats.pearsonr(y_validation, val_pred)

    return dict(
        test="pearsonr",
        stat=statistic,
        pvalue=pvalue,
    )


@config.when(task="ordinal_regression")
def statistical_association__kendalltau(y_validation: np.ndarray, val_pred: np.ndarray) -> dict:
    """Measure the statistical association between true labels and predicted labels."""

    statistic, pvalue = stats.kendalltau(y_validation, val_pred)

    return dict(
        test="kendalltau",
        stat=statistic,
        pvalue=pvalue,
    )


def model_results(
    task: str,
    label: str,
    scorer_name: str,
    bootstrap_iter: int,
    val_score: float,
    statistical_ttest_one_sample: dict,
    statistical_association: dict,
) -> dict:
    """Collect key metrics and results into a single JSON-serializable dict."""
    return {
        "task": task,
        "label": label,
        "scorer_name": scorer_name,
        "validation_score": val_score,
        "significance": statistical_ttest_one_sample,
        "association": statistical_association,
        "bootstrap_iter": bootstrap_iter,
    }
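

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original example: how this module could be
    # wired into a Hamilton driver. The input names and values below (label, scorer_name,
    # synthetic arrays) are illustrative assumptions; adapt them to your project.
    import sys

    from hamilton import driver

    this_module = sys.modules[__name__]
    # `task` is a config value: it selects which `statistical_association__*` variant runs.
    dr = driver.Driver({"task": "binary_classification"}, this_module)

    rng = np.random.default_rng(0)
    y_validation = rng.integers(0, 2, size=200)
    val_pred = rng.integers(0, 2, size=200)

    results = dr.execute(
        ["model_results"],
        inputs={
            "label": "example_label",
            "scorer_name": "accuracy",
            "bootstrap_iter": 200,
            "y_validation": y_validation,
            "val_pred": val_pred,
        },
    )
    print(results["model_results"])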