Source code for mlproject.postprocess.t_test

"""
Implementation of the Nadeau & Bengio corrected resampled paired t-test with varying fold sizes.
"""

from math import sqrt
from statistics import mean, stdev

from scipy.stats import t


def corrected_resampled_ttest(
    scores_model_a: list[float],
    scores_model_b: list[float],
    n_train_list: list[int],
    n_test_list: list[int],
    alpha: float = 0.05,
    alternative: str = "two-sided",
) -> dict:
    """
    Nadeau & Bengio corrected resampled paired t-test with varying fold sizes.

    This is the same idea as the standard corrected resampled t-test, but
    instead of a single n_test/n_train ratio, it uses the *average* ratio
    across splits:

        r_bar = (1/m) * sum_i (n_test_i / n_train_i)

    Then:

        Var(d_bar) ≈ (1/m + r_bar) * s^2

    Parameters
    ----------
    scores_model_a : list[float]
        List of test errors (e.g., MAE) for model A across splits.
    scores_model_b : list[float]
        List of test errors (e.g., MAE) for model B across splits.
    n_train_list : list[int]
        List of training set sizes for each split.
    n_test_list : list[int]
        List of test set sizes for each split.
    alpha : float, optional
        Significance level for the test (default is 0.05).
    alternative : str, optional
        The alternative hypothesis to test. Options are "two-sided",
        "greater", or "less" (default is "two-sided").

    Returns
    -------
    dict
        A dict containing the t-statistic, degrees of freedom, critical
        value, p-value, and average test/train ratio across splits, with
        the following keys:

        * t_stat: t-statistic value for the test
        * df: degrees of freedom
        * critical_value: critical value for the given alpha and alternative hypothesis
        * p_value: p-value as per the specified alternative hypothesis
        * r_bar: average test/train ratio across splits

    References
    ----------
    Nadeau, C., Bengio, Y. Inference for the Generalization Error.
    Machine Learning 52, 239–281 (2003).
    https://doi.org/10.1023/A:1024068626366
    """
    if len(scores_model_a) != len(scores_model_b):
        raise ValueError("scores_model_a and scores_model_b must have the same length.")
    if len(scores_model_a) != len(n_train_list) or len(scores_model_a) != len(n_test_list):
        raise ValueError("Fold size lists must match the number of splits.")

    m = len(scores_model_a)
    if m < 2:
        raise ValueError("Need at least 2 splits.")

    # Per-split score differences and their mean / standard deviation.
    diffs = [a - b for a, b in zip(scores_model_a, scores_model_b)]
    d_bar = mean(diffs)
    sd = stdev(diffs)

    # Average test/train ratio across splits.
    ratios = [nt / ntr for nt, ntr in zip(n_test_list, n_train_list)]
    r_bar = sum(ratios) / m

    if sd == 0.0:
        # Degenerate case: all split differences are identical, so the
        # corrected standard error is zero and the t-statistic is 0 or ±inf.
        t_stat = 0.0 if d_bar == 0.0 else float("inf") * (1.0 if d_bar > 0 else -1.0)
        df = m - 1
        if alternative == "two-sided":
            critical_value = t.ppf(1.0 - alpha / 2.0, df)
            p_value = 1.0 if d_bar == 0.0 else 0.0
        elif alternative == "greater":
            critical_value = t.ppf(1.0 - alpha, df)
            p_value = 1.0 if d_bar <= 0.0 else 0.0
        else:
            critical_value = t.ppf(alpha, df)
            p_value = 1.0 if d_bar >= 0.0 else 0.0
        return {
            "t_stat": t_stat,
            "df": df,
            "critical_value": critical_value,
            "p_value": p_value,
            "r_bar": r_bar,
        }

    # Corrected standard error: sqrt(1/m + r_bar) * s.
    se = sqrt((1.0 / m) + r_bar) * sd
    t_stat = d_bar / se
    df = m - 1

    if alternative == "two-sided":
        critical_value = t.ppf(1.0 - alpha / 2.0, df)
        p_value = 2.0 * t.sf(abs(t_stat), df)
    elif alternative == "greater":
        critical_value = t.ppf(1.0 - alpha, df)
        p_value = t.sf(t_stat, df)
    else:
        critical_value = t.ppf(alpha, df)
        p_value = t.cdf(t_stat, df)

    return {
        "t_stat": t_stat,
        "df": df,
        "critical_value": critical_value,
        "p_value": p_value,
        "r_bar": r_bar,
    }
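

A minimal usage sketch for the function above. The per-split errors and fold sizes are hypothetical values invented purely for illustration; only the import path mlproject.postprocess.t_test comes from this module.

from mlproject.postprocess.t_test import corrected_resampled_ttest

# Hypothetical test errors (e.g., MAE) for two models over 5 resampled splits.
scores_a = [0.92, 1.05, 0.88, 1.10, 0.97]
scores_b = [1.01, 1.12, 0.95, 1.18, 1.02]

# Hypothetical train/test sizes for each split (roughly 80/20).
n_train = [800, 850, 780, 820, 810]
n_test = [200, 150, 220, 180, 190]

result = corrected_resampled_ttest(
    scores_a, scores_b, n_train, n_test, alpha=0.05, alternative="two-sided"
)
print(result["t_stat"], result["df"], result["critical_value"],
      result["p_value"], result["r_bar"])

Because the correction inflates the standard error by the average test/train ratio, the resulting test is more conservative than a naive paired t-test on the same per-split differences.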