Source code for mlproject.postprocess.utils

"""
Misc postprocessing utility functions
"""

import os
import pickle
import numpy as np
import pandas as pd
from mlproject.postprocess.t_test import corrected_resampled_ttest


def mean_absolute_percentage_error(y_true, y_pred, threshold=1e-5) -> float:
    """
    Compute mean absolute percentage error, masked.

    Masking is for when y_true is zero (causing a divide by zero error) or
    when y_true is very small (causing a massive skewing of the absolute
    percentage error).

    **Note: THIS WILL IGNORE ALL ENTRIES WHERE y_true's MAGNITUDE IS less
    than the threshold, hence the MAPE score is not representative of all
    entries if the truth array contains entries with magnitude very close
    to 0.**

    Parameters
    ----------
    y_true : np.ndarray
        A 1-D array of true values.
    y_pred : np.ndarray
        A 1-D array of predicted values.
    threshold : float
        Entries of y_true with magnitude below this value will be ignored
        in the output.

    Returns
    -------
    float
        Mean absolute percentage error over the unmasked entries.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Keep only entries whose true value is safely away from zero
    mask = np.abs(y_true) > threshold
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    return np.mean(np.fabs((y_true - y_pred) / y_true))
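
# A minimal usage sketch for the masked MAPE (hypothetical values, not from
# this codebase): the zero entry in y_true falls below the default threshold
# and is excluded before averaging, so only the remaining two pairs
# contribute, each with an absolute percentage error of 0.1.
#
#   >>> y_true = np.array([0.0, 2.0, 4.0])
#   >>> y_pred = np.array([0.1, 1.8, 4.4])
#   >>> mean_absolute_percentage_error(y_true, y_pred)
#   0.1  (approximately, up to float rounding)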
def calculate_cohens_d_av(baseline_scores, new_scores):
    """
    Calculate Cohen's d_av between two sets of scores.

    Parameters
    ----------
    baseline_scores : list or np.ndarray
        A list or array of baseline scores.
    new_scores : list or np.ndarray
        A list or array of new scores.

    Returns
    -------
    d_av : float
        Cohen's d_av value.
    """
    # Convert to numpy arrays if not already
    s1 = np.array(baseline_scores)
    s2 = np.array(new_scores)
    # 1. Calculate means
    mu1, mu2 = np.mean(s1), np.mean(s2)
    # 2. Calculate sample standard deviations
    sd1, sd2 = np.std(s1, ddof=1), np.std(s2, ddof=1)
    # 3. Calculate mean difference
    mean_diff = mu1 - mu2
    # 4. Divide by the average standard deviation
    sd_avg = (sd1 + sd2) / 2
    d_av = mean_diff / sd_avg
    return d_av
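
# Worked example (hypothetical numbers): with baseline scores [3, 4, 5] and
# new scores [2, 3, 4], both sample standard deviations are 1 and the mean
# difference is 1, so d_av = 1 / ((1 + 1) / 2) = 1.0.
#
#   >>> calculate_cohens_d_av([3, 4, 5], [2, 3, 4])
#   1.0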
def calculate_relative_percentage_improvement(baseline_scores, new_scores):
    """
    Calculate the relative percentage improvement from baseline to new scores.

    Parameters
    ----------
    baseline_scores : list or np.ndarray
        A list or array of baseline scores.
    new_scores : list or np.ndarray
        A list or array of new scores.

    Returns
    -------
    float
        Relative percentage improvement. Positive values indicate the new
        scores have a lower mean than the baseline scores.
    """
    baseline_mean = np.mean(baseline_scores)
    new_mean = np.mean(new_scores)
    improvement = baseline_mean - new_mean
    relative_improvement = (improvement / baseline_mean) * 100
    return relative_improvement
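
# Continuing the same hypothetical numbers: the baseline mean is 4 and the
# new mean is 3, so the relative improvement is (4 - 3) / 4 * 100 = 25%.
#
#   >>> calculate_relative_percentage_improvement([3, 4, 5], [2, 3, 4])
#   25.0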
def calculate_percent_folds_improved(baseline_scores, new_scores):
    """
    Calculate the percentage of folds that showed improvement from baseline
    to new scores.

    Parameters
    ----------
    baseline_scores : list or np.ndarray
        A list or array of baseline scores.
    new_scores : list or np.ndarray
        A list or array of new scores.

    Returns
    -------
    float
        Percentage of folds improved (i.e. where the new score is lower
        than the baseline score).
    """
    baseline_scores = np.array(baseline_scores)
    new_scores = np.array(new_scores)
    # A fold counts as improved when its new score (error) is strictly lower
    improved_folds = np.sum(new_scores < baseline_scores)
    total_folds = len(baseline_scores)
    percent_improved = (improved_folds / total_folds) * 100
    return percent_improved
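
# And for the fold-wise comparison: every new score is strictly below its
# baseline counterpart, so all folds count as improved.
#
#   >>> calculate_percent_folds_improved([3, 4, 5], [2, 3, 4])
#   100.0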
def load_cv_results(
    models_dir: str,
    model_type: str,
    target_name: str,
    feat_set_type: str,
    n_folds: int,
    collect_sizes: bool = False,
):
    """
    Load cross-validation results and aggregate test MAE errors.

    Parameters
    ----------
    models_dir : str
        Base directory containing model results.
    model_type : str
        Model name/prefix (e.g., 'rf', 'modnet').
    target_name : str
        Target property name.
    feat_set_type : str
        Subfolder suffix (e.g., 'matminer', 'matminer_lob').
    n_folds : int
        Number of CV folds.
    collect_sizes : bool, optional
        If True, also return train/test set sizes per fold.

    Returns
    -------
    mean_test_errors : list of float
        Mean test error for each fold.
    fold_test_errors : list of np.ndarray
        Raw test errors for each fold.
    n_train_list : list of int
        Number of training samples per fold (only if collect_sizes is True).
    n_test_list : list of int
        Number of test samples per fold (only if collect_sizes is True).
    """
    mean_test_errors = []
    fold_test_errors = []
    n_train_list = []
    n_test_list = []
    base_path = os.path.join(models_dir, f"{model_type}_{target_name}_{feat_set_type}")
    for i in range(1, n_folds + 1):
        results_path = os.path.join(base_path, f"{i}_results.pkl")
        with open(results_path, "rb") as f:
            res = pickle.load(f)
        test_errors = np.asarray(res["test_errors"])
        mean_test_errors.append(np.mean(test_errors))
        fold_test_errors.append(test_errors)
        if collect_sizes:
            n_train_list.append(np.asarray(res["train_errors"]).size)
            n_test_list.append(test_errors.size)
    if collect_sizes:
        return mean_test_errors, fold_test_errors, n_train_list, n_test_list
    return mean_test_errors, fold_test_errors
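
# A sketch of the directory layout this loader expects, assuming a
# hypothetical 'band_gap' target trained with an 'rf' model on the
# 'matminer' feature set:
#
#   <models_dir>/rf_band_gap_matminer/1_results.pkl ... <n_folds>_results.pkl
#
# where each pickle holds a dict with at least a "test_errors" array (and a
# "train_errors" array when collect_sizes=True is used):
#
#   >>> means, folds = load_cv_results(
#   ...     "models", "rf", "band_gap", "matminer", n_folds=10
#   ... )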
def get_ttest_summary_df(
    target_name,
    models_dir,
    num_folds: int = 10,
    model_type: str = "rf",
    feature_set_types: list[str] = ["matminer", "matminer_lob"],
    alternative: str = "two-sided",
) -> pd.DataFrame:
    """
    Get a t-test model summary dataframe including effect size and relative
    improvement.

    Parameters
    ----------
    target_name : str
        Target property name.
    models_dir : str
        Base directory containing model results.
    num_folds : int, optional
        Number of CV folds. Default is 10.
    model_type : str, optional
        Model name/prefix (e.g., 'rf', 'modnet'). Default is 'rf'.
    feature_set_types : list of str, optional
        Pair of feature set variants to compare; the first entry is treated
        as the baseline. Default is ['matminer', 'matminer_lob'].
    alternative : str, optional
        Alternative hypothesis passed to corrected_resampled_ttest.
        Default is 'two-sided'.

    Returns
    -------
    summary_df : pd.DataFrame
        Summary dataframe with t-test results, effect size, and relative
        improvement, indexed by target_name.
    """
    (
        baseline_mean_test_errors,
        _baseline_fold_test_errors,
        baseline_n_train_list,
        baseline_n_test_list,
    ) = load_cv_results(
        models_dir,
        model_type,
        target_name,
        feature_set_types[0],
        num_folds,
        collect_sizes=True,
    )
    (
        new_mean_test_errors,
        _new_fold_test_errors,
        _new_n_train_list,
        _new_n_test_list,
    ) = load_cv_results(
        models_dir,
        model_type,
        target_name,
        feature_set_types[1],
        num_folds,
        collect_sizes=True,
    )
    t_test_results = corrected_resampled_ttest(
        baseline_mean_test_errors,
        new_mean_test_errors,
        baseline_n_train_list,
        baseline_n_test_list,
        alpha=0.05,
        alternative=alternative,
    )
    d_av = calculate_cohens_d_av(baseline_mean_test_errors, new_mean_test_errors)
    rel_improvement = calculate_relative_percentage_improvement(
        baseline_mean_test_errors, new_mean_test_errors
    )
    percent_folds_improved = calculate_percent_folds_improved(
        baseline_mean_test_errors, new_mean_test_errors
    )
    summary_dict = {
        "t_stat": t_test_results["t_stat"],
        "df": t_test_results["df"],
        "critical_value": t_test_results["critical_value"],
        "p_value": t_test_results["p_value"],
        "significance_stars": significance_stars(t_test_results["p_value"]),
        "r_bar": t_test_results["r_bar"],
        "d_av": d_av,
        "rel_improvement": rel_improvement,
        "percent_folds_improved": percent_folds_improved,
    }
    return pd.DataFrame(summary_dict, index=[target_name])
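
# A hedged usage sketch (the 'band_gap' target and 'models' directory are
# placeholders, and per-fold result pickles must already exist on disk):
# compare the default 'matminer' baseline against 'matminer_lob' over 10
# folds and inspect the headline statistics.
#
#   >>> df = get_ttest_summary_df("band_gap", "models")
#   >>> df[["p_value", "significance_stars", "d_av", "rel_improvement"]]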
def significance_stars(p: float) -> str:
    """
    Return significance stars based on p-value.

    Parameters
    ----------
    p : float
        P-value from a statistical test.

    Returns
    -------
    str
        Significance stars: '***' for p < 0.001, '**' for p < 0.01,
        '*' for p < 0.05, '' otherwise.
    """
    if p < 0.001:
        return "***"
    elif p < 0.01:
        return "**"
    elif p < 0.05:
        return "*"
    else:
        return ""
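
# For example:
#
#   >>> significance_stars(0.004)
#   '**'
#   >>> significance_stars(0.2)
#   ''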