"""
Misc postprocessing utility functions
"""
import os
import pickle
import numpy as np
import pandas as pd
from mlproject.postprocess.t_test import corrected_resampled_ttest
def mean_absolute_percentage_error(y_true, y_pred, threshold=1e-5) -> float:
"""
Compute mean absolute percentage error, masked
Masking is for when y_true is zero (causing a
divide by zero error) or when y_true is very small
(causing a massive skewing in the absolute percentage
error).
**Note: THIS WILL IGNORE ALL ENTRIES WHERE y_true's
MAGNITUDE IS less than the threshold, hence the
MAPE score is not representative of all
entries if the truth array contains entries with
magnitude very close to 0.**
Parameters
----------
y_true : np.ndarray
A 1-D array of true values
y_pred : np.ndarray
A 1-D array of predicted values
threshold : float
Entries with magnitude below this
value will be ignored in the output.
Returns
-------
    float
        Mean absolute percentage error over the unmasked entries, returned
        as a fraction (not multiplied by 100).
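
    Examples
    --------
    A small example with made-up values; the last entry is ignored because
    its true value (0.0) is below the threshold:

    >>> float(mean_absolute_percentage_error([2.0, 4.0, 0.0], [1.0, 5.0, 3.0]))
    0.375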
"""
y_true = np.asarray(y_true)
mask = np.abs(y_true) > threshold
y_pred = np.asarray(y_pred)
y_true = y_true[mask]
y_pred = y_pred[mask]
return np.mean(np.fabs((y_true - y_pred) / y_true))
def calculate_cohens_d_av(baseline_scores, new_scores):
"""
    Calculate Cohen's d_av between two sets of scores, i.e. the difference
    in means divided by the average of the two sample standard deviations.
Parameters
----------
baseline_scores : list or np.ndarray
A list or array of baseline scores.
new_scores : list or np.ndarray
A list or array of new scores.
Returns
-------
d_av : float
Cohen's d_av value.
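
    Examples
    --------
    A toy example with made-up fold scores; both sets have a sample
    standard deviation of 1, so d_av is simply the difference of the means:

    >>> float(calculate_cohens_d_av([2.0, 3.0, 4.0], [1.0, 2.0, 3.0]))
    1.0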
"""
# Convert to numpy arrays if not already
s1 = np.array(baseline_scores)
s2 = np.array(new_scores)
# 1. Calculate Means
mu1, mu2 = np.mean(s1), np.mean(s2)
# 2. Calculate Standard Deviations
sd1, sd2 = np.std(s1, ddof=1), np.std(s2, ddof=1)
# 3. Calculate Mean Difference
mean_diff = mu1 - mu2
# 4. Calculate Average Standard Deviation
sd_avg = (sd1 + sd2) / 2
d_av = mean_diff / sd_avg
return d_av
def calculate_relative_percentage_improvement(baseline_scores, new_scores):
"""
    Calculate the relative percentage improvement from baseline to new scores.

    A positive value means the mean of new_scores is lower than the mean of
    baseline_scores (an improvement when the scores are errors such as MAE).
Parameters
----------
baseline_scores : list or np.ndarray
A list or array of baseline scores.
new_scores : list or np.ndarray
A list or array of new scores.
Returns
-------
float
Relative percentage improvement.
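
    Examples
    --------
    A made-up example: the baseline mean is 5.0 and the new mean is 4.0,
    giving a 20% relative improvement:

    >>> float(calculate_relative_percentage_improvement([4.0, 6.0], [3.0, 5.0]))
    20.0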
"""
baseline_mean = np.mean(baseline_scores)
new_mean = np.mean(new_scores)
improvement = baseline_mean - new_mean
relative_improvement = (improvement / baseline_mean) * 100
return relative_improvement
def caclulate_percent_folds_improved(baseline_scores, new_scores):
"""
    Calculate the percentage of folds that improved from baseline to new scores.

    A fold counts as improved when its entry in new_scores is strictly lower
    than the corresponding entry in baseline_scores (lower is better).
Parameters
----------
baseline_scores : list or np.ndarray
A list or array of baseline scores.
new_scores : list or np.ndarray
A list or array of new scores.
Returns
-------
float
Percentage of folds improved.
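
    Examples
    --------
    A made-up example where 2 of the 4 folds have a lower (better) new score:

    >>> baseline = [2.0, 3.0, 4.0, 5.0]
    >>> new = [1.0, 4.0, 3.0, 5.0]
    >>> float(caclulate_percent_folds_improved(baseline, new))
    50.0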
"""
baseline_scores = np.array(baseline_scores)
new_scores = np.array(new_scores)
improved_folds = np.sum(new_scores < baseline_scores)
total_folds = len(baseline_scores)
percent_improved = (improved_folds / total_folds) * 100
return percent_improved
def load_cv_results(
models_dir: str,
model_type: str,
target_name: str,
feat_set_type: str,
n_folds: int,
collect_sizes: bool = False,
):
"""
Load cross-validation results and aggregate test MAE errors.
Parameters
----------
models_dir : str
Base directory containing model results.
model_type : str
Model name/prefix (e.g., 'rf', 'modnet').
target_name : str
Target property name.
feat_set_type : str
Subfolder suffix (e.g., 'matminer', 'matminer_lob').
n_folds : int
Number of CV folds.
collect_sizes : bool, optional
If True, also return train/test set sizes per fold.
Returns
-------
mean_test_errors : list of float
Mean test error for each fold.
fold_test_errors : list of np.ndarray
Raw test errors for each fold.
n_train_list : list of int (optional)
Number of training samples per fold.
n_test_list : list of int (optional)
Number of test samples per fold.
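
    Examples
    --------
    Results are expected at
    {models_dir}/{model_type}_{target_name}_{feat_set_type}/{i}_results.pkl
    for each fold i. A hypothetical call (the directory and target names
    below are placeholders, so the snippet is not executed as a doctest):

    >>> means, fold_errors = load_cv_results(
    ...     "models", "rf", "band_gap", "matminer", n_folds=10
    ... )  # doctest: +SKIP
    >>> len(means)  # doctest: +SKIP
    10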
"""
mean_test_errors = []
fold_test_errors = []
n_train_list = []
n_test_list = []
base_path = os.path.join(models_dir, f"{model_type}_{target_name}_{feat_set_type}")
for i in range(1, n_folds + 1):
results_path = os.path.join(base_path, f"{i}_results.pkl")
with open(results_path, "rb") as f:
res = pickle.load(f)
test_errors = np.asarray(res["test_errors"])
mean_test_errors.append(np.mean(test_errors))
fold_test_errors.append(test_errors)
if collect_sizes:
n_train_list.append(np.asarray(res["train_errors"]).size)
n_test_list.append(test_errors.size)
if collect_sizes:
return mean_test_errors, fold_test_errors, n_train_list, n_test_list
return mean_test_errors, fold_test_errors
def get_ttest_summary_df(
target_name,
models_dir,
num_folds: int = 10,
model_type: str = "rf",
feature_set_types: list[str] = ["matminer", "matminer_lob"],
alternative: str = "two-sided",
) -> pd.DataFrame:
"""
Get t-test model summary dataframe including effect size and relative improvement.
Parameters
----------
target_name : str
Target property name.
models_dir : str
Base directory containing model results.
num_folds : int, optional
Number of CV folds. Default is 10.
model_type : str, optional
Model name/prefix (e.g., 'rf', 'modnet'). Default is 'rf'.
    feature_set_types : list of str, optional
        Two feature set variants to compare; the first is treated as the
        baseline and the second as the new model. Default is
        ['matminer', 'matminer_lob'].
    alternative : str, optional
        Alternative hypothesis passed to corrected_resampled_ttest.
        Default is 'two-sided'.
Returns
-------
summary_df : pd.DataFrame
Summary dataframe with t-test results, effect size, and relative improvement.
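
    Examples
    --------
    A hypothetical call (directory and target names are placeholders, so
    the snippet is not executed as a doctest); the result is a one-row
    dataframe indexed by target_name:

    >>> summary = get_ttest_summary_df(
    ...     "band_gap", "models", num_folds=10, model_type="rf"
    ... )  # doctest: +SKIP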
"""
(
matminer_mean_test_errors,
_matminer_fold_test_errors,
matminer_n_train_list,
matminer_n_test_list,
) = load_cv_results(
models_dir,
model_type,
target_name,
feature_set_types[0],
num_folds,
collect_sizes=True,
)
(
matminer_lob_mean_test_errors,
_matminer_lob_fold_test_errors,
matminer_lob_n_train_list,
matminer_lob_n_test_list,
) = load_cv_results(
models_dir,
model_type,
target_name,
feature_set_types[1],
num_folds,
collect_sizes=True,
)
t_test_results = corrected_resampled_ttest(
matminer_mean_test_errors,
matminer_lob_mean_test_errors,
matminer_n_train_list,
matminer_n_test_list,
alpha=0.05,
alternative=alternative,
)
d_av = calculate_cohens_d_av(
matminer_mean_test_errors, matminer_lob_mean_test_errors
)
rel_improvement = calculate_relative_percentage_improvement(
matminer_mean_test_errors, matminer_lob_mean_test_errors
)
percent_folds_improved = caclulate_percent_folds_improved(
matminer_mean_test_errors, matminer_lob_mean_test_errors
)
summary_dict = {
"t_stat": t_test_results["t_stat"],
"df": t_test_results["df"],
"critical_value": t_test_results["critical_value"],
"p_value": t_test_results["p_value"],
"significance_stars": significance_stars(t_test_results["p_value"]),
"r_bar": t_test_results["r_bar"],
"d_av": d_av,
"rel_improvement": rel_improvement,
"percent_folds_improved": percent_folds_improved,
}
return pd.DataFrame(summary_dict, index=[target_name])
def significance_stars(p: float) -> str:
"""
Return significance stars based on p-value
Parameters
----------
p : float
P-value from statistical test.
Returns
-------
str
Significance stars: '***' for p<0.001, '**' for p<0.01, '*' for p<0.05, '' otherwise.
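
    Examples
    --------
    Thresholds follow the convention described above:

    >>> significance_stars(0.0004)
    '***'
    >>> significance_stars(0.03)
    '*'
    >>> significance_stars(0.2)
    ''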
"""
if p < 0.001:
return "***"
elif p < 0.01:
return "**"
elif p < 0.05:
return "*"
else:
return ""