Get metrics of LightGBM model using selected features from ARFS GrootCV

Get metrics of LightGBM model using selected features from ARFS GrootCV

📝 Note

The snippet is just for a sanity check

This does not need to be executed and is completely optional

import os
import psutil
import pandas as pd
import numpy as np
import warnings
import pickle
import lightgbm as lgb
from lightgbm import early_stopping
from monty.serialization import dumpfn, loadfn
from mlproject.data.preprocessing import get_dataset
from mlproject.training.fold_trainer import train_eval_fold
from mlproject.training.feature_selection import get_relevant_features
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.metrics import root_mean_squared_error, r2_score
from arfs.utils import create_dtype_dict
from arfs.feature_selection.allrelevant import _set_lgb_parameters, _train_lgb_model, _split_data
from mlproject.postprocess.utils import mean_absolute_percentage_error
# Suppress all warnings (sklearn/LightGBM emit many) to keep notebook output readable
warnings.filterwarnings("ignore")
# Absolute path to the repository's data directory — edit for your checkout
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"
# Parent directory of the saved pipelines; fold pickles are read from
# "{model_dir}/modnet_{target}_{feat_type}/{fold}_pipeline.pkl" below
model_dir = "/path/to/parent/dir/with/saved/models/" # top directory with all models saved
num_jobs = psutil.cpu_count(logical=False) # This will use all physical cores on the system. Please reduce it as per needs
# Target column names; one entry of results_dict is created per target below.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    "log_g_vrh",
    "log_k_vrh",
    "log_klat_300",
    "log_kp_300",
    "log_msd_all_300",
    "log_msd_all_600",
    "log_msd_max_300",
    "log_msd_max_600",
    "log_msd_mean_300",
    "log_msd_mean_600",
    "Cv_25",
    "Cv_305",
    "Cv_705",
    "H_25",
    "H_305",
    "H_705",
    "S_25",
    "S_305",
    "S_705",
    "U_25",
    "U_305",
    "U_705",
]
# Per-fold metric lists, keyed as results_dict[target][feature_set][metric].
# Key order matches the appends performed in the evaluation loop below.
_METRIC_KEYS = (
    "train_errors", "test_errors",
    "train_rmse", "test_rmse",
    "train_r2", "test_r2",
    "train_mape", "test_mape",
)

results_dict = {}
for target in target_names:
    results_dict[target] = {
        feature_set: {key: [] for key in _METRIC_KEYS}
        for feature_set in ("matminer", "matminer_lob")
    }
def get_metrics_table(results_dict, target, feat_type):
    """
    Summarize per-fold CV metrics for one target / feature set.

    Parameters
    ----------
    results_dict : dict
        Nested mapping results_dict[target][feat_type][metric] -> list of
        per-fold scalar values (one entry per CV fold).
    target : str
        Target property name (key into results_dict).
    feat_type : str
        Feature-set key, e.g. "matminer" or "matminer_lob".

    Returns
    -------
    pd.DataFrame
        Rows "mean", "min", "max", "std" (population std, ddof=0) over the
        folds; columns: train/test RMSE, mean absolute error, and R^2.
    """
    # Build the fold-by-metric table straight from the stored lists.
    # (The previous version pre-allocated an index of 1..5 but wrote rows
    # 0..4 with .loc, leaving a spurious all-NaN row; it also hard-coded
    # 5 folds. Constructing from the dict handles any fold count.)
    cv_df = pd.DataFrame(results_dict[target][feat_type])

    overview_df = pd.DataFrame(
        [cv_df.mean(), cv_df.min(), cv_df.max(), cv_df.std(ddof=0)],
        index=["mean", "min", "max", "std"],
    )

    # Report only the headline metrics (MAPE columns are kept in results_dict
    # but omitted from the summary table).
    columns = ['train_rmse', 'test_rmse', 'train_errors', 'test_errors', 'train_r2', 'test_r2']

    return overview_df.loc[:, columns]
def train_lightgbm(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    n_jobs: int,
) -> tuple[np.ndarray, np.ndarray, lgb.basic.Booster, np.ndarray, np.ndarray]:
    """
    Train and evaluate a LightGBM model with an MAE objective.

    The optimal number of boosting rounds is found with LightGBM's internal
    5-fold CV (early stopping with patience 20); a final model is then refit
    on the full training split using that round count.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    X_test : pd.DataFrame
        Testing features.
    y_train : pd.DataFrame
        Training target.
    y_test : pd.DataFrame
        Testing target.
    n_jobs : int
        Number of parallel jobs.

    Returns
    -------
    tuple
        y_hat_train, y_hat_test, model, y_train, y_test
    """
    # NOTE(review): the original version also built an unused validation
    # lgb.Dataset and an unused categorical-column index list (via
    # arfs.create_dtype_dict); neither was ever passed to lgb.cv/lgb.train,
    # so both have been removed. Behavior is unchanged.
    d_train = lgb.Dataset(X_train, label=y_train, weight=None)

    params = _set_lgb_parameters(
        X=X_train,
        y=y_train,
        objective="mae",
        rf=False,
        silent=True,
        n_jobs=n_jobs,
        lgbm_params=None,
    )

    # Get the optimal number of boosting rounds using LightGBM's internal CV;
    # num_boost_round is a generous cap — early stopping ends training first.
    cv_results = lgb.cv(
        params=params,
        train_set=d_train,
        num_boost_round=10000,
        nfold=5,
        shuffle=True,
        stratified=False,  # regression target: stratification is not applicable
        callbacks=[lgb.early_stopping(20)],
        return_cvbooster=True,
    )

    cvbooster = cv_results["cvbooster"]

    # Refit a single model on all training data with the CV-selected round count
    model = lgb.train(
        params,
        train_set=d_train,
        num_boost_round=cvbooster.best_iteration,
    )

    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)

    return y_hat_train, y_hat_test, model, y_train, y_test
%%capture --no-display
def _abs_errors(y_true, y_hat):
    """Absolute per-sample errors, unwrapped to an ndarray when pandas returns a DataFrame."""
    errors = abs(y_true - y_hat)
    return errors.values if isinstance(errors, pd.DataFrame) else errors


for target_name in target_names:
    for feat_type in ["matminer", "matminer_lob"]:
        target, feat = get_dataset(feat_type=feat_type, target_name=target_name,
        data_parent_dir=data_parent_dir)

        # Drop feature columns containing NaNs so the model sees a dense matrix
        feat.dropna(axis=1, inplace=True)

        # NOTE(review): assumes this seed matches the one used when the saved
        # pipelines were fitted, so fold-wise feature selections line up with
        # the fold indices here — confirm against the training script.
        cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)

        for fold_ind, (train_ix, test_ix) in enumerate(cv_outer.split(feat)):
            X_train, X_test = feat.iloc[train_ix], feat.iloc[test_ix]
            y_train, y_test = target.iloc[train_ix, 0], target.iloc[test_ix, 0]

            # Only the pickle load needs the file handle — close it promptly
            # instead of holding it open across training/evaluation.
            # NOTE: pickle is only safe on trusted, locally produced files.
            pipeline_path = f"{model_dir}/modnet_{target_name}_{feat_type}/{fold_ind+1}_pipeline.pkl"
            with open(pipeline_path, "rb") as f:
                pipeline = pickle.load(f)

            # Restrict both splits to the features selected for this fold
            selected_feats = pipeline[-1].selected_features_
            X_train_fil = X_train.loc[:, selected_feats]
            X_test_fil = X_test.loc[:, selected_feats]

            y_hat_train, y_hat_test, _, y_train_true, y_test_true = train_lightgbm(
                X_train_fil, X_test_fil, y_train, y_test, num_jobs
            )

            # --- Evaluation ---
            train_rmse = root_mean_squared_error(y_train_true, y_hat_train)
            test_rmse = root_mean_squared_error(y_test_true, y_hat_test)
            train_r2 = r2_score(y_train_true, y_hat_train)
            test_r2 = r2_score(y_test_true, y_hat_test)
            # Compute each absolute-error array once (the original recomputed
            # abs(y - y_hat) for the isinstance check and again for the value)
            train_errors = _abs_errors(y_train_true, y_hat_train)
            test_errors = _abs_errors(y_test_true, y_hat_test)
            train_mape = mean_absolute_percentage_error(y_true=y_train_true, y_pred=y_hat_train)
            test_mape = mean_absolute_percentage_error(y_true=y_test_true, y_pred=y_hat_test)

            fold_metrics = results_dict[target_name][feat_type]
            fold_metrics["train_errors"].append(np.mean(train_errors))
            fold_metrics["test_errors"].append(np.mean(test_errors))
            fold_metrics["train_mape"].append(train_mape)
            fold_metrics["test_mape"].append(test_mape)
            fold_metrics["train_r2"].append(train_r2)
            fold_metrics["test_r2"].append(test_r2)
            fold_metrics["train_rmse"].append(train_rmse)
            fold_metrics["test_rmse"].append(test_rmse)
# Optional persistence of the run's metrics (monty handles numpy scalars):
# dumpfn(results_dict, "lightgbm_metrics.json") # dump results from run as json (optional)
# results_dict = loadfn("lightgbm_metrics.json") # load saved results (optional)
# Summary tables (mean/min/max/std over the CV folds) for one example target,
# comparing matminer-only features against matminer + LOBSTER features.
df_metric_mm = get_metrics_table(results_dict, "last_phdos_peak", "matminer")
df_metric_mm_l = get_metrics_table(results_dict, "last_phdos_peak", "matminer_lob")
# Bare expressions: rendered as cell output when run in a notebook
df_metric_mm
df_metric_mm_l