Source code for mlproject.training.fold_trainer

"""
Training/evaluation functions for RF/MODNet/GA-SISSO models.
"""

import os
import pickle
import json
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sissopp import Inputs
from sissopp import SISSORegressor
from sissopp.py_interface import get_fs_solver
from sissopp.postprocess.load_models import load_model
from modnet.hyper_opt.fit_genetic import FitGenetic
from modnet.preprocessing import MODData
from modnet.models import EnsembleMODNetModel
from mlproject.training.feature_selection import GAFeatureSelector
from mlproject.postprocess.utils import mean_absolute_percentage_error

warnings.filterwarnings("ignore")


def train_modnet(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    target_name: str,
    n_jobs: int,
    save_model: bool,
    fold_ind: int,
    **kwargs,
) -> tuple[np.ndarray, np.ndarray, EnsembleMODNetModel, np.ndarray, np.ndarray]:
    """
    Training/evaluation for the MODNet model.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    X_test : pd.DataFrame
        Testing features.
    y_train : pd.DataFrame
        Training target.
    y_test : pd.DataFrame
        Testing target.
    target_name : str
        Name of the target variable.
    n_jobs : int
        Number of parallel jobs.
    save_model : bool
        Whether to save the trained model.
    fold_ind : int
        Fold index for cross-validation.
    **kwargs
        Additional keyword arguments for MODNet's genetic-algorithm
        hyperparameter optimization.

    Returns
    -------
    tuple
        y_hat_train, y_hat_test, model, y_train, y_test
    """
    # Limit BLAS/TensorFlow threading and force CPU execution so parallel folds
    # do not oversubscribe resources.
    os.environ["OPENBLAS_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["TF_NUM_INTRAOP_THREADS"] = "1"
    os.environ["TF_NUM_INTEROP_THREADS"] = "1"
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

    y_train = y_train.values.flatten()
    y_test = y_test.values.flatten()

    moddata_train = MODData(
        df_featurized=X_train,
        targets=y_train.reshape(-1, 1),
        target_names=[target_name],
        structure_ids=list(X_train.index),
    )
    moddata_test = MODData(
        df_featurized=X_test,
        targets=y_test.reshape(-1, 1),
        target_names=[target_name],
        structure_ids=list(X_test.index),
    )
    moddata_train.feature_selection(n=-1, n_jobs=n_jobs, random_state=42)

    # Defaults for MODNet's genetic hyperparameter search; **kwargs overrides them.
    ga_settings = {
        "size_pop": 50,
        "num_generations": 20,
        "early_stopping": 4,
        "refit": False,
        "nested": True,
        **kwargs,
    }
    ga = FitGenetic(moddata_train, targets=[[[target_name]]])
    model = ga.run(n_jobs=n_jobs, **ga_settings)

    if save_model:
        model.save(f"model_{fold_ind+1}")

    y_hat_train = model.predict(moddata_train).values.flatten()
    y_hat_test = model.predict(moddata_test).values.flatten()

    return y_hat_train, y_hat_test, model, y_train, y_test


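# Usage sketch (illustration only, not called by this module): how a single fold
# could be passed to ``train_modnet``. The helper name ``_example_train_modnet``,
# the file "featurized_data.csv", and the "band_gap" target column are hypothetical
# stand-ins for an already-featurized dataset indexed by structure id.
def _example_train_modnet():
    """Illustrative call of train_modnet on a pre-featurized dataset (sketch)."""
    from sklearn.model_selection import train_test_split

    df = pd.read_csv("featurized_data.csv", index_col=0)  # hypothetical input file
    y = df.pop("band_gap").to_frame()                      # hypothetical target column
    X_tr, X_te, y_tr, y_te = train_test_split(df, y, test_size=0.2, random_state=42)
    return train_modnet(
        X_tr, X_te, y_tr, y_te,
        target_name="band_gap", n_jobs=4, save_model=True, fold_ind=0,
        size_pop=20, num_generations=5,  # smaller GA budget than the defaults above
    )

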
def train_sisso_ga(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    n_jobs: int,
    save_model: bool,
    fold_ind: int,
    **kwargs,
) -> tuple[np.ndarray, np.ndarray, SISSORegressor, np.ndarray, np.ndarray]:
    """
    Training/evaluation for the SISSO model with GA-based feature selection.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    X_test : pd.DataFrame
        Testing features.
    y_train : pd.DataFrame
        Training target.
    y_test : pd.DataFrame
        Testing target.
    n_jobs : int
        Number of parallel jobs.
    save_model : bool
        Whether to save the trained model.
    fold_ind : int
        Fold index for cross-validation.
    **kwargs
        Additional keyword arguments for GA feature selection and SISSO inputs.

    Returns
    -------
    tuple
        y_hat_train, y_hat_test, model, y_train, y_test
    """
    # Defaults for the GA feature selector; any key passed via **kwargs overrides them.
    default_ga_kwargs = {
        "num_features": 15,
        "generations": 100,
        "population_size": 50,
        "early_stop_patience": 5,
        "cv": 5,
        "mpi_tasks": kwargs.get("mpi_tasks", 1),
        "sissopp_binary_path": kwargs.get("sissopp_binary_path"),
        "sissopp_inputs": kwargs.get("sissopp_inputs"),
        **kwargs,
    }

    # Run the whole fold inside its own working directory.
    current_work_dir = os.getcwd()
    os.makedirs(f"fold_{fold_ind+1}", exist_ok=True)
    os.chdir(f"fold_{fold_ind+1}")

    selector = GAFeatureSelector(
        X=X_train,
        y=y_train,
        model=None,
        scoring="neg_mean_absolute_error",
        X_test=None,
        y_test=None,
        n_jobs=n_jobs,
        return_train_score=False,
        **default_ga_kwargs,
    )
    selected_features = selector.run(strategy="de")
    with open("feature_usage_counts.json", "w") as f:
        json.dump(selector.feature_usage_counts, f)

    # Restrict both splits to the GA-selected features and dump the training data
    # to the CSV file referenced by the SISSO++ inputs.
    X_train_fil = X_train.loc[:, selected_features]
    X_test_fil = X_test.loc[:, selected_features]
    pd.concat([X_train_fil, y_train], axis=1).to_csv("data.csv")

    # Write the SISSO++ input file and fit the symbolic-regression model.
    with open("sisso.json", "w") as f:
        json.dump(kwargs.get("sissopp_inputs"), f, indent=4)
    inputs = Inputs("sisso.json")
    _, model = get_fs_solver(inputs, allow_overwrite=False)
    model.fit()

    model_file_name = Path("models") / f"train_dim_{model.n_dim}_model_0.dat"
    trained_model = load_model(model_file_name.as_posix())
    y_hat_train = trained_model.eval_many(X_train_fil)
    y_hat_test = trained_model.eval_many(X_test_fil)

    os.chdir(current_work_dir)
    return y_hat_train, y_hat_test, model, y_train, y_test


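# Usage sketch (illustration only): one way to call ``train_sisso_ga`` for a single
# fold. ``_example_train_sisso_ga``, the files "features.csv" and
# "sisso_template.json", the "formation_energy" target, and the binary path are all
# hypothetical; the template is assumed to hold a valid SISSO++ input dictionary.
def _example_train_sisso_ga():
    """Illustrative call of train_sisso_ga (sketch, not run on import)."""
    X = pd.read_csv("features.csv", index_col=0)   # hypothetical featurized data
    y = X.pop("formation_energy").to_frame()        # hypothetical target column
    X_tr, X_te, y_tr, y_te = X.iloc[:80], X.iloc[80:], y.iloc[:80], y.iloc[80:]

    with open("sisso_template.json") as f:           # hypothetical SISSO++ input template
        sisso_inputs = json.load(f)

    return train_sisso_ga(
        X_tr, X_te, y_tr, y_te,
        n_jobs=4, save_model=True, fold_ind=0,
        num_features=10, generations=50, population_size=30,  # GA selection budget
        sissopp_inputs=sisso_inputs,
        sissopp_binary_path="/path/to/sisso++",      # hypothetical binary location
    )

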
def train_rf(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    n_jobs: int,
    save_model: bool,
    fold_ind: int,
    **kwargs,
) -> tuple[np.ndarray, np.ndarray, RandomForestRegressor, np.ndarray, np.ndarray]:
    """
    Training/evaluation for the RandomForest model.

    Parameters
    ----------
    X_train : pd.DataFrame
        Training features.
    X_test : pd.DataFrame
        Testing features.
    y_train : pd.DataFrame
        Training target.
    y_test : pd.DataFrame
        Testing target.
    n_jobs : int
        Number of parallel jobs.
    save_model : bool
        Whether to save the trained model.
    fold_ind : int
        Fold index for cross-validation.
    **kwargs
        Additional keyword arguments for RandomForestRegressor.

    Returns
    -------
    tuple
        y_hat_train, y_hat_test, model, y_train, y_test
    """
    y_train = y_train.values.flatten()
    y_test = y_test.values.flatten()

    estimator_kwargs = {
        "n_jobs": n_jobs,
        "n_estimators": 500,
        **kwargs,
    }
    model = RandomForestRegressor(**estimator_kwargs)
    model.fit(X_train, y_train)

    if save_model:
        with open(f"model_{fold_ind+1}.pkl", "wb") as f:
            pickle.dump(model, f)

    y_hat_train = model.predict(X_train)
    y_hat_test = model.predict(X_test)

    return y_hat_train, y_hat_test, model, y_train, y_test


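# Usage sketch (illustration only): a minimal, self-contained call of ``train_rf``
# on synthetic data. The helper name ``_example_train_rf`` and the random toy
# problem are assumptions for illustration, not part of the pipeline.
def _example_train_rf():
    """Run train_rf on a tiny synthetic regression problem (sketch)."""
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(100, 5)), columns=[f"f{i}" for i in range(5)])
    y = pd.DataFrame({"target": X["f0"] * 2.0 + rng.normal(scale=0.1, size=100)})
    X_tr, X_te, y_tr, y_te = X.iloc[:80], X.iloc[80:], y.iloc[:80], y.iloc[80:]
    return train_rf(
        X_tr, X_te, y_tr, y_te,
        n_jobs=1, save_model=False, fold_ind=0,
        n_estimators=100, max_depth=8,  # forwarded to RandomForestRegressor
    )

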
def train_eval_fold(
    model_type: str,
    fold_ind: int,
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    target_name: str = None,
    n_jobs: int = 1,
    save_model: bool = True,
    **kwargs,
):
    """
    Training/evaluation for MODNet, GA-SISSO, and RandomForest.

    Parameters
    ----------
    model_type : str
        Type of model to train ("modnet", "ga_sisso", "rf").
    fold_ind : int
        Fold index for cross-validation.
    X_train : pd.DataFrame
        Training features.
    X_test : pd.DataFrame
        Testing features.
    y_train : pd.DataFrame
        Training target.
    y_test : pd.DataFrame
        Testing target.
    target_name : str, optional
        Name of the target variable (required for MODNet).
    n_jobs : int, optional
        Number of parallel jobs (default is 1).
    save_model : bool, optional
        Whether to save the trained model (default is True).
    **kwargs
        Additional keyword arguments for the specific model training function.

    Returns
    -------
    dict
        Dictionary containing training and testing regression metrics.
    """
    if model_type.lower() == "modnet":
        y_hat_train, y_hat_test, _, y_train_true, y_test_true = train_modnet(
            X_train,
            X_test,
            y_train,
            y_test,
            target_name,
            n_jobs,
            save_model,
            fold_ind,
            **kwargs,
        )
    elif model_type.lower() == "ga_sisso":
        y_hat_train, y_hat_test, _, y_train_true, y_test_true = train_sisso_ga(
            X_train, X_test, y_train, y_test, n_jobs, save_model, fold_ind, **kwargs
        )
    elif model_type.lower() == "rf":
        y_hat_train, y_hat_test, _, y_train_true, y_test_true = train_rf(
            X_train, X_test, y_train, y_test, n_jobs, save_model, fold_ind, **kwargs
        )
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # --- Evaluation ---
    train_rmse = root_mean_squared_error(y_train_true, y_hat_train)
    test_rmse = root_mean_squared_error(y_test_true, y_hat_test)
    train_r2 = r2_score(y_train_true, y_hat_train)
    test_r2 = r2_score(y_test_true, y_hat_test)

    # Absolute per-sample errors; convert to plain arrays when the targets are
    # still DataFrames.
    train_abs_err = abs(y_train_true - y_hat_train)
    test_abs_err = abs(y_test_true - y_hat_test)
    train_errors = (
        train_abs_err.values if isinstance(train_abs_err, pd.DataFrame) else train_abs_err
    )
    test_errors = (
        test_abs_err.values if isinstance(test_abs_err, pd.DataFrame) else test_abs_err
    )

    train_mape = mean_absolute_percentage_error(y_true=y_train_true, y_pred=y_hat_train)
    test_mape = mean_absolute_percentage_error(y_true=y_test_true, y_pred=y_hat_test)

    fold_metrics = {
        "train_rmse": train_rmse,
        "test_rmse": test_rmse,
        "train_errors": train_errors,
        "test_errors": test_errors,
        "train_r2": train_r2,
        "test_r2": test_r2,
        "train_mape": train_mape,
        "test_mape": test_mape,
    }
    return fold_metrics


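# Usage sketch (illustration only): driving a whole fold through ``train_eval_fold``
# with the RandomForest backend. ``_example_train_eval_fold`` and the synthetic
# data below are hypothetical.
def _example_train_eval_fold():
    """Illustrative end-to-end fold evaluation (sketch)."""
    rng = np.random.default_rng(1)
    X = pd.DataFrame(rng.normal(size=(120, 4)), columns=[f"f{i}" for i in range(4)])
    y = pd.DataFrame({"target": X["f1"] - 0.5 * X["f2"] + rng.normal(scale=0.05, size=120)})
    X_tr, X_te, y_tr, y_te = X.iloc[:100], X.iloc[100:], y.iloc[:100], y.iloc[100:]
    metrics = train_eval_fold(
        model_type="rf",
        fold_ind=0,
        X_train=X_tr, X_test=X_te, y_train=y_tr, y_test=y_te,
        n_jobs=1, save_model=False,
        n_estimators=200,  # forwarded to RandomForestRegressor
    )
    print(f"fold RMSE (test): {metrics['test_rmse']:.3f}, R2: {metrics['test_r2']:.3f}")
    return metrics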