# Train RF
import os
import psutil
import numpy as np
import warnings
import pickle
from mlproject.data.preprocessing import get_dataset
from mlproject.training.fold_trainer import train_eval_fold
from mlproject.training.feature_selection import get_relevant_features
from sklearn.model_selection import KFold
import mlflow
# Silence library warnings (sklearn/pandas emit many during repeated CV fits).
warnings.filterwarnings("ignore")
# Model identifier used in experiment names and output directory names below.
model_type = "rf"
# Remember the launch directory: each run chdirs into a per-run subdirectory
# to write fold artifacts, then chdirs back here.
parent_dir = os.getcwd()
# Hide GPUs and reduce TensorFlow log verbosity; this workflow runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Regression targets: six individual properties, MSD statistics at two
# temperatures, and thermodynamic properties (Cv, H, S, U) at three
# temperatures. Order matches the original hand-written list.
target_names = (
    ["last_phdos_peak", "max_pfc",
     "log_g_vrh", "log_k_vrh",
     "log_klat_300", "log_kp_300"]
    + [f"log_msd_{stat}_{temp}"
       for stat in ("all", "max", "mean")
       for temp in (300, 600)]
    + [f"{prop}_{temp}"
       for prop in ("Cv", "H", "S", "U")
       for temp in (25, 305, 705)]
)
# Provide the absolute path to the `data` directory of
# https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors
# (after cloning the repository locally) in the `data_parent_dir` variable below.
# Placeholder — must be edited to a real absolute path before running.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"
# Provide a valid absolute path for logging model metrics with MLflow in the
# `mlflow.set_tracking_uri` call below.
# File-based MLflow tracking store; placeholder path must be edited before running.
mlflow.set_tracking_uri('file:///some/valid/path/on/your/system')
# Degree of parallelism passed to feature selection and model fitting below;
# counts only physical cores (logical=False).
num_jobs = psutil.cpu_count(logical=False) # This will use all physical cores on the system. Please reduce it as per needs
# 📝 Note
# The snippet below is for a 5-fold cross-validation (CV) run.
# Change the number of splits in the `cv_outer` variable from 5 to 10 to
# obtain results for 10-fold CV; 10-fold CV results are used in the
# corrected resampling t-tests.
# Don't forget to change the working directory to avoid overwriting your
# existing 5-fold CV run results.
# Train and evaluate an RF model for every target / feature-set combination,
# logging per-fold metrics and their cross-fold aggregates to MLflow.
# Fold artifacts (fitted selection pipeline and raw results) are pickled into
# a per-run subdirectory of `parent_dir`.
for target_name in target_names:
    for feat_type in ["matminer", "matminer_lob"]:
        target, feat = get_dataset(feat_type=feat_type, target_name=target_name,
                                   data_parent_dir=data_parent_dir)
        # Drop any feature column containing missing values.
        feat.dropna(axis=1, inplace=True)
        # Fixed seed keeps fold membership reproducible across runs.
        cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)
        experiment_name = f"{model_type}_experiment_{target_name}"
        mlflow.set_experiment(experiment_name)
        # Use an absolute run directory so repeated iterations can never nest
        # directories, and restore the working directory in a `finally` so an
        # exception inside a fold does not strand the process in the run dir.
        run_dir = os.path.join(parent_dir,
                               f"{model_type}_{target_name}_{feat_type}")
        os.makedirs(run_dir, exist_ok=True)
        os.chdir(run_dir)
        try:
            with mlflow.start_run(
                    run_name=f"{target_name}_{feat_type}",
                    experiment_id=mlflow.get_experiment_by_name(
                        experiment_name).experiment_id):
                all_results = {
                    "train_mae": [], "test_mae": [],
                    "train_rmse": [], "test_rmse": [],
                    "train_r2": [], "test_r2": [],
                    "train_mape": [], "test_mape": [],
                }
                for fold_ind, (train_ix, test_ix) in enumerate(
                        cv_outer.split(feat)):
                    X_train, X_test = feat.iloc[train_ix], feat.iloc[test_ix]
                    y_train = target.iloc[train_ix, 0]
                    y_test = target.iloc[test_ix, 0]
                    # Feature selection is fit on the training split only,
                    # avoiding leakage from the held-out fold.
                    pipe, X_train_fil = get_relevant_features(
                        X_train=X_train,
                        y_train=y_train.values.flatten(),
                        grootcv_n_iter=50,
                        **{"all_rel_feats__n_jobs": num_jobs})
                    with open(f"{fold_ind + 1}_pipeline.pkl", "wb") as f:
                        pickle.dump(pipe, f)
                    # Restrict the test split to the selected feature columns.
                    X_test_fil = X_test.loc[:, X_train_fil.columns]
                    result = train_eval_fold(
                        fold_ind=fold_ind,
                        X_train=X_train_fil,
                        y_train=y_train,
                        X_test=X_test_fil,
                        y_test=y_test,
                        model_type="rf",
                        **{"n_jobs": num_jobs})
                    for metric, value in result.items():
                        if isinstance(value, float):
                            all_results[metric].append(value)
                        elif "train" in metric:
                            # Non-float entries are per-sample error arrays;
                            # record their mean as the fold MAE.
                            all_results["train_mae"].append(value.mean())
                        else:
                            all_results["test_mae"].append(value.mean())
                    with open(f"{fold_ind + 1}_results.pkl", "wb") as f:
                        pickle.dump(result, f)
                # Log aggregate statistics of every metric across folds.
                for metric, values in all_results.items():
                    arr = np.asarray(values)
                    mlflow.log_metric(f"{metric}_mean", arr.mean())
                    mlflow.log_metric(f"{metric}_min", arr.min())
                    mlflow.log_metric(f"{metric}_max", arr.max())
                    mlflow.log_metric(f"{metric}_std", arr.std())
        finally:
            os.chdir(parent_dir)