# Train RF
import os
import psutil
import numpy as np
import warnings
import pickle
from mlproject.data.preprocessing import get_dataset
from mlproject.training.fold_trainer import train_eval_fold
from mlproject.training.feature_selection import get_relevant_features
from sklearn.model_selection import KFold
import mlflow
# Silence library warnings (sklearn/pandas emit many during repeated CV fits).
warnings.filterwarnings("ignore")
# Model identifier used in experiment names and output directory names below.
model_type = "rf"
# Remember the launch directory: each run chdirs into a per-run subdirectory
# to write fold artifacts, then chdirs back here.
parent_dir = os.getcwd()
# Hide GPUs and reduce TensorFlow log verbosity; this workflow runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
# Regression targets: six individual properties, MSD statistics at two
# temperatures, and thermodynamic properties (Cv, H, S, U) at three
# temperatures. Order matches the original hand-written list.
target_names = (
    ["last_phdos_peak", "max_pfc",
     "log_g_vrh", "log_k_vrh",
     "log_klat_300", "log_kp_300"]
    + [f"log_msd_{stat}_{temp}"
       for stat in ("all", "max", "mean")
       for temp in (300, 600)]
    + [f"{prop}_{temp}"
       for prop in ("Cv", "H", "S", "U")
       for temp in (25, 305, 705)]
)
# Provide the absolute path to the `data` directory of
# https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors
# (after cloning the repository locally) in the `data_parent_dir` variable below.
# Placeholder — must be edited to a real absolute path before running.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"
# Provide a valid absolute path for logging model metrics with MLflow in the
# `mlflow.set_tracking_uri` call below.
# File-based MLflow tracking store; placeholder path must be edited before running.
mlflow.set_tracking_uri('file:///some/valid/path/on/your/system')
# Degree of parallelism passed to feature selection and model fitting below;
# counts only physical cores (logical=False).
num_jobs = psutil.cpu_count(logical=False) # This will use all physical cores on the system. Please reduce it as per needs
# 📝 Note
# The snippet below is for a 5-fold cross-validation (CV) run.
# Change the number of splits in the `cv_outer` variable from 5 to 10 to
# obtain results for 10-fold CV; 10-fold CV results are used in the
# corrected resampling t-tests.
# Don't forget to change the working directory to avoid overwriting your
# existing 5-fold CV run results.
# Train and evaluate an RF model for every target / feature-set combination,
# logging per-fold metrics and their cross-fold aggregates to MLflow.
# Fold artifacts (fitted selection pipeline and raw results) are pickled into
# a per-run subdirectory of `parent_dir`.
for target_name in target_names:
    for feat_type in ["matminer", "matminer_lob"]:
        target, feat = get_dataset(feat_type=feat_type, target_name=target_name,
                                   data_parent_dir=data_parent_dir)
        # Drop any feature column containing missing values.
        feat.dropna(axis=1, inplace=True)
        # Fixed seed keeps fold membership reproducible across runs.
        cv_outer = KFold(n_splits=5, shuffle=True, random_state=18012019)
        experiment_name = f"{model_type}_experiment_{target_name}"
        mlflow.set_experiment(experiment_name)
        # Use an absolute run directory so repeated iterations can never nest
        # directories, and restore the working directory in a `finally` so an
        # exception inside a fold does not strand the process in the run dir.
        run_dir = os.path.join(parent_dir,
                               f"{model_type}_{target_name}_{feat_type}")
        os.makedirs(run_dir, exist_ok=True)
        os.chdir(run_dir)
        try:
            with mlflow.start_run(
                    run_name=f"{target_name}_{feat_type}",
                    experiment_id=mlflow.get_experiment_by_name(
                        experiment_name).experiment_id):
                all_results = {
                    "train_mae": [], "test_mae": [],
                    "train_rmse": [], "test_rmse": [],
                    "train_r2": [], "test_r2": [],
                    "train_mape": [], "test_mape": [],
                }
                for fold_ind, (train_ix, test_ix) in enumerate(
                        cv_outer.split(feat)):
                    X_train, X_test = feat.iloc[train_ix], feat.iloc[test_ix]
                    y_train = target.iloc[train_ix, 0]
                    y_test = target.iloc[test_ix, 0]
                    # Feature selection is fit on the training split only,
                    # avoiding leakage from the held-out fold.
                    pipe, X_train_fil = get_relevant_features(
                        X_train=X_train,
                        y_train=y_train.values.flatten(),
                        grootcv_n_iter=50,
                        **{"all_rel_feats__n_jobs": num_jobs})
                    with open(f"{fold_ind + 1}_pipeline.pkl", "wb") as f:
                        pickle.dump(pipe, f)
                    # Restrict the test split to the selected feature columns.
                    X_test_fil = X_test.loc[:, X_train_fil.columns]
                    result = train_eval_fold(
                        fold_ind=fold_ind,
                        X_train=X_train_fil,
                        y_train=y_train,
                        X_test=X_test_fil,
                        y_test=y_test,
                        model_type="rf",
                        **{"n_jobs": num_jobs})
                    for metric, value in result.items():
                        if isinstance(value, float):
                            all_results[metric].append(value)
                        elif "train" in metric:
                            # Non-float entries are per-sample error arrays;
                            # record their mean as the fold MAE.
                            all_results["train_mae"].append(value.mean())
                        else:
                            all_results["test_mae"].append(value.mean())
                    with open(f"{fold_ind + 1}_results.pkl", "wb") as f:
                        pickle.dump(result, f)
                # Log aggregate statistics of every metric across folds.
                for metric, values in all_results.items():
                    arr = np.asarray(values)
                    mlflow.log_metric(f"{metric}_mean", arr.mean())
                    mlflow.log_metric(f"{metric}_min", arr.min())
                    mlflow.log_metric(f"{metric}_max", arr.max())
                    mlflow.log_metric(f"{metric}_std", arr.std())
        finally:
            os.chdir(parent_dir)