# PFI + SHAP
Script to extract and plot the top descriptors, ranked by PFI and SHAP scores, from trained models.
import os
import matplotlib
import matplotlib.pyplot as plt
from mlproject.postprocess.feature_importances import get_rf_pfi_shap_summary, get_modnet_pfi_shap_summary
from mlproject.plotting.importances import plot_feature_importance
# Write fonts into PDF output as Type 42 (TrueType) rather than matplotlib's
# default Type 3. This affects every `.pdf` figure saved after this point.
matplotlib.rcParams['pdf.fonttype'] = 42
Clone https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors locally, then set the `data_parent_dir` variable below to the absolute path of the repository's `data` directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).
# --- User configuration -----------------------------------------------------
# Top-level directory that holds all saved models (placeholder: edit before running).
models_parent_dir = "/path/to/parent/dir/with/saved/models/"
# Absolute path to the cloned repository's `data` directory (placeholder: edit before running).
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

# Names of every target processed by the loops below. The order is significant
# only in that it fixes the processing order.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    "log_g_vrh",
    "log_k_vrh",
    "log_klat_300",
    "log_kp_300",
    # log_msd_<kind>_<suffix>: kinds all/max/mean, each with suffixes 300 and 600.
    *(
        f"log_msd_{kind}_{suffix}"
        for kind in ("all", "max", "mean")
        for suffix in (300, 600)
    ),
    # <stem>_<suffix>: stems Cv/H/S/U, each with suffixes 25, 305 and 705.
    *(
        f"{stem}_{suffix}"
        for stem in ("Cv", "H", "S", "U")
        for suffix in (25, 305, 705)
    ),
]

# Working directory at the time this cell runs.
parent_dir = os.getcwd()
## Compute and plot RF model PFI and SHAP scores
# For every target: compute the RF PFI and SHAP summaries, dump each summary to
# JSON, and save one PFI plot and one SHAP plot as both PDF and PNG.
# All outputs go to `rf_pfi_shap/`, relative to the current working directory.
rf_out_dir = "rf_pfi_shap"
os.makedirs(rf_out_dir, exist_ok=True)

for target_name in target_names:
    # Targets whose name contains "Cv_", "H_", "U_" or "S_" (substring match)
    # are plotted with n_feats=30; every other target uses n_feats=20.
    n_feats = 30 if any(prefix in target_name for prefix in ["Cv_", "H_", "U_", "S_"]) else 20

    pfi_summary, shap_summary = get_rf_pfi_shap_summary(
        models_parent_dir=models_parent_dir,
        data_parent_dir=data_parent_dir,
        target_name=target_name,
    )
    pfi_summary.to_json(f"{rf_out_dir}/pfi_summary_{target_name}.json")
    shap_summary.to_json(f"{rf_out_dir}/shap_summary_{target_name}.json")

    fig1 = plot_feature_importance(
        pfi_summary,
        target_name=target_name,
        model_name="RF",
        importance_type="PFI",
        n_feats=n_feats,
    )
    fig1.savefig(f"{rf_out_dir}/pfi_{target_name}.pdf")
    fig1.savefig(f"{rf_out_dir}/pfi_{target_name}.png", dpi=300)

    fig2 = plot_feature_importance(
        shap_summary,
        target_name=target_name,
        model_name="RF",
        importance_type="SHAP",
        n_feats=n_feats,
    )
    fig2.savefig(f"{rf_out_dir}/shap_{target_name}.pdf")
    fig2.savefig(f"{rf_out_dir}/shap_{target_name}.png", dpi=300)

    # Two figures are created per iteration. A bare `plt.close()` closes only
    # the current figure, so the other one would pile up across targets.
    # Close everything once both plots are on disk.
    plt.close("all")
## Compute and plot MODNet model PFI and SHAP scores
# For every target: compute the MODNet PFI and SHAP summaries, dump each summary
# to JSON, and save one PFI plot and one SHAP plot as both PDF and PNG.
# All outputs go to `modnet_pfi_shap/`, relative to the current working directory.
modnet_out_dir = "modnet_pfi_shap"
os.makedirs(modnet_out_dir, exist_ok=True)

for target_name in target_names:
    # Targets whose name contains "Cv_", "H_", "U_" or "S_" (substring match)
    # are plotted with n_feats=30; every other target uses n_feats=20.
    n_feats = 30 if any(prefix in target_name for prefix in ["Cv_", "H_", "U_", "S_"]) else 20

    pfi_summary, shap_summary = get_modnet_pfi_shap_summary(
        models_parent_dir=models_parent_dir,
        data_parent_dir=data_parent_dir,
        target_name=target_name,
    )
    pfi_summary.to_json(f"{modnet_out_dir}/pfi_summary_{target_name}.json")
    shap_summary.to_json(f"{modnet_out_dir}/shap_summary_{target_name}.json")

    fig1 = plot_feature_importance(
        pfi_summary,
        target_name=target_name,
        model_name="MODNet",
        importance_type="PFI",
        n_feats=n_feats,
    )
    fig1.savefig(f"{modnet_out_dir}/pfi_{target_name}.pdf")
    fig1.savefig(f"{modnet_out_dir}/pfi_{target_name}.png", dpi=300)

    fig2 = plot_feature_importance(
        shap_summary,
        target_name=target_name,
        model_name="MODNet",
        importance_type="SHAP",
        n_feats=n_feats,
    )
    fig2.savefig(f"{modnet_out_dir}/shap_{target_name}.pdf")
    fig2.savefig(f"{modnet_out_dir}/shap_{target_name}.png", dpi=300)

    # Two figures are created per iteration. A bare `plt.close()` closes only
    # the current figure, so the other one would pile up across targets.
    # Close everything once both plots are on disk.
    plt.close("all")