PFI + SHAP

Script to extract and plot the top descriptors, ranked by permutation feature importance (PFI) and SHAP (SHapley Additive exPlanations) scores, from the trained RF and MODNet models.

import os
import matplotlib
import matplotlib.pyplot as plt
from mlproject.postprocess.feature_importances import get_rf_pfi_shap_summary, get_modnet_pfi_shap_summary
from mlproject.plotting.importances import plot_feature_importance
matplotlib.rcParams['pdf.fonttype'] = 42

After cloning the repository locally, set the data_parent_dir variable below to the absolute path of its data directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).

# --- User configuration -------------------------------------------------------
# Top directory with all models saved.
models_parent_dir = "/path/to/parent/dir/with/saved/models/"
# Path to the `data` directory of the locally cloned repository.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

# Targets for which importances are computed, in processing order.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    # log_*_vrh targets
    "log_g_vrh",
    "log_k_vrh",
    # log_k*_300 targets
    "log_klat_300",
    "log_kp_300",
    # log_msd_* targets
    "log_msd_all_300",
    "log_msd_all_600",
    "log_msd_max_300",
    "log_msd_max_600",
    "log_msd_mean_300",
    "log_msd_mean_600",
    # Cv_*/H_*/S_*/U_* targets (the loops below plot 30 features for these
    # and 20 for every other target)
    "Cv_25",
    "Cv_305",
    "Cv_705",
    "H_25",
    "H_305",
    "H_705",
    "S_25",
    "S_305",
    "S_705",
    "U_25",
    "U_305",
    "U_705",
]

# Working directory at the time this cell is executed.
parent_dir = os.getcwd()

Compute and plot RF Model PFI and SHAP scores

# For every target: compute the RF PFI and SHAP summaries, store them as JSON,
# and save one importance plot per summary as both PDF and PNG (300 dpi) in
# the "rf_pfi_shap" directory (created relative to the working directory).
os.makedirs("rf_pfi_shap", exist_ok=True)
for target_name in target_names:
    # Targets whose name contains Cv_/H_/U_/S_ are plotted with 30 features,
    # every other target with 20.
    n_feats = 30 if any(prefix in target_name for prefix in ["Cv_", "H_", "U_", "S_"]) else 20
    pfi_summary, shap_summary = get_rf_pfi_shap_summary(
        models_parent_dir=models_parent_dir,
        data_parent_dir=data_parent_dir,
        target_name=target_name,
    )

    pfi_summary.to_json(f"rf_pfi_shap/pfi_summary_{target_name}.json")
    shap_summary.to_json(f"rf_pfi_shap/shap_summary_{target_name}.json")

    fig1 = plot_feature_importance(
        pfi_summary,
        target_name=target_name,
        model_name="RF",
        importance_type="PFI",
        n_feats=n_feats,
    )
    fig1.savefig(f"rf_pfi_shap/pfi_{target_name}.pdf")
    fig1.savefig(f"rf_pfi_shap/pfi_{target_name}.png", dpi=300)
    # Close this figure explicitly: a bare plt.close() only closes the
    # *current* figure, so the PFI figure would otherwise stay open for
    # every target and accumulate in memory.
    plt.close(fig1)

    fig2 = plot_feature_importance(
        shap_summary,
        target_name=target_name,
        model_name="RF",
        importance_type="SHAP",
        n_feats=n_feats,
    )
    fig2.savefig(f"rf_pfi_shap/shap_{target_name}.pdf")
    fig2.savefig(f"rf_pfi_shap/shap_{target_name}.png", dpi=300)
    plt.close(fig2)

Compute and plot MODNet Model PFI and SHAP scores

# For every target: compute the MODNet PFI and SHAP summaries, store them as
# JSON, and save one importance plot per summary as both PDF and PNG (300 dpi)
# in the "modnet_pfi_shap" directory (created relative to the working
# directory).
os.makedirs("modnet_pfi_shap", exist_ok=True)
for target_name in target_names:
    # Targets whose name contains Cv_/H_/U_/S_ are plotted with 30 features,
    # every other target with 20.
    n_feats = 30 if any(prefix in target_name for prefix in ["Cv_", "H_", "U_", "S_"]) else 20
    pfi_summary, shap_summary = get_modnet_pfi_shap_summary(
        models_parent_dir=models_parent_dir,
        data_parent_dir=data_parent_dir,
        target_name=target_name,
    )

    pfi_summary.to_json(f"modnet_pfi_shap/pfi_summary_{target_name}.json")
    shap_summary.to_json(f"modnet_pfi_shap/shap_summary_{target_name}.json")

    fig1 = plot_feature_importance(
        pfi_summary,
        target_name=target_name,
        model_name="MODNet",
        importance_type="PFI",
        n_feats=n_feats,
    )
    fig1.savefig(f"modnet_pfi_shap/pfi_{target_name}.pdf")
    fig1.savefig(f"modnet_pfi_shap/pfi_{target_name}.png", dpi=300)
    # Close this figure explicitly: a bare plt.close() only closes the
    # *current* figure, so the PFI figure would otherwise stay open for
    # every target and accumulate in memory.
    plt.close(fig1)

    fig2 = plot_feature_importance(
        shap_summary,
        target_name=target_name,
        model_name="MODNet",
        importance_type="SHAP",
        n_feats=n_feats,
    )
    fig2.savefig(f"modnet_pfi_shap/shap_{target_name}.pdf")
    fig2.savefig(f"modnet_pfi_shap/shap_{target_name}.png", dpi=300)
    plt.close(fig2)