Dependency graphs
Script to generate the dependency graphs and feature-learnability plots, computed with a random forest (RF) regressor as shown in the Dependency graphs section, for each target property.
import os
import psutil
import numpy as np
import warnings
import matplotlib
from mlproject.data.preprocessing import get_dataset
from mlproject.training.feature_selection import get_relevant_features
from mlproject.corr_analysis.dependency_graph import evaluate_feature_set_relationships, evaluate_feature_set_to_feature, evaluate_features_to_target
from mlproject.plotting.dependency_graph import plot_feature_learnability, plot_dependency_graph_from_df
from mlproject.utils.misc import split_features
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")
# Store fonts as TrueType (font type 42) in saved PDF figures.
matplotlib.rcParams.update({"pdf.fonttype": 42})
# Names of the target properties processed by this script, one analysis per entry.
# The order below is the processing order.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    # log_* elastic / conductivity style targets
    "log_g_vrh",
    "log_k_vrh",
    "log_klat_300",
    "log_kp_300",
    # log_msd_* targets (all / max / mean, suffixes 300 and 600)
    "log_msd_all_300",
    "log_msd_all_600",
    "log_msd_max_300",
    "log_msd_max_600",
    "log_msd_mean_300",
    "log_msd_mean_600",
    # Cv / H / S / U targets (suffixes 25, 305 and 705)
    "Cv_25",
    "Cv_305",
    "Cv_705",
    "H_25",
    "H_305",
    "H_705",
    "S_25",
    "S_305",
    "S_705",
    "U_25",
    "U_305",
    "U_705",
]
After cloning the repository locally, set the data_parent_dir variable below to the absolute path of its data directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).
# Placeholder: replace with the absolute path to the cloned repository's data folder.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

# Create the output folder (if missing), switch into it, and remember its
# absolute path so the working directory can be restored later.
analysis_dir = "dependency_graph_analysis"
os.makedirs(analysis_dir, exist_ok=True)
os.chdir(analysis_dir)
parent_dir = os.getcwd()

# Physical (not logical) core count reported by psutil, intended as the number
# of parallel jobs. Lower it to leave cores free for other work.
num_jobs = psutil.cpu_count(logical=False)
%%capture --no-display
def _save_metrics(metrics_df, summary_df, tag):
    """Write a metrics/summary pair to JSON files in the current directory.

    Args:
        metrics_df: Object exposing ``to_json``; saved as ``metrics_<tag>.json``.
        summary_df: Object exposing ``to_json``; saved as ``summary_<tag>.json``.
        tag: Short label identifying the evaluation (e.g. ``"m2l"``).
    """
    metrics_df.to_json(f"metrics_{tag}.json")
    summary_df.to_json(f"summary_{tag}.json")


# Keyword arguments shared by the three feature-selection runs. The job count
# comes from ``num_jobs`` (the detected physical core count) rather than a
# hard-coded value; psutil.cpu_count(logical=False) may return None when the
# count cannot be determined, in which case a single job is used.
selection_kwargs = {
    "grootcv_n_iter": 50,
    "grootcv_nfolds": 5,
    "all_rel_feats__n_jobs": num_jobs or 1,
}

# Every figure is written once per extension listed here.
fig_formats = ["svg", "pdf", "png"]

for target_name in target_names:
    # Load dataset
    target, all_feat = get_dataset(
        target_name=target_name,
        feat_type="matminer_lob",
        data_parent_dir=data_parent_dir,
    )

    # Split LOBSTER and matminer feature names
    lob_feats, matminer_feats = split_features(feats=all_feat.columns)

    # Build X_all, X_lob, X_matminer (dropping columns that contain NaN) and y
    X_all = all_feat.dropna(axis=1)
    X_lob_all = all_feat.loc[:, lob_feats].dropna(axis=1)
    X_matminer_all = all_feat.loc[:, matminer_feats].dropna(axis=1)
    y = target.iloc[:, 0]

    # Feature selection on each feature set; only the filtered frames are
    # used below, the first return values are kept but not used in this loop.
    X_l_pipe, X_lob = get_relevant_features(
        X_train=X_lob_all, y_train=y, **selection_kwargs
    )
    X_m_pipe, X_matminer = get_relevant_features(
        X_train=X_matminer_all, y_train=y, **selection_kwargs
    )
    X_a_pip, X_all_fil = get_relevant_features(
        X_train=X_all, y_train=y, **selection_kwargs
    )

    # Use an absolute path so the output location does not depend on whatever
    # the working directory happens to be (e.g. after an interrupted run).
    target_dir = os.path.join(parent_dir, target_name)
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)
    try:
        # Matminer to LOBSTER
        metrics_m2l, summary_m2l = evaluate_feature_set_to_feature(
            X_matminer, X_lob, n_splits=5
        )
        _save_metrics(metrics_m2l, summary_m2l, "m2l")

        # LOBSTER to matminer
        metrics_l2m, summary_l2m = evaluate_feature_set_to_feature(
            X_lob, X_matminer, n_splits=5
        )
        _save_metrics(metrics_l2m, summary_l2m, "l2m")

        # LOBSTER to target
        metrics_l2t_df, summary_l2t = evaluate_features_to_target(
            X_lob, y, n_splits=5
        )
        _save_metrics(metrics_l2t_df, summary_l2t, "l2t")

        # Matminer to target
        metrics_m2t_df, summary_m2t = evaluate_features_to_target(
            X_matminer, y, n_splits=5
        )
        _save_metrics(metrics_m2t_df, summary_m2t, "m2t")

        # All selected features to target
        metrics_a2t_df, summary_a2t = evaluate_features_to_target(
            X_all_fil, y, n_splits=5
        )
        _save_metrics(metrics_a2t_df, summary_a2t, "a2t")

        # Mean R2 of the combined feature set, shown in the graph title.
        r2_combined_feat = np.round(summary_a2t.loc["R2", "mean"], 5)

        # --------------------------------------------------
        # Now run the multi-output RF regression for the final dependency graph
        # Compute relationships
        results = evaluate_feature_set_relationships(
            X_lob, X_matminer, y, n_splits=5
        )
        results.to_json("multioutput_regression_summary.json")

        for fig_ext in fig_formats:
            # Plot the dependency graph from the results DataFrame
            plot_dependency_graph_from_df(
                results_df=results,
                feature1_name="Lobster Features",
                feature2_name="Matminer Features",
                target_name=target_name,
                metric="R2 Mean",  # or "MAE Mean", "RMSE Mean", etc.
                node_colors={
                    "LOBSTER": "#a6cee3",
                    "MATMINER": "#fdbf6f",
                    target_name: "#99d8c9",
                },
                title=f"(MATMINER+LOBSTER) = {r2_combined_feat}",
                save_path=f"{target_name}_feat_metrics.{fig_ext}",
            )
            # Plot the matminer-to-LOBSTER feature learnability figure
            plot_feature_learnability(
                results=summary_m2l,
                title="LOBSTER Feature Learnability from MATMINER Features",
                save_path=f"{target_name}_m2l_learnability.{fig_ext}",
                n_feats=20,
            )
    finally:
        # Always return to the analysis root, even if this target failed, so
        # a re-run does not create nested output folders.
        os.chdir(parent_dir)