# Distance correlation
# Script to generate the distance-correlation heatmap plots shown in the manuscript.
import os
import psutil
import warnings
import matplotlib
from sklearn.preprocessing import StandardScaler
from mlproject.data.preprocessing import get_dataset
from mlproject.training.feature_selection import get_relevant_features
from mlproject.corr_analysis.distance_correlation import evaluate_distance_correlation_matrix_bootstrap
from mlproject.plotting.distance_correlation import plot_distance_correlation_heatmap
from mlproject.utils.misc import split_features
# Suppress every warning category from this point on.
warnings.simplefilter("ignore")

# pdf.fonttype 42 makes matplotlib embed TrueType fonts in saved PDF files.
matplotlib.rcParams.update({"pdf.fonttype": 42})
# Target properties analysed by this script, one heatmap per entry.
# The order below is the processing order of the main loop.
target_names = [
    "last_phdos_peak",
    "max_pfc",
    "log_g_vrh",
    "log_k_vrh",
    "log_klat_300",
    "log_kp_300",
    "log_msd_all_300",
    "log_msd_all_600",
    "log_msd_max_300",
    "log_msd_max_600",
    "log_msd_mean_300",
    "log_msd_mean_600",
    "Cv_25",
    "Cv_305",
    "Cv_705",
    "H_25",
    "H_305",
    "H_705",
    "S_25",
    "S_305",
    "S_705",
    "U_25",
    "U_305",
    "U_705",
]
# Clone the repository locally, then set the `data_parent_dir` variable below to the absolute path of its data directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).
# Placeholder: replace with the absolute path to the data directory of the
# locally cloned repository. It must be absolute because the working
# directory is changed just below.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

# All results are written beneath ./dcor_analysis; the script runs from inside
# it and remembers that location as parent_dir.
os.makedirs("dcor_analysis", exist_ok=True)
os.chdir("dcor_analysis")
parent_dir = os.getcwd()

# Use all physical cores by default; reduce this value as needed.
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined, so fall back to the logical count and finally to 1.
num_jobs = psutil.cpu_count(logical=False) or os.cpu_count() or 1
%%capture --no-display
# Local import: pyplot is needed only to release each figure after saving.
import matplotlib.pyplot as plt

# For every target: select relevant features per descriptor family, compute
# the bootstrapped distance-correlation matrices between the feature sets and
# the target, then store the matrices as JSON and the heatmap as images.
for target_name in target_names:
    target, all_feat = get_dataset(
        target_name=target_name,
        feat_type="matminer_lob",
        data_parent_dir=data_parent_dir,
    )
    lob_feats, matminer_feats = split_features(feats=all_feat.columns)
    y = target.iloc[:, 0]

    # Candidate feature tables; columns containing NaN are dropped from each.
    # Insertion order fixes both the feature-selection call order and the
    # order of the entries handed to the correlation routine.
    candidates = {
        "LOBSTER": all_feat.loc[:, lob_feats].dropna(axis=1),
        "MATMINER": all_feat.loc[:, matminer_feats].dropna(axis=1),
        "LOBSTER+MATMINER": all_feat.dropna(axis=1),
    }

    sets = {}
    for set_name, features in candidates.items():
        # The fitted selection pipeline (first return value) is not needed here.
        _, selected = get_relevant_features(
            X_train=features,
            y_train=y,
            grootcv_n_iter=50,
            grootcv_nfolds=5,
            all_rel_feats__n_jobs=num_jobs,
        )
        # A fresh scaler per set: each set is standardised independently.
        sets[set_name] = StandardScaler().fit_transform(selected.values)

    # The target is added as a single standardised column, keyed by its name.
    sets[f"{target_name}"] = StandardScaler().fit_transform(
        y.values.reshape(-1, 1)
    )

    os.makedirs(f"{target_name}", exist_ok=True)
    mean_mat, std_mat, mean_pvals = evaluate_distance_correlation_matrix_bootstrap(
        sets=sets, num_resamples=1000, num_bootstrap=20
    )
    mean_mat.to_json(f"{target_name}/dcor_matrix.json")
    mean_pvals.to_json(f"{target_name}/pvals_matrix.json")
    std_mat.to_json(f"{target_name}/dcor_std_matrix.json")

    fig = plot_distance_correlation_heatmap(
        mat=mean_mat,
        pvals=mean_pvals,
        std_mat=std_mat,
        title=f"Distance correlation: {target_name}",
        cmap="Blues",
        show_values=True,
    )
    for img_ext in ["png", "svg", "pdf"]:
        fig.savefig(f"{target_name}/heatmap.{img_ext}")
    # Release the figure; otherwise pyplot keeps one open figure per target
    # alive until the process ends.
    # NOTE(review): assumes the plotting helper returns a matplotlib Figure - confirm.
    plt.close(fig)

    # Return to the output root before the next target (a no-op unless a
    # helper changed the working directory).
    os.chdir(parent_dir)