Distance correlation

Distance correlation

Script to generate the distance-correlation heatmap plots shown in the manuscript.

import os
import psutil
import warnings
import matplotlib
from sklearn.preprocessing import StandardScaler
from mlproject.data.preprocessing import get_dataset
from mlproject.training.feature_selection import get_relevant_features
from mlproject.corr_analysis.distance_correlation import evaluate_distance_correlation_matrix_bootstrap
from mlproject.plotting.distance_correlation import plot_distance_correlation_heatmap
from mlproject.utils.misc import split_features
# Silence every Python warning for the rest of the session so that the long
# per-target loop below produces clean output.
warnings.filterwarnings("ignore")
# NOTE(review): 42 presumably selects Type 42 (TrueType) font embedding so text
# in the saved PDFs stays editable -- confirm against the matplotlib rcParams docs.
matplotlib.rcParams['pdf.fonttype'] = 42
# Names of the targets processed by the loop below, in processing order.
# Each line groups targets that share a name prefix.
target_names = [
    "last_phdos_peak", "max_pfc",
    "log_g_vrh", "log_k_vrh",
    "log_klat_300", "log_kp_300",
    "log_msd_all_300", "log_msd_all_600",
    "log_msd_max_300", "log_msd_max_600",
    "log_msd_mean_300", "log_msd_mean_600",
    "Cv_25", "Cv_305", "Cv_705",
    "H_25", "H_305", "H_705",
    "S_25", "S_305", "S_705",
    "U_25", "U_305", "U_705",
]

Clone the repository locally, then set the data_parent_dir variable below to the absolute path of its data directory (https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors/tree/main/data).

# Placeholder: replace with the absolute path to the `data` directory of a
# local clone of paper-ml-with-lobster-descriptors. It must be absolute because
# the working directory is changed just below.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"

# All results are written beneath ./dcor_analysis; switch into it so the
# relative per-target output paths used later resolve there.
os.makedirs("dcor_analysis", exist_ok=True)
os.chdir("dcor_analysis")
# Captured *after* the chdir, so this is the dcor_analysis directory itself,
# not the directory the script was started from.
parent_dir = os.getcwd()

# Use all physical cores by default; reduce this to suit your machine.
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined, so fall back to a single job instead of passing None on.
num_jobs = psutil.cpu_count(logical=False) or 1
%%capture --no-display
# For every target: select relevant features per descriptor family, standardise
# them, compute the bootstrapped distance-correlation matrices, and save the
# matrices (JSON) plus the heatmap (png/svg/pdf) into a folder named after the target.
for target_name in target_names:
    target, all_feat = get_dataset(
        target_name=target_name,
        feat_type="matminer_lob",
        data_parent_dir=data_parent_dir,
    )
    lob_feats, matminer_feats = split_features(feats=all_feat.columns)
    y = target.iloc[:, 0]

    # Candidate feature tables per family, with NaN-containing columns dropped.
    # Insertion order fixes both the feature-selection order and the order of
    # the entries in `sets` below.
    candidates = {
        'LOBSTER': all_feat.loc[:, lob_feats].dropna(axis=1),
        'MATMINER': all_feat.loc[:, matminer_feats].dropna(axis=1),
        'LOBSTER+MATMINER': all_feat.dropna(axis=1),
    }

    # get_relevant_features returns a pair; only the second element (presumably
    # the filtered feature table -- verify against its definition) is used here.
    selected = {}
    for label, frame in candidates.items():
        _, selected[label] = get_relevant_features(
            X_train=frame,
            y_train=y,
            grootcv_n_iter=50,
            grootcv_nfolds=5,
            all_rel_feats__n_jobs=num_jobs,
        )

    # Standardise every matrix independently, then append the target as a
    # single-column matrix under its own name.
    sets = {
        label: StandardScaler().fit_transform(frame.values)
        for label, frame in selected.items()
    }
    sets[f'{target_name}'] = StandardScaler().fit_transform(y.values.reshape(-1, 1))

    os.makedirs(f"{target_name}", exist_ok=True)

    mean_mat, std_mat, mean_pvals = evaluate_distance_correlation_matrix_bootstrap(
        sets=sets,
        num_resamples=1000,
        num_bootstrap=20,
    )
    mean_mat.to_json(f"{target_name}/dcor_matrix.json")
    mean_pvals.to_json(f"{target_name}/pvals_matrix.json")
    std_mat.to_json(f"{target_name}/dcor_std_matrix.json")

    fig = plot_distance_correlation_heatmap(
        mat=mean_mat,
        pvals=mean_pvals,
        std_mat=std_mat,
        title=f"Distance correlation: {target_name}",
        cmap="Blues",
        show_values=True,
    )
    for img_ext in ("png", "svg", "pdf"):
        fig.savefig(f"{target_name}/heatmap.{img_ext}")

# parent_dir was recorded after entering dcor_analysis, so this keeps the
# working directory at that output folder.
os.chdir(parent_dir)