ARFS convergence tests

Scripts to test how many GrootCV iterations (the ``n_iter`` parameter) are needed to select a stable set of descriptors.
import warnings

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import psutil
from sklearn.model_selection import KFold

from mlproject.data.preprocessing import get_dataset
from mlproject.training.feature_selection import get_relevant_features
# Silence library warnings so the convergence logs stay readable.
warnings.filterwarnings("ignore")
# Embed TrueType (Type 42) fonts so text in saved PDFs stays editable.
matplotlib.rcParams['pdf.fonttype'] = 42
Provide the absolute path to the ``data`` directory of https://github.com/DigiMatChem/paper-ml-with-lobster-descriptors (after cloning the repository locally) in the ``data_parent_dir`` variable below.
# Absolute path to the cloned repository's ``data`` directory — edit before running.
data_parent_dir = "absolute/path/to/paper-ml-with-lobster-descriptors/data/"
# Target properties to run the convergence study on; presumably these match
# dataset column names understood by get_dataset — TODO confirm against caller.
target_names = ["last_phdos_peak","max_pfc",
"log_g_vrh", "log_k_vrh",
"log_klat_300", "log_kp_300",
"log_msd_all_300", "log_msd_all_600",
"log_msd_max_300", "log_msd_max_600",
"log_msd_mean_300", "log_msd_mean_600",
"Cv_25", "Cv_305", "Cv_705",
"H_25", "H_305", "H_705",
"S_25", "S_305", "S_705",
"U_25", "U_305", "U_705"]
num_jobs = psutil.cpu_count(logical=False) # This will use all physical cores on the system. Please reduce it as per needs
def grootcv_convergence_test(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    n_iter_values: list[int] | None = None,
    similarity_threshold: float = 0.90,
    stable_steps_required: int = 2,
    **kwargs
):
    """
    Test GrootCV convergence by measuring feature selection stability over iterations.

    For each value in ``n_iter_values`` the feature selector is run and the
    Jaccard similarity between consecutive feature selections is recorded.
    Convergence is declared once the similarity stays at or above
    ``similarity_threshold`` for ``stable_steps_required`` consecutive steps.

    Parameters
    ----------
    X_train : pd.DataFrame
        DataFrame with features.
    y_train : pd.DataFrame
        DataFrame with target values as column.
    n_iter_values : list[int], optional
        List of n_iter values to test. Default: range(5, 101, 5).
    similarity_threshold : float, default=0.90
        Jaccard similarity threshold to consider features stable.
    stable_steps_required : int, default=2
        Number of consecutive stable steps required to declare convergence.
    kwargs : dict
        Additional parameters passed to get_relevant_features.

    Returns
    -------
    pd.DataFrame
        Table with n_iter, selected features, and Jaccard similarity to previous step.
    int or None
        Suggested n_iter where convergence is reached, or ``None`` if no
        convergence was observed within ``n_iter_values``.
    """
    if n_iter_values is None:
        n_iter_values = list(range(5, 101, 5))  # e.g., [5, 10, 15, ..., 100]

    def jaccard_similarity(set_a: set, set_b: set) -> float:
        # Guard the empty union: if neither step selected any features the
        # selections are identical, and dividing would raise ZeroDivisionError.
        union = set_a | set_b
        return len(set_a & set_b) / len(union) if union else 1.0

    results = []
    prev_features = None
    stable_count = 0
    convergence_iter = None

    for n_iter in n_iter_values:
        print(f"Testing GrootCV with n_iter={n_iter}")
        pipeline, X_sel = get_relevant_features(
            X_train, y_train.values.flatten(), grootcv_n_iter=n_iter, **kwargs
        )
        # get_relevant_features may return the reduced DataFrame directly, or a
        # pipeline whose selector step exposes the chosen feature names.
        if isinstance(X_sel, pd.DataFrame):
            selected_features = set(X_sel.columns)
        else:
            selected_features = set(
                getattr(pipeline.named_steps["all_rel_feats"], "selected_features_", [])
            )

        if prev_features is not None:
            similarity = jaccard_similarity(prev_features, selected_features)
            if similarity >= similarity_threshold:
                stable_count += 1
                if stable_count >= stable_steps_required and convergence_iter is None:
                    convergence_iter = n_iter
            else:
                stable_count = 0  # stability streak broken; start counting again
        else:
            similarity = 0.0  # first iteration: nothing to compare against

        results.append({
            "n_iter": n_iter,
            "selected_features": selected_features,
            "num_features": len(selected_features),
            "jaccard_similarity": similarity,
        })
        prev_features = selected_features

    df_results = pd.DataFrame(results)
    return df_results, convergence_iter
def aggregate_convergence_results(
    X: pd.DataFrame,
    y: pd.DataFrame,
    n_iter_values: list[int] | None = None,
    similarity_threshold: float = 0.90,
    stable_steps_required: int = 2,
    n_splits: int = 5,
    random_state: int = 18012019,
    **kwargs
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Aggregate convergence results across cross-validation folds.

    This function performs an outer K-fold cross-validation loop and, for each
    fold, runs a convergence test over multiple iteration counts. The results
    from all folds are concatenated and aggregated by iteration count using
    the mean and standard deviation of the Jaccard similarity.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of shape (n_samples, n_features).
    y : pandas.Series
        Target vector of shape (n_samples,).
    n_iter_values : list of int, optional
        Sequence of iteration counts to evaluate during the convergence test.
        Defaults to ``[2, 5, 10, 15]``.
    similarity_threshold : float, default=0.90
        Threshold for determining convergence based on similarity metrics.
    stable_steps_required : int, default=2
        Number of consecutive stable steps required to declare convergence.
    n_splits : int, default=5
        Number of splits for the outer K-fold cross-validation.
    random_state : int, default=18012019
        Random seed used to ensure reproducibility of the cross-validation splits.
    **kwargs : dict
        Additional keyword arguments passed directly to
        ``grootcv_convergence_test``.

    Returns
    -------
    combined_df : pandas.DataFrame
        Concatenated results from all cross-validation folds. Contains
        per-fold convergence metrics, including the iteration count
        and Jaccard similarity.
    agg_df : pandas.DataFrame
        Aggregated convergence statistics grouped by iteration count.
        Contains the mean and standard deviation of the Jaccard similarity
        across folds for each ``n_iter`` value.
    """
    # Use a None sentinel instead of a mutable list default; the effective
    # default is unchanged.
    if n_iter_values is None:
        n_iter_values = [2, 5, 10, 15]

    all_fold_results = []
    cv_outer = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Only the training split is needed: the convergence test runs feature
    # selection on the training data of each fold.
    for fold_id, (train_ix, _test_ix) in enumerate(cv_outer.split(X), start=1):
        print(f"Outer Fold {fold_id}/{n_splits}")
        X_train = X.iloc[train_ix]
        y_train = y.iloc[train_ix]
        df_results, _ = grootcv_convergence_test(
            X_train, y_train,
            n_iter_values=n_iter_values,
            similarity_threshold=similarity_threshold,
            stable_steps_required=stable_steps_required,
            **kwargs
        )
        df_results["fold"] = fold_id
        all_fold_results.append(df_results)

    combined_df = pd.concat(all_fold_results, ignore_index=True)
    # Aggregate by n_iter: mean and std of the Jaccard similarity across folds.
    agg_df = (
        combined_df.groupby("n_iter")["jaccard_similarity"]
        .agg(["mean", "std"])
        .reset_index()
    )
    return combined_df, agg_df
def plot_n_iter_convergence_across_folds(
    agg_df: pd.DataFrame,
    similarity_threshold: float = 0.90
):
    """
    Plot mean Jaccard similarity versus ``n_iter`` across CV folds.

    Draws an error-bar curve of the fold-averaged Jaccard similarity (± one
    standard deviation) and a dashed horizontal line marking the similarity
    threshold used to judge convergence.

    Parameters
    ----------
    agg_df : pandas.DataFrame
        Aggregated convergence results grouped by iteration count. Must
        contain the columns ``n_iter`` (iteration count), ``mean`` (mean
        Jaccard similarity across folds) and ``std`` (standard deviation of
        the Jaccard similarity across folds).
    similarity_threshold : float, default=0.90
        Similarity threshold displayed as a horizontal reference line.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The Matplotlib figure object containing the convergence plot.
    """
    figure, axis = plt.subplots(figsize=(8, 5))

    iterations = agg_df["n_iter"]
    mean_similarity = agg_df["mean"]
    similarity_spread = agg_df["std"]

    axis.errorbar(
        iterations,
        mean_similarity,
        yerr=similarity_spread,
        fmt="o-",
        capsize=5,
        color="blue",
        label="Mean Jaccard Similarity ± STD",
    )
    axis.axhline(
        y=similarity_threshold,
        color="red",
        linestyle="--",
        label="Similarity Threshold",
    )

    axis.set_xlabel("n_iter")
    axis.set_ylabel("Mean Jaccard Similarity")
    axis.set_title("GrootCV Convergence Across Folds")
    axis.legend()
    axis.grid(True)
    return figure
%%capture --no-display
# Run the convergence study for every target/descriptor-set combination and
# persist the per-fold table, the aggregated statistics, and a convergence plot.
for target_name in target_names:
    for feat_type in ["matminer", "matminer_lob"]:
        target, feat = get_dataset(target_name=target_name, feat_type=feat_type,
                                   data_parent_dir=data_parent_dir)
        # Drop descriptor columns with missing values before feature selection.
        feat.dropna(axis=1, inplace=True)
        combined_df, agg_df = aggregate_convergence_results(
            X=feat, y=target,
            n_iter_values=[1, 2, 5, 10, 15, 20, 30, 40, 50],
            similarity_threshold=0.90,
            stable_steps_required=2,
            n_splits=5,
            **{"all_rel_feats__n_jobs": num_jobs}
        )
        combined_df.to_json(f"combined_feat_info_{target_name}_{feat_type}.json")
        agg_df.to_json(f"aggregated_feat_info_{target_name}_{feat_type}.json")
        fig = plot_n_iter_convergence_across_folds(
            agg_df,
            similarity_threshold=0.90,
        )
        fig.savefig(f"{target_name}_{feat_type}_n_iter_convergence.pdf", dpi=300)
        # Close each figure explicitly: this double loop creates dozens of
        # figures and pyplot keeps them all alive (and warns past 20 open
        # figures) unless they are closed.
        plt.close(fig)