Source code for mlproject.data.featurizer

"""
Function to get matminer features from structures via preset from MODNet.
"""

import pandas as pd
import numpy as np
import warnings
from monty.serialization import MontyDecoder
from modnet.preprocessing import MODData
from matminer.featurizers.site import (
    AGNIFingerprints,
    AverageBondAngle,
    AverageBondLength,
    BondOrientationalParameter,
    ChemEnvSiteFingerprint,
    CoordinationNumber,
    CrystalNNFingerprint,
    GaussianSymmFunc,
    GeneralizedRadialDistributionFunction,
    LocalPropertyDifference,
    OPSiteFingerprint,
    VoronoiFingerprint,
    SiteElementalProperty,
)
from matminer.featurizers.base import MultipleFeaturizer
from pymatgen.analysis.local_env import VoronoiNN
from pymatgen.io.lobster import Charge
from lobsterpy.featurize.batch import BatchSummaryFeaturizer, BatchIcoxxlistFeaturizer
from lobsterpy.featurize.core import FeaturizeIcoxxlist
from lobsterpy.featurize.utils import get_file_paths
from scipy.stats import kurtosis, skew

warnings.filterwarnings("ignore")


[docs] def get_matminer_feats(structures_df: pd.DataFrame, n_jobs: int = 8) -> pd.DataFrame: """ Get featurized dataframe using pymatgen structure object. Uses MODNet featurizer which uses matminer featurizer implementations Args: structures_df: pandas Dataframe object column named "structure"(pymatgen structures in dict format) n_jobs: number of parallel jobs to run for featurization Returns: A pandas dataframe with structure and composition based features """ structures_df["structure"] = structures_df["structure"].apply( MontyDecoder().process_decoded ) mod_data = MODData( materials=structures_df["structure"].values, structure_ids=list(structures_df.index), ) mod_data.featurize(n_jobs=n_jobs) return mod_data.df_featurized
[docs] def get_matminer_site_feats( structures_df: pd.DataFrame, site_featurizers: list | None = None, n_jobs: int = 8 ) -> pd.DataFrame: """ Get site featurized dataframe using pymatgen structure object . Uses Matminers MultipleFeaturizer Args: structures_df: pandas Dataframe object column named "structure"(pymatgen structures in dict format) and "site_index" site_featurizers: list of matminer site based featurizes to apply on input structures_df n_jobs: number of parallel jobs to run for featurization Returns: A pandas dataframe with structure and composition based features for sites """ structures_df["structure"] = structures_df["structure"].apply( MontyDecoder().process_decoded ) if site_featurizers is None: site_featurizers = [ AGNIFingerprints(), AverageBondAngle(VoronoiNN()), AverageBondLength(VoronoiNN()), BondOrientationalParameter(), ChemEnvSiteFingerprint.from_preset("simple"), CoordinationNumber(), CrystalNNFingerprint.from_preset("ops"), GaussianSymmFunc(), GeneralizedRadialDistributionFunction.from_preset("gaussian"), LocalPropertyDifference(), OPSiteFingerprint(), VoronoiFingerprint(), SiteElementalProperty.from_preset("seko-prb-2017"), ] multi_feat = MultipleFeaturizer(featurizers=site_featurizers) matminer_feat_df = multi_feat.featurize_dataframe( structures_df, col_id=["structure", "site_index"], ignore_errors=True ) matminer_feat_df.drop(columns=["structure", "site_index"], inplace=True) matminer_feat_cleaned_df = matminer_feat_df.loc[ :, ~matminer_feat_df.columns.duplicated() ] return matminer_feat_cleaned_df.dropna(axis=1)
[docs] def get_lobster_feats(path_to_lobster_calcs: str, n_jobs: int = 8) -> pd.DataFrame: """ Get featurized dataframe using parent directory path with LOBSTER calcs. Uses LobsterPy featurizer implementations. Args: path_to_lobster_calcs: Path to parent directory containing all LOBSTER calcs n_jobs: Number of parallel jobs to run for featurization Returns: A pandas DataFrame with features extracted from LOBSTER output files """ # Summary features (LobsterPy automatic bonding analysis + COHP based + Charge stats) summary_featurizer = BatchSummaryFeaturizer( path_to_lobster_calcs=path_to_lobster_calcs, n_jobs=n_jobs, bonds="all", charge_type="both", feature_type="bonding", e_range=[-15.0, 0.0], noise_cutoff=1e-4, ) df_summary = summary_featurizer.get_df() # common IcoxxlistFeaturizer settings icoxx_kwargs = { "path_to_lobster_calcs": path_to_lobster_calcs, "normalization": "counts", "bin_width": 0.1, "max_length": 5, "n_jobs": n_jobs, } # BWDF-features types bwdf_types = { "stats": "get_bwdf_df", "binned": "get_bwdf_df", "sorted_bwdf": "get_bwdf_df", "sorted_dists": "get_bwdf_df", } dfs = [df_summary] for bwdf_type, method_name in bwdf_types.items(): featurizer = BatchIcoxxlistFeaturizer( bwdf_df_type=bwdf_type, **icoxx_kwargs, ) dfs.append(getattr(featurizer, method_name)()) # Asymmetry index asi_featurizer = BatchIcoxxlistFeaturizer( bwdf_df_type="sorted_dists", **icoxx_kwargs, ) dfs.append(asi_featurizer.get_asymmetry_index_df()) df_combined = pd.concat(dfs, axis=1) return df_combined
[docs] def get_lobster_site_feat(path_to_lobster_calc: str, site_index: int) -> pd.DataFrame: """ Get featurized dataframe using path provided containing LOBSTER calc files. Uses LobsterPy featurizer implementations. Args: path_to_lobster_calc: Path to directory containing LOBSTER calcs output files site_index: site index of structure Returns: A pandas DataFrame with site features extracted from LOBSTER calculation files """ file_paths = get_file_paths( path_to_lobster_calc=path_to_lobster_calc, requested_files=["structure", "icohplist", "charge"], ) icoxx_featurizer = FeaturizeIcoxxlist( path_to_icoxxlist=file_paths.get("icohplist"), path_to_structure=file_paths.get("structure"), bin_width=0.1, normalization="counts", max_length=5, ) chargeobj = Charge(filename=file_paths.get("charge")) site_bwdf = icoxx_featurizer.calc_site_bwdf(site_index=site_index) bwdf_values = site_bwdf[f"{site_index}"]["icoxx_binned"] stats_fns = { "max": np.max, "mean": np.mean, "std": np.std, "min": np.min, "sum": np.sum, "skew": skew, "kurtosis": kurtosis, } site_feats = { f"site_bwdf_{name}": fn(bwdf_values) for name, fn in stats_fns.items() } site_feats.update( { "site_asi": icoxx_featurizer.calc_site_asymmetry_index(site_index), "charge_loew": chargeobj.loewdin[site_index], "charge_mull": chargeobj.mulliken[site_index], } ) structure_name = file_paths["structure"].parent.name index_name = f"{structure_name}_{site_index}" return pd.DataFrame(data=site_feats, index=[index_name])