Source code for mlproject.data.preprocessing

"""
Functions for data preprocessing, including loading datasets, renaming and adding units to features.
"""

import pandas as pd
from pathlib import Path
from typing import Literal


UNIT_RULES = [
    # --- Matminer ---
    (True, ("MagpieData", "MeltingT"), " (K)"),
    (True, ("MagpieData", "GSbandgap"), " (eV)"),
    (True, ("MagpieData", "GSvolume_pa"), " (A^3/atom)"),
    (True, ("MagpieData", "CovalentRadius"), " (pm)"),
    (True, ("AtomicOrbitals", "LUMO_energy"), " (eV)"),
    (True, ("AtomicOrbitals", "gap_AO"), " (eV)"),
    (True, ("AtomicOrbitals", "HOMO_energy"), " (eV)"),
    (True, ("BandCenter", "band_center"), " (eV)"),
    (True, ("Miedema", "deltaH_amor"), " (eV/atom)"),
    (True, ("Miedema", "deltaH_ss_min"), " (eV/atom)"),
    (True, ("Miedema", "deltaH_inter"), " (eV/atom)"),
    (False, ("AverageBondLength",), " (A)"),
    (False, ("AverageBondAngle",), " (rad)"),
    (True, ("DensityFeatures", "vpa"), " (A^3/atom)"),
    (True, ("DensityFeatures", "density"), " (g/cm^3)"),
    (True, ("site", "FirstIonizationEnergy"), " (eV)"),
    (True, ("site", "ElectronAffinity"), " (kJ/mol)"),
    (True, ("site", "CovalentRadius"), " (pm)"),
    (True, ("site", "AtomicRadius"), " (A)"),
    (True, ("site", "MeltingT"), " (K)"),
    (True, ("site", "BoilingT"), " (K)"),
    (True, ("site", "Density"), " (g/L)"),
    (True, ("site", "MolarVolume"), " (cm^3)"),
    (True, ("site", "HeatFusion"), " (kJ/mol)"),
    (True, ("site", "HeatVaporization"), " (kJ/mol)"),
    (True, ("Average bond length",), " (A)"),
    (True, ("Voro", "dist"), " (A)"),
    (True, ("Average bond angle",), " (rad)"),
    # --- LOBSTER ---
    (False, ("w_ICOHP",), " (eV)"),
    (False, ("ICOHP_", "asi_", "_asi"), " (eV)"),
    (
        False,
        ("bwdf_", "Icohp_", "center_COHP", "width_COHP", "edge_COHP", "Madelung_"),
        " (eV)",
    ),
    (False, ("dist_at",), " (A)"),
]


[docs] def apply_unit_rule(col: str, new_name: str) -> str: """Apply first matching unit rule.""" if "_kurtosis" in col or "_skew" in col: return new_name for match_all, substrings, unit in UNIT_RULES: if match_all: if all(s in col or s in new_name for s in substrings): return new_name + unit if not new_name.endswith(unit) else new_name else: if any(s in col or s in new_name for s in substrings): return new_name + unit if not new_name.endswith(unit) else new_name return new_name
[docs] def build_rename_dict(columns: list[str], clean_names: bool = False) -> dict: """Return rename mapping for given columns.""" renamed = {} for col in columns: new_name = str(col) if clean_names: new_name = ( new_name.replace("[", "_") .replace("]", "_") .replace(" ", "_") .replace("|", "_") .replace("<", "less") .replace("=", "_") ) renamed[col] = apply_unit_rule(str(col), new_name) return renamed
[docs] def get_dataset( data_parent_dir: str | Path, target_name: str = "last_phdos_peak", feat_type: Literal["matminer", "matminer_lob"] = "matminer", rename_features: bool = True, ) -> tuple[pd.DataFrame, pd.DataFrame]: """ Load target and feature datasets for a given target and feature type. Parameters ---------- target_name : str, default="last_phdos_peak" Name of the target dataset to load. feat_type : {"matminer", "matminer_lob"}, default="matminer" Feature type used to select feature data. data_parent_dir : str | Path Parent directory containing `targets` and `features` subdirectories. rename_features : bool, default=True If True, clean feature column names and append physical units. Returns ------- target_df : pd.DataFrame DataFrame containing target values. feature_df : pd.DataFrame DataFrame containing feature values. Raises ------ ValueError If `feat_type` is not one of the allowed values. """ data_parent_dir = Path(data_parent_dir) # Validate feat_type allowed_types = {"matminer", "matminer_lob"} if feat_type not in allowed_types: raise ValueError( f"Invalid feat_type: {feat_type}. Must be one of {allowed_types}." ) # Paths target_path = data_parent_dir / "targets" / f"{target_name}.json" feature_path_matminer = ( data_parent_dir / "features" / "matminer" / f"{target_name}.json.gz" ) # Load targets + features target_df = pd.read_json(target_path) feature_df = pd.read_json(feature_path_matminer) if feat_type == "matminer_lob": # Merge lobster features feature_path_lobster = ( data_parent_dir / "features" / "lobster" / f"{target_name}.json.gz" ) feature_df_lob = pd.read_json(feature_path_lobster) feature_df = feature_df.merge( feature_df_lob, left_index=True, right_index=True, how="inner" ) # Optionally rename + add units if rename_features: feature_df = feature_df.rename( columns=build_rename_dict(feature_df.columns, clean_names=True) ) return target_df.loc[feature_df.index, :], feature_df