Source code for reclaim.derived_features.feature_engineering_and_transformation

import pandas as pd
import numpy as np

# Feature columns that engineer_and_transform_features guarantees to be present
# in its output (any that are absent are added as all-NaN columns).
# Names carrying the ``log_`` prefix are log-transformed columns.
# NOTE(review): group labels below are inferred from column names unless the
# group is produced directly by engineer_and_transform_features — confirm.
ALL_FEATURES = [
    # Reservoir / catchment descriptors
    "log_OBC", "log_HGT", "MRB", "LAT", "LON",
    "log_RA", "log_RP", "log_FL",
    "log_CA", "log_DCA",

    "AECS", "AECC", "AECI",

    # Land cover (log-area columns) plus the categorical DLC column
    "log_LCAS", "log_LCC",
    "log_LCG", "log_LCT", "log_LCS",
    "log_LCHV", "log_LCM",
    "log_LCSV", "log_LCBS",
    "log_LCSG", "log_LCWB", "DLC",

    "COAR", "SAND", "SILT", "CLAY", "BULK",

    "ELEV", "SLOP", "CURV", "ASP", "HILL",

    # Time-series statistics (I_*, O_*, E_*, SA_*, NSSC1_*, NSSC2_*)
    "log_MAI", "log_PAI", "I_cv",
    "log_I_std", "I_above_90", "I_max_persis",
    "log_MAO", "log_O_std", "O_cv",
    "E_mean", "E_std",
    "log_SA_mean", "log_SA_std", "SA_cv", "SA_skew", "log_SA_kurt",
    "log_SA_mean_clip", "SA_above_90",
    "NSSC1_mean", "NSSC1_std", "NSSC1_cv", "NSSC1_skew", "NSSC1_kurt",
    "NSSC2_mean", "NSSC2_above_90", "NSSC2_max_persis",

    "log_MAR", "#_rain_above_10", "#_rain_above_50", "#_rain_above_100",
    "tmin_mean", "tmax_mean",
    "wind_mean", "wind_std", "wind_cv", "wind_skew", "wind_kurt",

    # Features engineered by engineer_and_transform_features
    "AGE", "log_ROBC", "log_GC",
    "NVGF",
    "R_tree_bare", "R_shrub_bare", "R_coarse_sand",
    "log_rel_SA_mean_clip", "log_R_SA_cap",
    "log_rain_per_area",
    "log_TE", "log_RT", "log_ECLR", "ESR",
    "log_SIN", "log_SOUT",
]


[docs]
def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Engineer and transform features in reservoir/catchment dataset.

    Features are first engineered in raw space (linear), then log-transformations
    are applied in a single pass to avoid double-logging.

    Log-transformed columns are prefixed with ``log_`` to clearly indicate their state.

    The input frame is not modified; a new DataFrame is returned.

    Required input columns (abbreviations):
        - CA, DCA, OBC, HGT, RA, RP, FL
        - SA_mean, SA_mean_clip, SA_std, SA_kurt
        - PAI, MAI, MAO, I_std, O_std, MAR
        - OEY, BY, VGF, VLF
        - Land cover: LCAS, LCC, LCG, LCT, LCS, LCHV, LCM, LCSV, LCBS, LCSG, LCWB
        - COAR, SAND, NSSC2_mean

    Any required column that is absent is treated as all-NaN. ``DLC`` is
    optional: missing values (or a missing column) become 0 and the column is
    cast to ``int``.

    Args:
        df: Input dataset, one row per record.

    Returns:
        A new DataFrame holding the input columns, the engineered raw-space
        columns, a ``log_<name>`` column for every log candidate, an integer
        ``DLC`` column, and a NaN column for every name in ``ALL_FEATURES``
        that is still absent.

    Raises:
        ValueError: If the log transform of any column fails; the original
            exception is attached as ``__cause__``.
    """
    required_cols = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
                     'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt',
                     'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
                     'OEY', 'BY', 'VGF', 'VLF',
                     'LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM', 'LCSV', 'LCBS', 'LCSG', 'LCWB',
                     'COAR', 'SAND', 'NSSC2_mean']
    lc_cols = ['LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM', 'LCSV', 'LCBS', 'LCSG', 'LCWB']

    # Work on a copy so that filling in missing columns never leaks into the
    # caller's DataFrame.
    df = df.copy()
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    # -------------------------
    # ENGINEER RAW FEATURES
    # -------------------------
    # MAI scaled by the seconds in a 365.25-day year and by 1e-6, relative to OBC.
    # NOTE(review): presumably m^3/s -> million m^3 per year; confirm units.
    inflow_cap_ratio = (df['MAI'] * 3600 * 24 * 365.25 / 1e6) / df['OBC']
    # Computed once and reused by TE, ECLR and ESR (same multiplication order
    # as before, so results are bit-identical).
    trap_eff = np.exp(-0.0079 * inflow_cap_ratio) * 100
    eclr = trap_eff * df["NSSC2_mean"] * inflow_cap_ratio

    feature_dict = {
        "AGE": df["OEY"] - df["BY"],
        "ROBC": df["OBC"] / df["CA"],
        "NVGF": df["VGF"] - df["VLF"],
        "GC": df["RA"] / (df["RP"]**2),
        # For the three ratios below a zero denominator falls back to the
        # numerator itself instead of producing inf.
        "rain_per_area": np.where(df["CA"] != 0, df["MAR"] / df["CA"], df["MAR"]),
        "R_tree_bare": np.where(df["LCBS"] != 0, df["LCT"] / df["LCBS"], df["LCT"]),
        "R_shrub_bare": np.where(df["LCBS"] != 0, df["LCS"] / df["LCBS"], df["LCS"]),
        "R_coarse_sand": df["COAR"] / df["SAND"],
        "RT": df["OBC"] * 1e6 / (df["MAI"] * 3600 * 24 * 365.25),
        "TE": trap_eff,
        "ECLR": eclr,
        "ESR": eclr * df["OBC"] / 100,
        "rel_SA_mean_clip": df["SA_mean_clip"] / df["RA"],
        "R_SA_cap": df["SA_mean_clip"] / df["OBC"],
        "SIN": df["MAI"] * df["NSSC2_mean"],
        "SOUT": df["MAO"] * df["NSSC2_mean"],
    }
    engineered = pd.DataFrame(feature_dict)

    # Drop any pre-existing engineered columns first so the concat never
    # produces duplicate labels (e.g. when the function is re-run on its own
    # output); duplicates would break the log step below.
    df = pd.concat(
        [df.drop(columns=engineered.columns, errors="ignore"), engineered],
        axis=1,
    )

    # -------------------------
    # APPLY LOG TRANSFORMATIONS
    # -------------------------
    log_candidates = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
                      'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt', 'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
                      'ROBC', 'rain_per_area', 'GC', 'TE', 'RT', 'ECLR', 'SIN', 'SOUT',
                      'rel_SA_mean_clip', 'R_SA_cap'] + lc_cols

    # Values are clipped from below before the log so zeros do not yield -inf.
    # ECLR, SIN and SOUT are clipped at 1e-15, rain_per_area at 1e-10,
    # everything else at 1e-6.
    default_floor = 1e-6
    clip_floor = {'ECLR': 1e-15, 'SIN': 1e-15, 'SOUT': 1e-15, 'rain_per_area': 1e-10}

    for col in log_candidates:
        log_col = f'log_{col}'  # add prefix to avoid double log
        try:
            if col in lc_cols:
                # Land cover is a percentage of CA; the area CA * pct / 100 is
                # formed directly in log space, as done in model training.
                # Because CA and pct are clipped separately, results differ
                # slightly from log(area) when the percentage is 0.
                df[log_col] = (
                    np.log(df["CA"].clip(lower=default_floor))
                    + np.log(df[col].clip(lower=default_floor))
                    - np.log(100)
                )
            else:
                floor = clip_floor.get(col, default_floor)
                df[log_col] = np.log(df[col].clip(lower=floor))
        except Exception as e:
            raise ValueError(f"Error applying log transform to column '{col}': {e}") from e

    # Process DLC as categorical column. Missing values must be filled BEFORE
    # the integer cast: casting NaN to int raises.
    if 'DLC' in df.columns:
        df['DLC'] = df['DLC'].fillna(0).astype(int)
    else:
        df['DLC'] = 0

    # Add empty columns for any missing features
    for feature in ALL_FEATURES:
        if feature not in df.columns:
            df[feature] = np.nan

    return df