Source code for reclaim.derived_features.feature_engineering_and_transformation
import pandas as pd
import numpy as np
# Ordered list of feature column names. ``engineer_and_transform_features``
# guarantees that each of these exists in its output (filled with NaN when it
# could not be derived). Names carrying the ``log_`` prefix are log-transformed
# columns; the order of the entries is preserved exactly.
ALL_FEATURES = [
    "log_OBC", "log_HGT", "MRB", "LAT", "LON",
    "log_RA", "log_RP", "log_FL",
    "log_CA", "log_DCA",
    "AECS", "AECC", "AECI",
    # Land-cover (LC*) log-area columns, followed by DLC
    "log_LCAS", "log_LCC", "log_LCG", "log_LCT", "log_LCS",
    "log_LCHV", "log_LCM", "log_LCSV", "log_LCBS",
    "log_LCSG", "log_LCWB",
    "DLC",
    "COAR", "SAND", "SILT", "CLAY", "BULK",
    "ELEV", "SLOP", "CURV", "ASP", "HILL",
    # I_* / O_* / E_* statistics
    "log_MAI", "log_PAI", "I_cv",
    "log_I_std", "I_above_90", "I_max_persis",
    "log_MAO", "log_O_std", "O_cv",
    "E_mean", "E_std",
    # SA_* statistics
    "log_SA_mean", "log_SA_std", "SA_cv", "SA_skew", "log_SA_kurt",
    "log_SA_mean_clip", "SA_above_90",
    # NSSC1_* / NSSC2_* statistics
    "NSSC1_mean", "NSSC1_std", "NSSC1_cv", "NSSC1_skew", "NSSC1_kurt",
    "NSSC2_mean", "NSSC2_above_90", "NSSC2_max_persis",
    # Rain, temperature and wind statistics
    "log_MAR", "#_rain_above_10", "#_rain_above_50", "#_rain_above_100",
    "tmin_mean", "tmax_mean",
    "wind_mean", "wind_std", "wind_cv", "wind_skew", "wind_kurt",
    # Features derived inside engineer_and_transform_features
    "AGE", "log_ROBC", "log_GC",
    "NVGF",
    "R_tree_bare", "R_shrub_bare", "R_coarse_sand",
    "log_rel_SA_mean_clip", "log_R_SA_cap",
    "log_rain_per_area",
    "log_TE", "log_RT", "log_ECLR", "ESR",
    "log_SIN", "log_SOUT",
]
[docs]
def engineer_and_transform_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Engineer and transform features in a reservoir/catchment dataset.

    Features are first engineered in raw (linear) space; log-transformations
    are then applied in a single pass to avoid double-logging. Every
    log-transformed column is written to a new column prefixed with ``log_``,
    so the raw column is kept alongside it.

    Input columns used (abbreviations). Any of these that is absent is
    treated as all-NaN, so features depending on it come out as NaN:

    - CA, DCA, OBC, HGT, RA, RP, FL
    - SA_mean, SA_mean_clip, SA_std, SA_kurt
    - PAI, MAI, MAO, I_std, O_std, MAR
    - OEY, BY, VGF, VLF
    - Land cover: LCAS, LCC, LCG, LCT, LCS, LCHV, LCM, LCSV, LCBS, LCSG, LCWB
    - COAR, SAND, NSSC2_mean
    - DLC (optional): NaN entries, or a missing column, become 0 before the
      column is cast to ``int``.

    Parameters
    ----------
    df
        Input table. It is not modified; all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        A new frame holding the input columns, the engineered raw features,
        their ``log_`` counterparts, and a NaN column for every entry of
        ``ALL_FEATURES`` that is still missing.

    Raises
    ------
    ValueError
        If a log-transformation fails for a column (the original exception is
        chained as the cause).
    """
    # Work on a copy so that the NaN placeholder columns added below do not
    # leak into the caller's DataFrame.
    df = df.copy()

    # Ensure required columns exist
    required_cols = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
                     'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt',
                     'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
                     'OEY', 'BY', 'VGF', 'VLF',
                     'LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM',
                     'LCSV', 'LCBS', 'LCSG', 'LCWB',
                     'COAR', 'SAND', 'NSSC2_mean']
    for col in required_cols:
        if col not in df.columns:
            df[col] = np.nan

    # -------------------------
    # ENGINEER RAW FEATURES
    # -------------------------
    # MAI scaled by the number of seconds in a year and by 1e-6, relative to
    # OBC. NOTE(review): presumably MAI is a per-second flow and OBC is in
    # millions of the same volume unit -- confirm against the data dictionary.
    inflow_cap_ratio = (df['MAI'] * 3600 * 24 * 365.25 / 1e6) / df['OBC']
    # Shared by TE, ECLR and ESR; computed once instead of three times.
    te = np.exp(-0.0079 * inflow_cap_ratio) * 100
    eclr = te * df["NSSC2_mean"] * inflow_cap_ratio

    feature_dict = {
        "AGE": df["OEY"] - df["BY"],
        "ROBC": df["OBC"] / df["CA"],
        "NVGF": df["VGF"] - df["VLF"],
        "GC": df["RA"] / (df["RP"]**2),
        # For the three ratios below a zero denominator falls back to the
        # bare numerator instead of producing inf/NaN.
        "rain_per_area": np.where(df["CA"] != 0, df["MAR"] / df["CA"], df["MAR"]),
        "R_tree_bare": np.where(df["LCBS"] != 0, df["LCT"] / df["LCBS"], df["LCT"]),
        "R_shrub_bare": np.where(df["LCBS"] != 0, df["LCS"] / df["LCBS"], df["LCS"]),
        "R_coarse_sand": df["COAR"] / df["SAND"],
        # Reciprocal of inflow_cap_ratio.
        "RT": df["OBC"] * 1e6 / (df["MAI"] * 3600 * 24 * 365.25),
        "TE": te,
        "ECLR": eclr,
        "ESR": eclr * df["OBC"] / 100,
        "rel_SA_mean_clip": df["SA_mean_clip"] / df["RA"],
        "R_SA_cap": df["SA_mean_clip"] / df["OBC"],
        "SIN": df["MAI"] * df["NSSC2_mean"],
        "SOUT": df["MAO"] * df["NSSC2_mean"],
    }
    # Attach all engineered columns in one concat rather than one insert each.
    df = pd.concat([df, pd.DataFrame(feature_dict)], axis=1)

    # Land cover log-area features.
    # The land-cover area CA * LC / 100 is never materialised in raw space:
    # its log is computed directly as log(CA) + log(LC) - log(100), as done
    # in model training. Because CA and LC are clipped separately, results
    # differ slightly from log(CA * LC / 100) where an LC percentage is 0.
    lc_cols = ['LCAS', 'LCC', 'LCG', 'LCT', 'LCS', 'LCHV', 'LCM',
               'LCSV', 'LCBS', 'LCSG', 'LCWB']

    # -------------------------
    # APPLY LOG TRANSFORMATIONS
    # -------------------------
    log_candidates = ['CA', 'DCA', 'OBC', 'HGT', 'RA', 'RP', 'FL',
                      'SA_mean', 'SA_mean_clip', 'SA_std', 'SA_kurt',
                      'PAI', 'MAI', 'MAO', 'I_std', 'O_std', 'MAR',
                      'ROBC', 'rain_per_area', 'GC', 'TE', 'RT', 'ECLR',
                      'SIN', 'SOUT', 'rel_SA_mean_clip', 'R_SA_cap'] + lc_cols

    # Values are clipped from below before the log so that zeros (and
    # negatives) do not yield -inf/NaN. Columns not listed use default_floor.
    default_floor = 1e-6
    clip_floor = {
        'ECLR': 1e-15,           # sediment-derived columns: floor at 1e-15
        'SIN': 1e-15,
        'SOUT': 1e-15,
        'rain_per_area': 1e-10,  # rain per area: floor at 1e-10
    }

    for col in log_candidates:
        log_col = f'log_{col}'  # add prefix to avoid double log
        try:
            if col in lc_cols:
                df[log_col] = (
                    np.log(df["CA"].clip(lower=default_floor))
                    + np.log(df[col].clip(lower=default_floor))
                    - np.log(100)
                )
            else:
                floor = clip_floor.get(col, default_floor)
                df[log_col] = np.log(df[col].clip(lower=floor))
        except Exception as e:
            raise ValueError(f"Error applying log transform to column '{col}': {e}") from e

    # Process DLC as categorical column. Missing values must be filled
    # *before* the integer cast: casting NaN to int raises.
    if 'DLC' in df.columns:
        df['DLC'] = df['DLC'].fillna(0).astype(int)
    else:
        df['DLC'] = 0

    # Add empty columns for any missing features
    for feature in ALL_FEATURES:
        if feature not in df.columns:
            df[feature] = np.nan
    return df