Source code for reclaim.dynamic_features.utils.statistical_metrics

import pandas as pd
import numpy as np

from scipy.stats import skew, kurtosis

def annual_mean(ts: pd.Series) -> float:
    """
    Return the average of the per-year means of a time series.

    The series is split by calendar year (taken from its datetime index),
    the mean of each year is computed, and those yearly means are averaged.
    Every year therefore carries equal weight regardless of how many
    observations it contains.

    Parameters
    ----------
    ts : pd.Series
        Time series of daily values, indexed by datetime.

    Returns
    -------
    float
        Mean of the annual mean values across all years, or NaN when the
        series is empty.
    """
    if not ts.empty:
        # One mean per calendar year, then the average of those means
        yearly_means = ts.groupby(ts.index.year).mean()
        return yearly_means.mean()

    return float('nan')
def annual_std(ts: pd.Series) -> float:
    """
    Return the average of the per-year standard deviations of a time series.

    The series is split by calendar year (taken from its datetime index),
    the standard deviation of each year is computed with pandas' ``std``,
    and those yearly values are averaged.

    Parameters
    ----------
    ts : pd.Series
        Time series of daily values, indexed by datetime.

    Returns
    -------
    float
        Mean standard deviation across all years, or NaN when the series
        is empty.
    """
    if not ts.empty:
        # One standard deviation per calendar year, then their average
        yearly_spread = ts.groupby(ts.index.year).std()
        return yearly_spread.mean()

    return float('nan')
# Skewness
def skewness(ts: pd.Series) -> float:
    """
    Calculate the skewness of a time series, ignoring missing values.

    Parameters
    ----------
    ts : pd.Series
        Time series, indexed by datetime.

    Returns
    -------
    float
        Skewness of the non-missing values (unitless), or NaN when the
        series is empty or contains only missing values.
    """
    # Drop NaN first so that an all-NaN series is treated like an empty one
    # instead of handing scipy an empty sample.
    valid = ts.dropna()
    if valid.empty:
        return float('nan')

    return float(skew(valid))
# Kurtosis
def kurtosis_val(ts: pd.Series) -> float:
    """
    Calculate the excess kurtosis of a time series, ignoring missing values.

    Parameters
    ----------
    ts : pd.Series
        Time series, indexed by datetime.

    Returns
    -------
    float
        Excess (Fisher) kurtosis of the non-missing values (unitless), or
        NaN when the series is empty or contains only missing values.
    """
    # Drop NaN first so that an all-NaN series is treated like an empty one
    # instead of handing scipy an empty sample.
    valid = ts.dropna()
    if valid.empty:
        return float('nan')

    # fisher=True subtracts 3 so that a normal distribution scores 0
    return float(kurtosis(valid, fisher=True))
# Coefficient of variation (CV)
def coefficient_of_variation(ts: pd.Series) -> float:
    """
    Return the coefficient of variation (CV) of a time series.

    The CV is the standard deviation divided by the mean, both computed
    with the pandas defaults.

    Parameters
    ----------
    ts : pd.Series
        Time series, indexed by datetime.

    Returns
    -------
    float
        Coefficient of variation (std/mean, unitless). NaN is returned when
        the series is empty or its mean is exactly zero.
    """
    if ts.empty:
        return float('nan')

    average = ts.mean()
    # The ratio is undefined for a zero mean, so report NaN instead of dividing
    return float('nan') if average == 0 else ts.std() / average
def max_days_above_90th(ts: pd.Series) -> float:
    """
    Return the largest yearly count of values above the 90th percentile.

    A single threshold, the 90th percentile of the whole series (NaN values
    ignored), is computed first. For each calendar year the number of
    values strictly greater than that threshold is counted, and the
    largest of those yearly counts is returned.

    Parameters
    ----------
    ts : pd.Series
        Time series of daily values, indexed by datetime.

    Returns
    -------
    float
        Maximum count of days above the 90th percentile across years, or
        NaN when the series is empty.
    """
    if ts.empty:
        return float('nan')

    # Threshold is global (all years pooled), not per year
    cutoff = np.nanpercentile(ts, 90)

    # Summing the boolean mask per year gives the number of exceedances
    exceedances_per_year = (ts > cutoff).groupby(ts.index.year).sum()

    if exceedances_per_year.empty:
        return float('nan')
    return float(exceedances_per_year.max())
def max_annual_persistence(timeseries: pd.Series, threshold: float = 1/np.e, min_periods: int = 30) -> float:
    """
    Compute the maximum annual persistence (decorrelation time) of a time series.

    For every calendar year the non-missing values are standardized and
    their autocorrelation function (ACF) is computed. The persistence of a
    year is the first lag at which the ACF drops below ``threshold``; if
    the ACF never drops below it, the largest available lag is used. The
    maximum persistence over all usable years is returned.

    Lags are counted in samples, so they correspond to days only when the
    values within a year are consecutive daily observations.

    Parameters
    ----------
    timeseries : pd.Series
        A datetime-indexed series of daily values.
    threshold : float, optional
        Autocorrelation cutoff (default=1/e ~ 0.367).
    min_periods : int, optional
        Minimum number of non-missing values required in a year to compute
        the autocorrelation.

    Returns
    -------
    int or float
        Maximum persistence (in lags) across all years as an ``int``, or
        ``float('nan')`` when no year has enough usable data.
    """
    if timeseries.empty:
        return float('nan')

    yearly_persistence = []

    for _, group in timeseries.groupby(timeseries.index.year):
        # A single NaN would propagate through np.correlate and turn the
        # whole ACF into NaN, so missing values are removed first.
        values = group.dropna()
        n = len(values)
        if n < min_periods:
            continue

        # Autocorrelation is undefined for a constant (or single-value) year
        std = values.std()
        if not np.isfinite(std) or std == 0:
            continue

        # Standardize, then take the non-negative lags of the full
        # correlation and scale so that lag 0 equals 1.
        x = ((values - values.mean()) / std).to_numpy()
        acf = np.correlate(x, x, mode='full') / n
        acf = acf[n - 1:] / acf[n - 1]

        # First lag below the cutoff; fall back to the largest lag when the
        # ACF never drops below it.
        below = np.flatnonzero(acf < threshold)
        yearly_persistence.append(int(below[0]) if below.size else n - 1)

    if not yearly_persistence:
        return float('nan')
    return max(yearly_persistence)