From ade69a4d4c6ca79abd1db391f601e8649ee22f37 Mon Sep 17 00:00:00 2001 From: Dylan Date: Sat, 31 Jan 2026 23:26:15 +0100 Subject: [PATCH 1/3] mtd --- README.md | 28 +++++++- .../evaluation/metric_calculators.py | 68 +++++++++++++++++++ views_evaluation/evaluation/metrics.py | 1 + 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e077671..e758e01 100644 --- a/README.md +++ b/README.md @@ -55,9 +55,31 @@ VIEWS Evaluation ensures **forecasting accuracy and model robustness** as the ** * **Step-wise evaluation**: groups and evaluates predictions by the respective steps from all models. * **Time-series-wise evaluation**: evaluates predictions for each time-series. * **Month-wise evaluation**: groups and evaluates predictions at a monthly level. -* **Support for Mulyiple Metrics** - * **Point Evaluation Metrics**: RMSLE, CRPS, Average Precision (Brier Score, Jeffreys Divergence, Pearson Correlation, Sinkhorn/Earth-mover Distance & pEMDiv and Variogram to be added). - * **Uncertainty Evaluation Metrics**: CRPS (and more to be added in the future). 
+* **Support for Multiple Metrics** (see table below for details) + +### **Available Metrics** + +| Metric | Key | Description | Available | Supports Distributions | +|--------|-----|-------------|:---------:|:----------------------:| +| Mean Squared Error | `MSE` | Average of squared differences between predictions and actuals | ✅ | ❌ | +| Mean Squared Log Error | `MSLE` | MSE computed on log-transformed values | ✅ | ❌ | +| Root Mean Squared Log Error | `RMSLE` | Square root of MSLE | ✅ | ❌ | +| Mean Tweedie Deviance | `MTD` | Tweedie deviance with power=1.5, ideal for zero-inflated data | ✅ | ❌ | +| Average Precision | `AP` | Area under precision-recall curve for binary classification | ✅ | ❌ | +| Pearson Correlation | `Pearson` | Linear correlation between predictions and actuals | ✅ | ❌ | +| Earth Mover's Distance | `EMD` | Wasserstein distance between predicted and actual distributions | ✅ | ✅ | +| Continuous Ranked Probability Score | `CRPS` | Measures calibration and sharpness of probabilistic forecasts | ✅ | ✅ | +| Mean Interval Score | `MIS` | Evaluates prediction interval width and coverage | ✅ | ✅ | +| Ignorance Score | `Ignorance` | Logarithmic scoring rule for probabilistic predictions | ✅ | ✅ | +| Coverage | `Coverage` | Proportion of actuals falling within prediction intervals | ✅ | ✅ | +| Mean Prediction | `y_hat_bar` | Average of all predicted values | ✅ | ✅ | +| Sinkhorn Distance | `SD` | Regularized optimal transport distance | ❌ | ✅ | +| pseudo-Earth Mover Divergence | `pEMDiv` | Efficient EMD approximation | ❌ | ✅ | +| Variogram | `Variogram` | Spatial/temporal correlation structure score | ❌ | ❌ | +| Brier Score | `Brier` | Accuracy of probabilistic predictions | ❌ | ✅ | +| Jeffreys Divergence | `Jeffreys` | Symmetric measure of distribution difference | ❌ | ✅ | + +> **Note:** Metrics marked with ✅ in "Supports Distributions" can be used for uncertainty evaluation with ensemble/sample-based predictions. 
* **Data Integrity Checks**: Ensures that input DataFrames conform to expected structures before evaluation based on point and uncertainty evaluation. * **Automatic Index Matching**: Aligns actual and predicted values based on MultiIndex structures. * **Planned Enhancements**: diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 1bc7523..c877262 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -7,6 +7,7 @@ mean_squared_error, mean_squared_log_error, average_precision_score, + mean_tweedie_deviance, ) from scipy.stats import wasserstein_distance, pearsonr @@ -408,6 +409,72 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): return np.mean(scores) +def calculate_mtd( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, power: float = 1.5 +) -> float: + """ + Calculate Mean Tweedie Deviance (MTD) between actual and predicted values. + + The Tweedie deviance is a family of loss functions parameterized by a power parameter `p`. + It generalizes several common loss functions: + - p = 0: Equivalent to Mean Squared Error (Gaussian distribution) + - p = 1: Equivalent to Poisson deviance (count data) + - p = 2: Equivalent to Gamma deviance (positive continuous data) + - 1 < p < 2: Compound Poisson-Gamma distribution (zero-inflated positive continuous data) + + With the default power of 1.5 (compound Poisson-Gamma), this metric is particularly + well-suited for conflict forecasting data which typically exhibits: + - Right-skewness (many small values, few large values) + - Zero-inflation (many observations with zero fatalities) + - Non-negative continuous outcomes + + The Tweedie deviance for a single observation is defined as: + d(y, μ) = 2 * (y^(2-p)/((1-p)*(2-p)) - y*μ^(1-p)/(1-p) + μ^(2-p)/(2-p)) + where y is the actual value and μ is the predicted value. 
+ + Lower values indicate better model performance. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values with the target column. + The target column should contain numpy arrays or lists of actual observations. + matched_pred (pd.DataFrame): DataFrame containing predictions with the `pred_{target}` column. + The prediction column should contain numpy arrays or lists of predicted values. + target (str): The target column name (without the 'pred_' prefix). + power (float): The power parameter for the Tweedie distribution. Must be + <= 0 or >= 1 (values strictly inside (0, 1) are rejected by scikit-learn). Default is 1.5, which corresponds to the compound Poisson-Gamma + distribution, ideal for zero-inflated positive continuous data. + + Returns: + float: The Mean Tweedie Deviance score. Lower values indicate better predictions. + + Raises: + ValueError: If predictions contain negative values when power > 0, or if + actual values are negative when power >= 1. + + Example: + >>> mtd_score = calculate_mtd(actual_df, pred_df, "ln_sb_best") + >>> print(f"Mean Tweedie Deviance: {mtd_score:.4f}") + + References: + - Jørgensen, B. (1987). Exponential Dispersion Models. + Journal of the Royal Statistical Society. Series B, 49(2), 127-162. + - Tweedie, M. C. K. (1984). An index which distinguishes between some important + exponential families. Statistics: Applications and New Directions. + + See Also: + - sklearn.metrics.mean_tweedie_deviance: The underlying implementation. + - calculate_mse: Mean Squared Error (equivalent to MTD with power=0). 
+ """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + return mean_tweedie_deviance(actual_expanded, pred_values, power=power) + + def calculate_mean_prediction( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: @@ -428,6 +495,7 @@ def calculate_mean_prediction( "pEMDiv": calculate_pEMDiv, "Pearson": calculate_pearson, "Variogram": calculate_variogram, + "MTD": calculate_mtd, "y_hat_bar": calculate_mean_prediction, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index a7dcf33..ba8ebd1 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -128,6 +128,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): pEMDiv: Optional[float] = None Pearson: Optional[float] = None Variogram: Optional[float] = None + MTD: Optional[float] = None y_hat_bar: Optional[float] = None From 27f5a7b255cb536d018be03cc9e7b33293ac568a Mon Sep 17 00:00:00 2001 From: Dylan Date: Sat, 31 Jan 2026 23:28:56 +0100 Subject: [PATCH 2/3] rem refs --- views_evaluation/evaluation/metric_calculators.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index c877262..28ba5cb 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -455,12 +455,6 @@ def calculate_mtd( >>> mtd_score = calculate_mtd(actual_df, pred_df, "ln_sb_best") >>> print(f"Mean Tweedie Deviance: {mtd_score:.4f}") - References: - - Jørgensen, B. (1987). Exponential Dispersion Models. - Journal of the Royal Statistical Society. Series B, 49(2), 127-162. - - Tweedie, M. C. K. (1984). An index which distinguishes between some important - exponential families. 
Statistics: Applications and New Directions. - See Also: - sklearn.metrics.mean_tweedie_deviance: The underlying implementation. - calculate_mse: Mean Squared Error (equivalent to MTD with power=0). From e8c65f7ee2e3637260a30bdec8bb0ef69ec1e62a Mon Sep 17 00:00:00 2001 From: Dylan <52908667+smellycloud@users.noreply.github.com> Date: Tue, 3 Feb 2026 15:00:38 +0100 Subject: [PATCH 3/3] tests --- tests/test_metric_calculators.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 31872a2..bd5321b 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -11,6 +11,7 @@ calculate_coverage, calculate_ignorance_score, calculate_mean_interval_score, + calculate_mtd, POINT_METRIC_FUNCTIONS, UNCERTAINTY_METRIC_FUNCTIONS, ) @@ -94,6 +95,28 @@ def test_calculate_pearson(sample_data): assert -1 <= result <= 1 +def test_calculate_mtd(sample_data): + """Test Mean Tweedie Deviance calculation.""" + actual, pred = sample_data + result = calculate_mtd(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_mtd_with_power(sample_data): + """Test Mean Tweedie Deviance calculation with different power values.""" + actual, pred = sample_data + # Test with power=1.5 (compound Poisson-Gamma) + result_15 = calculate_mtd(actual, pred, 'target', power=1.5) + assert isinstance(result_15, float) + assert result_15 >= 0 + + # Test with power=2 (Gamma) + result_2 = calculate_mtd(actual, pred, 'target', power=2.0) + assert isinstance(result_2, float) + assert result_2 >= 0 + + def test_calculate_coverage_uncertainty(sample_uncertainty_data): """Test Coverage calculation.""" actual, pred = sample_uncertainty_data @@ -121,7 +144,7 @@ def test_calculate_mis_uncertainty(sample_uncertainty_data): def test_point_metric_functions(): """Test that all point metric functions are available.""" expected_metrics = [ - 
"RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", "Variogram" + "MSE", "MSLE", "RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar" ] for metric in expected_metrics: