diff --git a/README.md b/README.md
index e077671..e758e01 100644
--- a/README.md
+++ b/README.md
@@ -55,9 +55,31 @@ VIEWS Evaluation ensures **forecasting accuracy and model robustness** as the **
 * **Step-wise evaluation**: groups and evaluates predictions by the respective steps from all models.
 * **Time-series-wise evaluation**: evaluates predictions for each time-series.
 * **Month-wise evaluation**: groups and evaluates predictions at a monthly level.
-* **Support for Mulyiple Metrics**
- * **Point Evaluation Metrics**: RMSLE, CRPS, Average Precision (Brier Score, Jeffreys Divergence, Pearson Correlation, Sinkhorn/Earth-mover Distance & pEMDiv and Variogram to be added).
- * **Uncertainty Evaluation Metrics**: CRPS (and more to be added in the future).
+* **Support for Multiple Metrics** (see table below for details)
+
+### **Available Metrics**
+
+| Metric | Key | Description | Available | Supports Distributions |
+|--------|-----|-------------|:---------:|:----------------------:|
+| Mean Squared Error | `MSE` | Average of squared differences between predictions and actuals | ✅ | ❌ |
+| Mean Squared Log Error | `MSLE` | MSE computed on log-transformed values | ✅ | ❌ |
+| Root Mean Squared Log Error | `RMSLE` | Square root of MSLE | ✅ | ❌ |
+| Mean Tweedie Deviance | `MTD` | Tweedie deviance with power=1.5, ideal for zero-inflated data | ✅ | ❌ |
+| Average Precision | `AP` | Area under precision-recall curve for binary classification | ✅ | ❌ |
+| Pearson Correlation | `Pearson` | Linear correlation between predictions and actuals | ✅ | ❌ |
+| Earth Mover's Distance | `EMD` | Wasserstein distance between predicted and actual distributions | ✅ | ✅ |
+| Continuous Ranked Probability Score | `CRPS` | Measures calibration and sharpness of probabilistic forecasts | ✅ | ✅ |
+| Mean Interval Score | `MIS` | Evaluates prediction interval width and coverage | ✅ | ✅ |
+| Ignorance Score | `Ignorance` | Logarithmic scoring rule for probabilistic predictions | ✅ | ✅ |
+| Coverage | `Coverage` | Proportion of actuals falling within prediction intervals | ✅ | ✅ |
+| Mean Prediction | `y_hat_bar` | Average of all predicted values | ✅ | ✅ |
+| Sinkhorn Distance | `SD` | Regularized optimal transport distance | ❌ | ✅ |
+| pseudo-Earth Mover Divergence | `pEMDiv` | Efficient EMD approximation | ❌ | ✅ |
+| Variogram | `Variogram` | Spatial/temporal correlation structure score | ❌ | ❌ |
+| Brier Score | `Brier` | Accuracy of probabilistic predictions | ❌ | ✅ |
+| Jeffreys Divergence | `Jeffreys` | Symmetric measure of distribution difference | ❌ | ✅ |
+
+> **Note:** Metrics marked with ✅ in "Supports Distributions" can be used for uncertainty evaluation with ensemble/sample-based predictions.
 * **Data Integrity Checks**: Ensures that input DataFrames conform to expected structures before evaluation based on point and uncertainty evaluation.
 * **Automatic Index Matching**: Aligns actual and predicted values based on MultiIndex structures.
 * **Planned Enhancements**:
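The `Key` column in the table corresponds to the keys of the calculator registries touched further down in this diff (`POINT_METRIC_FUNCTIONS` / `UNCERTAINTY_METRIC_FUNCTIONS`). A minimal sketch of inspecting those registries by key; the set of keys checked here is taken from the table and is purely illustrative:

```python
from views_evaluation.evaluation.metric_calculators import (
    POINT_METRIC_FUNCTIONS,
    UNCERTAINTY_METRIC_FUNCTIONS,
)

# Keys from the table's `Key` column. Registry membership shows which keys can be
# dispatched by name; a registered key may still be a placeholder (see the
# "Available" column for implementation status).
for key in ["RMSLE", "MTD", "CRPS", "Coverage", "Brier", "Jeffreys"]:
    registries = [
        name
        for name, registry in [
            ("point", POINT_METRIC_FUNCTIONS),
            ("uncertainty", UNCERTAINTY_METRIC_FUNCTIONS),
        ]
        if key in registry
    ]
    print(f"{key}: {', '.join(registries) or 'not registered'}")
```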
diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py
index 31872a2..bd5321b 100644
--- a/tests/test_metric_calculators.py
+++ b/tests/test_metric_calculators.py
@@ -11,6 +11,7 @@
     calculate_coverage,
     calculate_ignorance_score,
     calculate_mean_interval_score,
+    calculate_mtd,
     POINT_METRIC_FUNCTIONS,
     UNCERTAINTY_METRIC_FUNCTIONS,
 )
@@ -94,6 +95,28 @@ def test_calculate_pearson(sample_data):
     assert -1 <= result <= 1


+def test_calculate_mtd(sample_data):
+    """Test Mean Tweedie Deviance calculation."""
+    actual, pred = sample_data
+    result = calculate_mtd(actual, pred, 'target')
+    assert isinstance(result, float)
+    assert result >= 0
+
+
+def test_calculate_mtd_with_power(sample_data):
+    """Test Mean Tweedie Deviance calculation with different power values."""
+    actual, pred = sample_data
+    # Test with power=1.5 (compound Poisson-Gamma)
+    result_15 = calculate_mtd(actual, pred, 'target', power=1.5)
+    assert isinstance(result_15, float)
+    assert result_15 >= 0
+
+    # Test with power=2 (Gamma)
+    result_2 = calculate_mtd(actual, pred, 'target', power=2.0)
+    assert isinstance(result_2, float)
+    assert result_2 >= 0
+
+
 def test_calculate_coverage_uncertainty(sample_uncertainty_data):
     """Test Coverage calculation."""
     actual, pred = sample_uncertainty_data
@@ -121,7 +144,7 @@ def test_calculate_mis_uncertainty(sample_uncertainty_data):
 def test_point_metric_functions():
     """Test that all point metric functions are available."""
     expected_metrics = [
-        "RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", "Variogram"
+        "MSE", "MSLE", "RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", "Variogram", "MTD", "y_hat_bar"
     ]

     for metric in expected_metrics:
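The new tests exercise `calculate_mtd`, which (per the calculator diff below) delegates to `sklearn.metrics.mean_tweedie_deviance`. As a standalone sanity check, not part of the repository's test suite, the documented unit deviance for `1 < p < 2` can be reproduced by hand and compared against sklearn; the toy values below are arbitrary:

```python
import numpy as np
from sklearn.metrics import mean_tweedie_deviance


def unit_deviance(y, mu, p):
    # d(y, mu) = 2 * (y^(2-p)/((1-p)(2-p)) - y*mu^(1-p)/(1-p) + mu^(2-p)/(2-p)),
    # valid for 1 < p < 2; mean_tweedie_deviance averages this over observations.
    return 2 * (
        y ** (2 - p) / ((1 - p) * (2 - p))
        - y * mu ** (1 - p) / (1 - p)
        + mu ** (2 - p) / (2 - p)
    )


y_true = np.array([0.0, 2.0, 5.0])   # zero-inflated, non-negative outcomes
y_pred = np.array([0.5, 2.0, 4.0])   # strictly positive predictions
p = 1.5

manual = np.mean([unit_deviance(y, mu, p) for y, mu in zip(y_true, y_pred)])
assert np.isclose(manual, mean_tweedie_deviance(y_true, y_pred, power=p))
```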
diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py
index 1bc7523..28ba5cb 100644
--- a/views_evaluation/evaluation/metric_calculators.py
+++ b/views_evaluation/evaluation/metric_calculators.py
@@ -7,6 +7,7 @@
     mean_squared_error,
     mean_squared_log_error,
     average_precision_score,
+    mean_tweedie_deviance,
 )
 from scipy.stats import wasserstein_distance, pearsonr
@@ -408,6 +409,66 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins):
     return np.mean(scores)


+def calculate_mtd(
+    matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, power: float = 1.5
+) -> float:
+    """
+    Calculate Mean Tweedie Deviance (MTD) between actual and predicted values.
+
+    The Tweedie deviance is a family of loss functions parameterized by a power parameter `p`.
+    It generalizes several common loss functions:
+    - p = 0: Equivalent to Mean Squared Error (Gaussian distribution)
+    - p = 1: Equivalent to Poisson deviance (count data)
+    - p = 2: Equivalent to Gamma deviance (positive continuous data)
+    - 1 < p < 2: Compound Poisson-Gamma distribution (zero-inflated positive continuous data)
+
+    With the default power of 1.5 (compound Poisson-Gamma), this metric is particularly
+    well-suited for conflict forecasting data, which typically exhibits:
+    - Right-skewness (many small values, few large values)
+    - Zero-inflation (many observations with zero fatalities)
+    - Non-negative continuous outcomes
+
+    The Tweedie deviance for a single observation is defined as:
+        d(y, μ) = 2 * (y^(2-p)/((1-p)*(2-p)) - y*μ^(1-p)/(1-p) + μ^(2-p)/(2-p))
+    where y is the actual value and μ is the predicted value.
+
+    Lower values indicate better model performance.
+
+    Args:
+        matched_actual (pd.DataFrame): DataFrame containing actual values with the target column.
+            The target column should contain numpy arrays or lists of actual observations.
+        matched_pred (pd.DataFrame): DataFrame containing predictions with the `pred_{target}` column.
+            The prediction column should contain numpy arrays or lists of predicted values.
+        target (str): The target column name (without the 'pred_' prefix).
+        power (float): The power parameter for the Tweedie distribution. Must be <= 0
+            or >= 1 (values strictly between 0 and 1 are not defined). Default is 1.5,
+            the compound Poisson-Gamma case, ideal for zero-inflated positive continuous data.
+
+    Returns:
+        float: The Mean Tweedie Deviance score. Lower values indicate better predictions.
+
+    Raises:
+        ValueError: If predicted values are not strictly positive when power >= 1, or if
+            actual values are negative when 1 <= power < 2 (non-positive when power >= 2).
+
+    Example:
+        >>> mtd_score = calculate_mtd(actual_df, pred_df, "ln_sb_best")
+        >>> print(f"Mean Tweedie Deviance: {mtd_score:.4f}")
+
+    See Also:
+        - sklearn.metrics.mean_tweedie_deviance: The underlying implementation.
+        - calculate_mse: Mean Squared Error (equivalent to MTD with power=0).
+    """
+    actual_values = np.concatenate(matched_actual[target].values)
+    pred_values = np.concatenate(matched_pred[f"pred_{target}"].values)
+
+    actual_expanded = np.repeat(
+        actual_values, [len(x) for x in matched_pred[f"pred_{target}"]]
+    )
+
+    return mean_tweedie_deviance(actual_expanded, pred_values, power=power)
+
+
 def calculate_mean_prediction(
     matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str
 ) -> float:
@@ -428,6 +489,7 @@
     "pEMDiv": calculate_pEMDiv,
     "Pearson": calculate_pearson,
     "Variogram": calculate_variogram,
+    "MTD": calculate_mtd,
     "y_hat_bar": calculate_mean_prediction,
 }
diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py
index a7dcf33..ba8ebd1 100644
--- a/views_evaluation/evaluation/metrics.py
+++ b/views_evaluation/evaluation/metrics.py
@@ -128,6 +128,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics):
     pEMDiv: Optional[float] = None
     Pearson: Optional[float] = None
     Variogram: Optional[float] = None
+    MTD: Optional[float] = None
     y_hat_bar: Optional[float] = None
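A minimal usage sketch of the new calculator, following the DataFrame layout described in the docstring (one observed value per row in the target column, a set of prediction samples per row in the `pred_{target}` column). The MultiIndex names and the `ln_sb_best` target are illustrative assumptions, not a required schema:

```python
import numpy as np
import pandas as pd
from views_evaluation.evaluation.metric_calculators import calculate_mtd

# Hypothetical (month, unit) index; each actual cell holds one observed value,
# each prediction cell holds the sample values for that unit.
idx = pd.MultiIndex.from_tuples([(121, 1), (122, 1)], names=["month_id", "priogrid_id"])
matched_actual = pd.DataFrame({"ln_sb_best": [np.array([0.0]), np.array([2.0])]}, index=idx)
matched_pred = pd.DataFrame(
    {"pred_ln_sb_best": [np.array([0.1, 0.2, 0.3]), np.array([1.8, 2.1, 2.0])]},
    index=idx,
)

# Default power=1.5 (compound Poisson-Gamma) tolerates zeros in the actuals;
# power=2.0 (Gamma) would require strictly positive actuals and fail on the 0.0 above.
mtd_default = calculate_mtd(matched_actual, matched_pred, "ln_sb_best")
mtd_poisson = calculate_mtd(matched_actual, matched_pred, "ln_sb_best", power=1.0)
print(f"MTD (power=1.5): {mtd_default:.4f}  |  MTD (power=1.0): {mtd_poisson:.4f}")
```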