From 3323a1ac761198cfcc94eaaf6c696bf91c6282d9 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 8 May 2025 13:42:12 +0200 Subject: [PATCH 01/15] add new metrics and using viewsdataset --- tests/test_evaluation_manager.py | 151 +++++++----- .../evaluation/evaluation_manager.py | 224 ++++++++++++++---- 2 files changed, 274 insertions(+), 101 deletions(-) diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 7498b75..8137f2c 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -5,6 +5,7 @@ from sklearn.metrics import root_mean_squared_log_error import properscoring as ps from views_evaluation.evaluation.evaluation_manager import EvaluationManager +from views_pipeline_core.data.handlers import _ViewsDataset @pytest.fixture @@ -55,19 +56,19 @@ def mock_actual(): ) df = pd.DataFrame( { - "target": [0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6], - "covariate_1": [3, 2, 4, 5, 2, 6, 8, 5, 3, 2, 9, 4], + "target": [0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0], + "covariate_1": [3.0, 2.0, 4.0, 5.0, 2.0, 6.0, 8.0, 5.0, 3.0, 2.0, 9.0, 4.0], }, index=index, ) - return df + return _ViewsDataset(df, targets=["target"]).dataframe @pytest.fixture def mock_point_predictions(mock_index): - df1 = pd.DataFrame({"pred_target": [1, 3, 5, 7, 9, 7]}, index=mock_index[0]) - df2 = pd.DataFrame({"pred_target": [2, 4, 6, 8, 10, 8]}, index=mock_index[1]) - return [df1, df2] + df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) + df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) + return [_ViewsDataset(df1).dataframe, _ViewsDataset(df2).dataframe] @pytest.fixture @@ -75,12 +76,12 @@ def mock_uncertainty_predictions(mock_index): df1 = pd.DataFrame( { "pred_target": [ - [1, 2, 3], - [2, 3, 4], - [3, 4, 5], - [4, 5, 6], - [5, 6, 7], - [6, 7, 8], + [1.0, 2.0, 3.0], + [2.0, 3.0, 4.0], + [3.0, 4.0, 5.0], + [4.0, 5.0, 6.0], + [5.0, 6.0, 7.0], + [6.0, 7.0, 8.0], ] }, index=mock_index[0], @@ -88,17 +89,17 @@ def mock_uncertainty_predictions(mock_index): df2 = pd.DataFrame( { "pred_target": [ - [4, 6, 8], - [5, 7, 9], - [6, 8, 10], - [7, 9, 11], - [8, 10, 12], - [9, 11, 13], + [4.0, 6.0, 8.0], + [5.0, 7.0, 9.0], + [6.0, 8.0, 10.0], + [7.0, 9.0, 11.0], + [8.0, 10.0, 12.0], + [9.0, 11.0, 13.0], ] }, index=mock_index[1], ) - return [df1, df2] + return [_ViewsDataset(df1).dataframe, _ViewsDataset(df2).dataframe] def test_validate_dataframes_valid_type(mock_point_predictions): @@ -115,13 +116,6 @@ def test_validate_dataframes_valid_columns(mock_point_predictions): ) -def test_validate_dataframes_valid_point(mock_uncertainty_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_uncertainty_predictions, "target", is_uncertainty=False - ) - - def test_validate_dataframes_valid_uncertainty(mock_point_predictions): with pytest.raises(ValueError): EvaluationManager.validate_predictions( @@ -132,30 +126,30 @@ def test_validate_dataframes_valid_uncertainty(mock_point_predictions): def test_get_evaluation_type(): # Test case 1: All DataFrames for uncertainty evaluation predictions_uncertainty = [ - pd.DataFrame({'pred_target': [[1, 2], [3, 4]]}), - pd.DataFrame({'pred_target': [[5, 6], [7, 8]]}), + pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), + pd.DataFrame({'pred_target': [[5.0, 6.0], [7.0, 8.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_uncertainty) == True # Test case 
2: All DataFrames for point evaluation predictions_point = [ - pd.DataFrame({'pred_target': [1.0, 2.0]}), - pd.DataFrame({'pred_target': [3.0, 4.0]}), + pd.DataFrame({'pred_target': [[1.0], [2.0]]}), + pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_point) == False # Test case 3: Mixed evaluation types predictions_mixed = [ - pd.DataFrame({'pred_target': [[1, 2], [3, 4]]}), - pd.DataFrame({'pred_target': [5.0, 6.0]}), + pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), + pd.DataFrame({'pred_target': [[5.0], [6.0]]}), ] with pytest.raises(ValueError): EvaluationManager.get_evaluation_type(predictions_mixed) # Test case 4: Single element lists predictions_single_element = [ - pd.DataFrame({'pred_target': [[1], [2]]}), - pd.DataFrame({'pred_target': [[3], [4]]}), + pd.DataFrame({'pred_target': [[1.0], [2.0]]}), + pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_single_element) == False @@ -164,8 +158,8 @@ def test_match_actual_pred_point( mock_actual, mock_point_predictions, mock_uncertainty_predictions, mock_index ): df_matched = [ - pd.DataFrame({"target": [1, 2, 2, 3, 3, 4]}, index=mock_index[0]), - pd.DataFrame({"target": [2, 3, 3, 4, 4, 5]}, index=mock_index[1]), + pd.DataFrame({"target": [[1.0], [2.0], [2.0], [3.0], [3.0], [4.0]]}, index=mock_index[0]), + pd.DataFrame({"target": [[2.0], [3.0], [3.0], [4.0], [4.0], [5.0]]}, index=mock_index[1]), ] for i in range(len(df_matched)): df_matched_actual_point, df_matched_point = ( @@ -186,44 +180,44 @@ def test_match_actual_pred_point( def test_split_dfs_by_step(mock_point_predictions, mock_uncertainty_predictions): df_splitted_point = [ - pd.DataFrame( - {"pred_target": [1, 3, 2, 4]}, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [5, 7, 6, 8]}, + )).dataframe, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [9, 7, 10, 8]}, + )).dataframe, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - ), + )).dataframe, ] df_splitted_uncertainty = [ - pd.DataFrame( - {"pred_target": [[1, 2, 3], [2, 3, 4], [4, 6, 8], [5, 7, 9]]}, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [[3, 4, 5], [4, 5, 6], [6, 8, 10], [7, 9, 11]]}, + )).dataframe, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [[5, 6, 7], [6, 7, 8], [8, 10, 12], [9, 11, 13]]}, + )).dataframe, + _ViewsDataset(pd.DataFrame( + {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - ), + )).dataframe, ] df_splitted_point_test = 
EvaluationManager._split_dfs_by_step( mock_point_predictions @@ -393,3 +387,52 @@ def test_month_wise_evaluation_uncertainty(mock_actual, mock_uncertainty_predict evaluation_dict.keys() ) assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) + + +def test_calculate_ap_point_predictions(): + actual_data = {'target': [[40], [20], [35], [25]]} + pred_data = {'pred_target': [[35], [30], [20], [15]]} + threshold=30 + + matched_actual = pd.DataFrame(actual_data) + matched_pred = pd.DataFrame(pred_data) + + ap_score = EvaluationManager._calculate_ap(matched_actual, matched_pred, 'target', threshold) + + actual_binary = [1, 0, 1, 0] # 40>30, 20<30, 35>30, 25<30 + pred_binary = [1, 1, 0, 0] # 35>30, 30=30, 20<30, 15<30 + from sklearn.metrics import average_precision_score + expected_ap = average_precision_score(actual_binary, pred_binary) + + assert abs(ap_score - expected_ap) < 0.01 + + +def test_calculate_ap_uncertainty_predictions(): + actual_data = {'target': [[40], [20], [35], [25]]} + pred_data = { + 'pred_target': [ + [35, 40, 45], + [30, 35, 40], + [20, 25, 30], + [15, 20, 25] + ] + } + threshold=30 + matched_actual = pd.DataFrame(actual_data) + matched_pred = pd.DataFrame(pred_data) + + ap_score = EvaluationManager._calculate_ap(matched_actual, matched_pred, 'target', threshold) + + pred_values = [35, 40, 45, 30, 35, 40, 20, 25, 30, 15, 20, 25] + actual_values = [40, 40, 40, 20, 20, 20, 35, 35, 35, 25, 25, 25] + actual_binary = [1 if x > threshold else 0 for x in actual_values] + pred_binary = [1 if x >= threshold else 0 for x in pred_values] + + from sklearn.metrics import average_precision_score + expected_ap = average_precision_score(actual_binary, pred_binary) + + assert abs(ap_score - expected_ap) < 0.01 + + + + diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 0f5074a..01b8b37 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -1,17 +1,18 @@ from typing import List, Dict, Tuple, Optional +import logging import pandas as pd import numpy as np import properscoring as ps from sklearn.metrics import ( - root_mean_squared_error, root_mean_squared_log_error, average_precision_score, ) +from scipy.stats import wasserstein_distance, pearsonr from views_evaluation.evaluation.metrics import ( PointEvaluationMetrics, UncertaintyEvaluationMetrics, ) -import logging +from views_pipeline_core.data.handlers import _ViewsDataset logger = logging.getLogger(__name__) @@ -52,19 +53,44 @@ def __init__(self, metrics_list: list): def _calculate_rmsle( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: - return ( - root_mean_squared_error(matched_actual, matched_pred) - if target.startswith("ln") - else root_mean_squared_log_error(matched_actual, matched_pred) - ) + """ + Calculate Root Mean Squared Logarithmic Error (RMSLE) for each prediction. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average RMSLE score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + + actual_expanded = np.repeat(actual_values, + [len(x) for x in matched_pred[f'pred_{target}']]) + + return root_mean_squared_log_error(actual_expanded, pred_values) + @staticmethod def _calculate_crps( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: + """ + Calculate Continuous Ranked Probability Score (CRPS) for each prediction. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average CRPS score + """ return np.mean( [ - ps.crps_ensemble(actual, np.array(pred)) + ps.crps_ensemble(actual[0], np.array(pred)) for actual, pred in zip( matched_actual[target], matched_pred[f"pred_{target}"] ) @@ -73,17 +99,30 @@ def _calculate_crps( @staticmethod def _calculate_ap( - matched_actual: pd.DataFrame, - matched_pred: pd.DataFrame, - target: str, - threshold=0.01, + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, threshold=30 ) -> float: """ Calculate Average Precision (AP) for binary predictions with a threshold. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + threshold (float): Threshold to convert predictions to binary values + + Returns: + float: Average Precision score """ - matched_pred_binary = (matched_pred >= threshold).astype(int) - matched_actual_binary = (matched_actual > 0).astype(int) - return average_precision_score(matched_actual_binary, matched_pred_binary) + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + + actual_expanded = np.repeat(actual_values, + [len(x) for x in matched_pred[f'pred_{target}']]) + + actual_binary = (actual_expanded > threshold).astype(int) + pred_binary = (pred_values >= threshold).astype(int) + + return average_precision_score(actual_binary, pred_binary) @staticmethod def _calculate_brier( @@ -107,7 +146,28 @@ def _calculate_coverage( def _calculate_emd( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: - pass + """ + Calculate Earth Mover's Distance (EMD) between predicted and actual distributions. + EMD measures the minimum amount of work needed to transform one distribution into another. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average EMD score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + + actual_expanded = np.repeat(actual_values, + [len(x) for x in matched_pred[f'pred_{target}']]) + + # Calculate EMD (1D Wasserstein distance) + emd = wasserstein_distance(actual_expanded, pred_values) + + return emd @staticmethod def _calculate_sd( @@ -125,14 +185,81 @@ def _calculate_pEMDiv( def _calculate_pearson( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: - pass + """ + Calculate Pearson correlation coefficient between actual and predicted values. + This measures the linear correlation between predictions and actual values. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Pearson correlation coefficient + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + + actual_expanded = np.repeat(actual_values, + [len(x) for x in matched_pred[f'pred_{target}']]) + + # Calculate Pearson correlation + correlation, _ = pearsonr(actual_expanded, pred_values) + return correlation @staticmethod def _calculate_variogram( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: - pass + """ + Calculate the variogram score between actual and predicted values. + This measures the spatial/temporal correlation structure. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Variogram score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + + actual_expanded = np.repeat(actual_values, + [len(x) for x in matched_pred[f'pred_{target}']]) + + # Calculate empirical variogram for actual values + n = len(actual_expanded) + actual_variogram = np.zeros(n-1) + for i in range(n-1): + actual_variogram[i] = np.mean((actual_expanded[i+1:] - actual_expanded[i])**2) + + # Calculate empirical variogram for predicted values + pred_variogram = np.zeros(n-1) + for i in range(n-1): + pred_variogram[i] = np.mean((pred_values[i+1:] - pred_values[i])**2) + + # Calculate mean squared difference between variograms + variogram_score = np.mean((actual_variogram - pred_variogram)**2) + + return variogram_score + @staticmethod + def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: + """ + Transform the data to normal distribution. 
+ """ + if target.startswith("ln"): + df[target] = df[target].applymap(lambda x: np.exp(x) - 1 if isinstance(x, (list, np.ndarray)) else np.exp(x) - 1) + elif target.startswith("lx"): + df[target] = df[target].applymap(lambda x: np.exp(x) - np.exp(100) if isinstance(x, (list, np.ndarray)) else np.exp(x) - np.exp(100)) + elif target.startswith("lr"): + df[target] = df[target].applymap(lambda x: x if isinstance(x, (list, np.ndarray)) else x) + else: + raise ValueError(f"Target {target} is not a valid target") + return df + @staticmethod def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: """ @@ -144,32 +271,43 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: Returns: bool: True if all DataFrames are for uncertainty evaluation, - False if any DataFrame is suitable for point evaluation. + False if all DataFrame are for point evaluation. Raises: - ValueError: If there is a mix of results (some DataFrames for uncertainty and others for point evaluation). + ValueError: If there is a mix of single and multiple values in the lists, + or if uncertainty lists have different lengths. """ - all_uncertainty = True - all_point = True + is_uncertainty = False + is_point = False + uncertainty_length = None for df in predictions: - if all( - isinstance(value, list) and len(value) >= 2 - for value in df.values.flatten() - ): - all_point = False - else: - all_uncertainty = False - - if all_uncertainty and not all_point: - return True - elif all_point and not all_uncertainty: - return False - else: + for value in df.values.flatten(): + if not isinstance(value, list): + raise ValueError("All values must be lists. Use _ViewsDataset to convert the data.") + + if len(value) > 1: + is_uncertainty = True + # For uncertainty evaluation, check that all lists have the same length + if uncertainty_length is None: + uncertainty_length = len(value) + elif len(value) != uncertainty_length: + raise ValueError( + f"Inconsistent list lengths in uncertainty evaluation. " + f"Found lengths {uncertainty_length} and {len(value)}" + ) + elif len(value) == 1: + is_point = True + else: + raise ValueError("Empty lists are not allowed") + + if is_uncertainty and is_point: raise ValueError( - "Mix of evaluation types detected: some DataFrames are for uncertainty, others for point evaluation." - "Please ensure all DataFrames are consistent in their evaluation type" + "Mix of evaluation types detected: some rows contain single values, others contain multiple values. " + "Please ensure all rows are consistent in their evaluation type" ) + + return is_uncertainty @staticmethod def validate_predictions( @@ -199,16 +337,6 @@ def validate_predictions( raise ValueError( f"Predictions[{i}] must contain only one column named '{pred_column_name}'." 
) - if ( - is_uncertainty - and not df.applymap(lambda x: isinstance(x, list)).all().all() - ): - raise ValueError("Each row in the predictions must be a list.") - if ( - not is_uncertainty - and not df.applymap(lambda x: isinstance(x, (int, float))).all().all() - ): - raise ValueError("Each row in the predictions must be a float.") @staticmethod def _match_actual_pred( @@ -457,6 +585,8 @@ def evaluate( """ is_uncertainty = EvaluationManager.get_evaluation_type(predictions) EvaluationManager.validate_predictions(predictions, target, is_uncertainty) + actual = EvaluationManager.transform_data(_ViewsDataset(actual).dataframe, target) + predictions = [EvaluationManager.transform_data(_ViewsDataset(pred).dataframe, target) for pred in predictions] evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( From d07e1fc56b000598ad4d003b617025c2a65165cb Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 8 May 2025 14:04:46 +0200 Subject: [PATCH 02/15] fix validation --- views_evaluation/evaluation/evaluation_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 01b8b37..326f51a 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -333,9 +333,9 @@ def validate_predictions( raise TypeError(f"Predictions[{i}] must be a DataFrame.") if df.empty: raise ValueError(f"Predictions[{i}] must not be empty.") - if df.columns.tolist() != [pred_column_name]: + if pred_column_name not in df.columns: raise ValueError( - f"Predictions[{i}] must contain only one column named '{pred_column_name}'." + f"Predictions[{i}] must contain the column named '{pred_column_name}'." ) @staticmethod From 7c291809f572cef8f707f99dd4639b0478c76e7b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 8 May 2025 14:57:04 +0200 Subject: [PATCH 03/15] fix errors --- .../evaluation/evaluation_manager.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 326f51a..512eab5 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -250,12 +250,12 @@ def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: """ Transform the data to normal distribution. 
""" - if target.startswith("ln"): - df[target] = df[target].applymap(lambda x: np.exp(x) - 1 if isinstance(x, (list, np.ndarray)) else np.exp(x) - 1) - elif target.startswith("lx"): - df[target] = df[target].applymap(lambda x: np.exp(x) - np.exp(100) if isinstance(x, (list, np.ndarray)) else np.exp(x) - np.exp(100)) - elif target.startswith("lr"): - df[target] = df[target].applymap(lambda x: x if isinstance(x, (list, np.ndarray)) else x) + if target.startswith("ln") or target.startswith("pred_ln"): + df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - 1 if isinstance(x, (list, np.ndarray)) else np.exp(x) - 1) + elif target.startswith("lx") or target.startswith("pred_lx"): + df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - np.exp(100) if isinstance(x, (list, np.ndarray)) else np.exp(x) - np.exp(100)) + elif target.startswith("lr") or target.startswith("pred_lr"): + df[[target]] = df[[target]].applymap(lambda x: x if isinstance(x, (list, np.ndarray)) else x) else: raise ValueError(f"Target {target} is not a valid target") return df @@ -585,8 +585,8 @@ def evaluate( """ is_uncertainty = EvaluationManager.get_evaluation_type(predictions) EvaluationManager.validate_predictions(predictions, target, is_uncertainty) - actual = EvaluationManager.transform_data(_ViewsDataset(actual).dataframe, target) - predictions = [EvaluationManager.transform_data(_ViewsDataset(pred).dataframe, target) for pred in predictions] + actual = EvaluationManager.transform_data(_ViewsDataset(actual, targets=[target]).dataframe, target) + predictions = [EvaluationManager.transform_data(_ViewsDataset(pred).dataframe, f"pred_{target}") for pred in predictions] evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( From ae6e4979bf7e063dbf2e6d00184d09386f97d201 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 23 Jun 2025 14:12:30 +0200 Subject: [PATCH 04/15] error fixed --- views_evaluation/evaluation/evaluation_manager.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 512eab5..1f57172 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -283,8 +283,8 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: for df in predictions: for value in df.values.flatten(): - if not isinstance(value, list): - raise ValueError("All values must be lists. Use _ViewsDataset to convert the data.") + if not (isinstance(value, list) or isinstance(value, np.ndarray)): + raise ValueError("All values must be lists or numpy arrays. Use _ViewsDataset to convert the data.") if len(value) > 1: is_uncertainty = True @@ -311,18 +311,15 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: @staticmethod def validate_predictions( - predictions: List[pd.DataFrame], target: str, is_uncertainty: bool + predictions: List[pd.DataFrame], target: str ): """ Checks if the predictions are valid DataFrames. - Each DataFrame must have exactly one column named `pred_column_name`. - - If is_uncertainty is True, all elements in the column must be lists. - - If is_uncertainty is False, all elements in the column must be floats. Args: predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. target (str): The target column in the actual DataFrame. 
- is_uncertainty (bool): Flag to indicate if the evaluation is for uncertainty. """ pred_column_name = f"pred_{target}" if not isinstance(predictions, list): @@ -583,10 +580,11 @@ def evaluate( steps (List[int]): The steps to evaluate. """ - is_uncertainty = EvaluationManager.get_evaluation_type(predictions) - EvaluationManager.validate_predictions(predictions, target, is_uncertainty) + + EvaluationManager.validate_predictions(predictions, target) actual = EvaluationManager.transform_data(_ViewsDataset(actual, targets=[target]).dataframe, target) predictions = [EvaluationManager.transform_data(_ViewsDataset(pred).dataframe, f"pred_{target}") for pred in predictions] + is_uncertainty = EvaluationManager.get_evaluation_type(predictions) evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( From f14691c11ba30df70ab4c1e7e8f568c17d701197 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 23 Jun 2025 14:12:43 +0200 Subject: [PATCH 05/15] update version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bbb847d..5b66657 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "views_evaluation" -version = "0.2.0" +version = "0.4.0" description = "" authors = [ "Xiaolong Sun ", From a84af23780d086a00cd6a836efb3236226590a89 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 24 Jun 2025 13:46:54 +0200 Subject: [PATCH 06/15] remove dependency on vpc --- .../evaluation/evaluation_manager.py | 149 ++++++++++++------ 1 file changed, 100 insertions(+), 49 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 1f57172..96266dd 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -12,7 +12,6 @@ PointEvaluationMetrics, UncertaintyEvaluationMetrics, ) -from views_pipeline_core.data.handlers import _ViewsDataset logger = logging.getLogger(__name__) @@ -55,7 +54,7 @@ def _calculate_rmsle( ) -> float: """ Calculate Root Mean Squared Logarithmic Error (RMSLE) for each prediction. - + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions @@ -65,13 +64,13 @@ def _calculate_rmsle( float: Average RMSLE score """ actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - actual_expanded = np.repeat(actual_values, - [len(x) for x in matched_pred[f'pred_{target}']]) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) return root_mean_squared_log_error(actual_expanded, pred_values) - @staticmethod def _calculate_crps( @@ -79,7 +78,7 @@ def _calculate_crps( ) -> float: """ Calculate Continuous Ranked Probability Score (CRPS) for each prediction. 
- + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions @@ -99,29 +98,33 @@ def _calculate_crps( @staticmethod def _calculate_ap( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, threshold=30 + matched_actual: pd.DataFrame, + matched_pred: pd.DataFrame, + target: str, + threshold=30, ) -> float: """ Calculate Average Precision (AP) for binary predictions with a threshold. - + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions target (str): The target column name threshold (float): Threshold to convert predictions to binary values - + Returns: float: Average Precision score """ actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - actual_expanded = np.repeat(actual_values, - [len(x) for x in matched_pred[f'pred_{target}']]) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) actual_binary = (actual_expanded > threshold).astype(int) pred_binary = (pred_values >= threshold).astype(int) - + return average_precision_score(actual_binary, pred_binary) @staticmethod @@ -149,7 +152,7 @@ def _calculate_emd( """ Calculate Earth Mover's Distance (EMD) between predicted and actual distributions. EMD measures the minimum amount of work needed to transform one distribution into another. - + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions @@ -159,10 +162,11 @@ def _calculate_emd( float: Average EMD score """ actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - actual_expanded = np.repeat(actual_values, - [len(x) for x in matched_pred[f'pred_{target}']]) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) # Calculate EMD (1D Wasserstein distance) emd = wasserstein_distance(actual_expanded, pred_values) @@ -188,7 +192,7 @@ def _calculate_pearson( """ Calculate Pearson correlation coefficient between actual and predicted values. This measures the linear correlation between predictions and actual values. - + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions @@ -198,10 +202,11 @@ def _calculate_pearson( float: Pearson correlation coefficient """ actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - actual_expanded = np.repeat(actual_values, - [len(x) for x in matched_pred[f'pred_{target}']]) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) # Calculate Pearson correlation correlation, _ = pearsonr(actual_expanded, pred_values) @@ -214,7 +219,7 @@ def _calculate_variogram( """ Calculate the variogram score between actual and predicted values. This measures the spatial/temporal correlation structure. 
- + Args: matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions @@ -224,24 +229,27 @@ def _calculate_variogram( float: Variogram score """ actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f'pred_{target}'].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - actual_expanded = np.repeat(actual_values, - [len(x) for x in matched_pred[f'pred_{target}']]) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) # Calculate empirical variogram for actual values n = len(actual_expanded) - actual_variogram = np.zeros(n-1) - for i in range(n-1): - actual_variogram[i] = np.mean((actual_expanded[i+1:] - actual_expanded[i])**2) + actual_variogram = np.zeros(n - 1) + for i in range(n - 1): + actual_variogram[i] = np.mean( + (actual_expanded[i + 1 :] - actual_expanded[i]) ** 2 + ) # Calculate empirical variogram for predicted values - pred_variogram = np.zeros(n-1) - for i in range(n-1): - pred_variogram[i] = np.mean((pred_values[i+1:] - pred_values[i])**2) + pred_variogram = np.zeros(n - 1) + for i in range(n - 1): + pred_variogram[i] = np.mean((pred_values[i + 1 :] - pred_values[i]) ** 2) # Calculate mean squared difference between variograms - variogram_score = np.mean((actual_variogram - pred_variogram)**2) + variogram_score = np.mean((actual_variogram - pred_variogram) ** 2) return variogram_score @@ -251,15 +259,47 @@ def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: Transform the data to normal distribution. """ if target.startswith("ln") or target.startswith("pred_ln"): - df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - 1 if isinstance(x, (list, np.ndarray)) else np.exp(x) - 1) + df[[target]] = df[[target]].applymap( + lambda x: ( + np.exp(x) - 1 + if isinstance(x, (list, np.ndarray)) + else np.exp(x) - 1 + ) + ) elif target.startswith("lx") or target.startswith("pred_lx"): - df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - np.exp(100) if isinstance(x, (list, np.ndarray)) else np.exp(x) - np.exp(100)) + df[[target]] = df[[target]].applymap( + lambda x: ( + np.exp(x) - np.exp(100) + if isinstance(x, (list, np.ndarray)) + else np.exp(x) - np.exp(100) + ) + ) elif target.startswith("lr") or target.startswith("pred_lr"): - df[[target]] = df[[target]].applymap(lambda x: x if isinstance(x, (list, np.ndarray)) else x) + df[[target]] = df[[target]].applymap( + lambda x: x if isinstance(x, (list, np.ndarray)) else x + ) else: raise ValueError(f"Target {target} is not a valid target") return df - + + @staticmethod + def convert_to_arrays(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert columns in a DataFrame to numpy arrays. + + Args: + df (pd.DataFrame): The input DataFrame with columns that may contain lists. + + Returns: + pd.DataFrame: A new DataFrame with columns converted to numpy arrays. + """ + converted = df.copy() + for col in converted.columns: + converted[col] = converted[col].apply( + lambda x: np.array(x) if isinstance(x, list) else np.array([x]) + ) + return converted + @staticmethod def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: """ @@ -283,9 +323,11 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: for df in predictions: for value in df.values.flatten(): - if not (isinstance(value, list) or isinstance(value, np.ndarray)): - raise ValueError("All values must be lists or numpy arrays. 
Use _ViewsDataset to convert the data.") - + if not isinstance(value, np.ndarray): + raise ValueError( + "All values must be lists or numpy arrays. Use _ViewsDataset to convert the data." + ) + if len(value) > 1: is_uncertainty = True # For uncertainty evaluation, check that all lists have the same length @@ -306,13 +348,11 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: "Mix of evaluation types detected: some rows contain single values, others contain multiple values. " "Please ensure all rows are consistent in their evaluation type" ) - + return is_uncertainty @staticmethod - def validate_predictions( - predictions: List[pd.DataFrame], target: str - ): + def validate_predictions(predictions: List[pd.DataFrame], target: str): """ Checks if the predictions are valid DataFrames. - Each DataFrame must have exactly one column named `pred_column_name`. @@ -520,7 +560,7 @@ def month_wise_evaluation( month_range = pred_concat.index.get_level_values(0).unique() month_start = month_range.min() month_end = month_range.max() - + if is_uncertainty: evaluation_dict = ( UncertaintyEvaluationMetrics.make_month_wise_evaluation_dict( @@ -580,10 +620,17 @@ def evaluate( steps (List[int]): The steps to evaluate. """ - + EvaluationManager.validate_predictions(predictions, target) - actual = EvaluationManager.transform_data(_ViewsDataset(actual, targets=[target]).dataframe, target) - predictions = [EvaluationManager.transform_data(_ViewsDataset(pred).dataframe, f"pred_{target}") for pred in predictions] + actual = EvaluationManager.transform_data( + EvaluationManager.convert_to_arrays(actual), target + ) + predictions = [ + EvaluationManager.transform_data( + EvaluationManager.convert_to_arrays(pred), f"pred_{target}" + ) + for pred in predictions + ] is_uncertainty = EvaluationManager.get_evaluation_type(predictions) evaluation_results = {} @@ -594,7 +641,11 @@ def evaluate( actual, predictions, target, is_uncertainty ) evaluation_results["step"] = self.step_wise_evaluation( - actual, predictions, target, steps, is_uncertainty, + actual, + predictions, + target, + steps, + is_uncertainty, ) return evaluation_results From 5d52a543c7288a83392eec38737b8b778d92fa0e Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 24 Jun 2025 14:08:27 +0200 Subject: [PATCH 07/15] fix test --- tests/test_evaluation_manager.py | 43 ++++++++----------- .../evaluation/evaluation_manager.py | 6 +-- 2 files changed, 20 insertions(+), 29 deletions(-) diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 8137f2c..9fa0c31 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -5,7 +5,6 @@ from sklearn.metrics import root_mean_squared_log_error import properscoring as ps from views_evaluation.evaluation.evaluation_manager import EvaluationManager -from views_pipeline_core.data.handlers import _ViewsDataset @pytest.fixture @@ -61,14 +60,14 @@ def mock_actual(): }, index=index, ) - return _ViewsDataset(df, targets=["target"]).dataframe + return EvaluationManager.convert_to_arrays(df) @pytest.fixture def mock_point_predictions(mock_index): df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) - return [_ViewsDataset(df1).dataframe, _ViewsDataset(df2).dataframe] + return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] @pytest.fixture @@ -99,30 
+98,22 @@ def mock_uncertainty_predictions(mock_index): }, index=mock_index[1], ) - return [_ViewsDataset(df1).dataframe, _ViewsDataset(df2).dataframe] + return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] def test_validate_dataframes_valid_type(mock_point_predictions): with pytest.raises(TypeError): EvaluationManager.validate_predictions( - mock_point_predictions[0], "target", is_uncertainty=False + mock_point_predictions[0], "target" ) def test_validate_dataframes_valid_columns(mock_point_predictions): with pytest.raises(ValueError): EvaluationManager.validate_predictions( - mock_point_predictions, "y", is_uncertainty=False + mock_point_predictions, "y" ) - -def test_validate_dataframes_valid_uncertainty(mock_point_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_point_predictions, "devpar", is_uncertainty=True - ) - - def test_get_evaluation_type(): # Test case 1: All DataFrames for uncertainty evaluation predictions_uncertainty = [ @@ -180,44 +171,44 @@ def test_match_actual_pred_point( def test_split_dfs_by_step(mock_point_predictions, mock_uncertainty_predictions): df_splitted_point = [ - _ViewsDataset(pd.DataFrame( + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - )).dataframe, - _ViewsDataset(pd.DataFrame( + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - )).dataframe, - _ViewsDataset(pd.DataFrame( + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - )).dataframe, + )), ] df_splitted_uncertainty = [ - _ViewsDataset(pd.DataFrame( + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - )).dataframe, - _ViewsDataset(pd.DataFrame( + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - )).dataframe, - _ViewsDataset(pd.DataFrame( + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - )).dataframe, + )), ] df_splitted_point_test = EvaluationManager._split_dfs_by_step( mock_point_predictions diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 96266dd..7559f42 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -323,11 +323,11 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: for df in predictions: for value in df.values.flatten(): - if not isinstance(value, np.ndarray): + if not (isinstance(value, np.ndarray) or isinstance(value, list)): raise ValueError( - "All values must be lists or numpy arrays. 
Use _ViewsDataset to convert the data." + "All values must be lists or numpy arrays. Convert the data." ) - + if len(value) > 1: is_uncertainty = True # For uncertainty evaluation, check that all lists have the same length From 65c4c97d8e9032d334c9235f1348ad434d004117 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:14:20 +0200 Subject: [PATCH 08/15] categorize metrics for point and uncertainty --- views_evaluation/evaluation/metrics.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 70f158e..36b2cb5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -121,9 +121,6 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): RMSLE: Optional[float] = None CRPS: Optional[float] = None AP: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None - Coverage: Optional[float] = None EMD: Optional[float] = None SD: Optional[float] = None pEMDiv: Optional[float] = None @@ -140,4 +137,10 @@ class UncertaintyEvaluationMetrics(BaseEvaluationMetrics): CRPS (Optional[float]): Continuous Ranked Probability Score. """ - CRPS: Optional[float] = None \ No newline at end of file + CRPS: Optional[float] = None + MIS: Optional[float] = None + Ignorance: Optional[float] = None + Brier: Optional[float] = None + Jeffreys: Optional[float] = None + Coverage: Optional[float] = None + \ No newline at end of file From d65639c8cbc7aad1319160db259d8b3f6373d83c Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:14:42 +0200 Subject: [PATCH 09/15] separate metric calculation to a new file --- .../evaluation/metric_calculators.py | 372 ++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 views_evaluation/evaluation/metric_calculators.py diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py new file mode 100644 index 0000000..7c2d67b --- /dev/null +++ b/views_evaluation/evaluation/metric_calculators.py @@ -0,0 +1,372 @@ +from typing import List, Dict, Tuple, Optional +from collections import Counter +import pandas as pd +import numpy as np +import properscoring as ps +from sklearn.metrics import ( + root_mean_squared_log_error, + average_precision_score, +) +from scipy.stats import wasserstein_distance, pearsonr + + +def calculate_rmsle( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Root Mean Squared Logarithmic Error (RMSLE) for each prediction. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average RMSLE score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + return root_mean_squared_log_error(actual_expanded, pred_values) + + +def calculate_crps( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Continuous Ranked Probability Score (CRPS) for each prediction. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average CRPS score + """ + return np.mean( + [ + ps.crps_ensemble(actual[0], np.array(pred)) + for actual, pred in zip( + matched_actual[target], matched_pred[f"pred_{target}"] + ) + ] + ) + + +def calculate_ap( + matched_actual: pd.DataFrame, + matched_pred: pd.DataFrame, + target: str, + threshold=25, +) -> float: + """ + Calculate Average Precision (AP) for binary predictions with a threshold. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + threshold (float): Threshold to convert predictions to binary values + + Returns: + float: Average Precision score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + actual_binary = (actual_expanded > threshold).astype(int) + pred_binary = (pred_values >= threshold).astype(int) + + return average_precision_score(actual_binary, pred_binary) + + +def calculate_emd( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Earth Mover's Distance (EMD) between predicted and actual distributions. + EMD measures the minimum amount of work needed to transform one distribution into another. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average EMD score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + emd = wasserstein_distance(actual_expanded, pred_values) + + return emd + + +def calculate_sd( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Sinkhorn Distance between predicted and actual distributions. + + Sinkhorn Distance is a regularized version of the Earth Mover's Distance + that is computationally more efficient. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Sinkhorn Distance score + """ + raise NotImplementedError("Sinkhorn Distance calculation not yet implemented") + + +def calculate_pEMDiv( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate pseudo-Earth Mover Divergence between predicted and actual distributions. + + pEMDiv is a computationally efficient approximation of the Earth Mover's Distance. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: pEMDiv score + """ + raise NotImplementedError("pEMDiv calculation not yet implemented") + + +def calculate_pearson( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Pearson correlation coefficient between actual and predicted values. + This measures the linear correlation between predictions and actual values. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Pearson correlation coefficient + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + correlation, _ = pearsonr(actual_expanded, pred_values) + return correlation + + +def calculate_variogram( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + !! How to accountr for time and location? + Calculate the variogram score between actual and predicted values. + This measures the spatial/temporal correlation structure. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Variogram score + """ + raise NotImplementedError("Variogram calculation not yet implemented") + + +def calculate_brier( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Brier Score for probabilistic predictions. + + The Brier Score measures the accuracy of probabilistic predictions. + Lower values indicate better predictions. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Brier Score + """ + raise NotImplementedError("Brier Score calculation not yet implemented") + + +def calculate_jeffreys( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Jeffreys Divergence between predicted and actual distributions. + + Jeffreys Divergence is a symmetric measure of the difference between + two probability distributions. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Jeffreys Divergence score + """ + raise NotImplementedError("Jeffreys Divergence calculation not yet implemented") + + +def calculate_coverage( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Coverage (Histograms) for probabilistic predictions. + + Coverage measures how well the predicted distribution covers the actual values. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Coverage score + """ + raise NotImplementedError("Coverage calculation not yet implemented") + + +def calculate_mean_interval_score( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, alpha=0.05 +): + """ + Calculate the Mean Interval Score (MIS) for probabilistic predictions. + + The Mean Interval Score measures the average width of prediction intervals + and the coverage of the actual values. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + alpha (float): Significance level for the interval (default: 0.05) + + Returns: + float: Mean Interval Score + """ + lower = np.array( + [np.quantile(row, q=alpha / 2) for row in matched_pred[f"pred_{target}"]] + ) + upper = np.array( + [np.quantile(row, q=1 - (alpha / 2)) for row in matched_pred[f"pred_{target}"]] + ) + actuals = np.array( + [ + row[0] if isinstance(row, (np.ndarray, list)) else row + for row in matched_actual[target] + ] + ) + + interval_width = upper - lower + lower_coverage = (2 / alpha) * (lower - actuals) * (actuals < lower) + upper_coverage = (2 / alpha) * (actuals - upper) * (actuals > upper) + interval_score = interval_width + lower_coverage + upper_coverage + + return np.mean(interval_score) + + +def calculate_ignorance_score( + matched_actual: pd.DataFrame, + matched_pred: pd.DataFrame, + target: str, + bins=[0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], + low_bin=0, + high_bin=10000, +): + """ + !!Note unfinished. Bins need to be fixed bacause in competition we evaluate over log values but not here. + This is an adapted version from https://github.com/prio-data/prediction_competition_2023/tree/main + Compute Binned Ignorance Score for predictions and observations. + + Parameters: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + bins (list): List of bins for the histogram + low_bin (float): The lower bound of the bins + high_bin (float): The upper bound of the bins + + Returns: + float: Mean ignorance score. 
+ """ + + def digitize_minus_one(x, edges): + return np.digitize(x, edges, right=False) - 1 + + def _calculate_ignorance_score(predictions, observed, n): + c = Counter(predictions) + prob = c[observed] / n + return -np.log2(prob) + + scores = [] + for row_p, row_o in zip(matched_pred[f"pred_{target}"], matched_actual[target]): + preds = np.asarray(row_p) + truth = float(np.asarray(row_o).squeeze()) + + edges = np.histogram_bin_edges(preds, bins=bins, range=(low_bin, high_bin)) + + binned_preds = digitize_minus_one(preds, edges) + binned_obs = digitize_minus_one([truth], edges)[0] + + synthetic = np.arange(len(edges) - 1) + binned_preds = np.concatenate([binned_preds, synthetic]) + + n = len(binned_preds) + score = _calculate_ignorance_score(binned_preds, binned_obs, n) + scores.append(score) + + return np.mean(scores) + + +POINT_METRIC_FUNCTIONS = { + "RMSLE": calculate_rmsle, + "CRPS": calculate_crps, + "AP": calculate_ap, + "EMD": calculate_emd, + "SD": calculate_sd, + "pEMDiv": calculate_pEMDiv, + "Pearson": calculate_pearson, + "Variogram": calculate_variogram, +} + +UNCERTAINTY_METRIC_FUNCTIONS = { + "CRPS": calculate_crps, + "MIS": calculate_mean_interval_score, + "Ignorance": calculate_ignorance_score, + "Brier": calculate_brier, + "Jeffreys": calculate_jeffreys, + "Coverage": calculate_coverage, +} From cfe44e3d4a10bf3755863a5624bd3affbda98224 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:14:59 +0200 Subject: [PATCH 10/15] update evaluation manager accordingly --- .../evaluation/evaluation_manager.py | 248 ++---------------- 1 file changed, 16 insertions(+), 232 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 7559f42..9b0f859 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -2,16 +2,14 @@ import logging import pandas as pd import numpy as np -import properscoring as ps -from sklearn.metrics import ( - root_mean_squared_log_error, - average_precision_score, -) -from scipy.stats import wasserstein_distance, pearsonr from views_evaluation.evaluation.metrics import ( PointEvaluationMetrics, UncertaintyEvaluationMetrics, ) +from views_evaluation.evaluation.metric_calculators import ( + POINT_METRIC_FUNCTIONS, + UNCERTAINTY_METRIC_FUNCTIONS, +) logger = logging.getLogger(__name__) @@ -31,227 +29,8 @@ def __init__(self, metrics_list: list): """ self.metrics_list = metrics_list - self.point_metric_functions = { - "RMSLE": self._calculate_rmsle, - "CRPS": self._calculate_crps, - "AP": self._calculate_ap, - "Brier": self._calculate_brier, - "Jeffreys": self._calculate_jeffreys, - "Coverage": self._calculate_coverage, - "EMD": self._calculate_emd, - "SD": self._calculate_sd, - "pEMDiv": self._calculate_pEMDiv, - "Pearson": self._calculate_pearson, - "Variogram": self._calculate_variogram, - } - self.uncertainty_metric_functions = { - "CRPS": self._calculate_crps, - } - - @staticmethod - def _calculate_rmsle( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - """ - Calculate Root Mean Squared Logarithmic Error (RMSLE) for each prediction. 
- - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - - Returns: - float: Average RMSLE score - """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - return root_mean_squared_log_error(actual_expanded, pred_values) - - @staticmethod - def _calculate_crps( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - """ - Calculate Continuous Ranked Probability Score (CRPS) for each prediction. - - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - - Returns: - float: Average CRPS score - """ - return np.mean( - [ - ps.crps_ensemble(actual[0], np.array(pred)) - for actual, pred in zip( - matched_actual[target], matched_pred[f"pred_{target}"] - ) - ] - ) - - @staticmethod - def _calculate_ap( - matched_actual: pd.DataFrame, - matched_pred: pd.DataFrame, - target: str, - threshold=30, - ) -> float: - """ - Calculate Average Precision (AP) for binary predictions with a threshold. - - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - threshold (float): Threshold to convert predictions to binary values - - Returns: - float: Average Precision score - """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - actual_binary = (actual_expanded > threshold).astype(int) - pred_binary = (pred_values >= threshold).astype(int) - - return average_precision_score(actual_binary, pred_binary) - - @staticmethod - def _calculate_brier( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_jeffreys( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_coverage( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_emd( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - """ - Calculate Earth Mover's Distance (EMD) between predicted and actual distributions. - EMD measures the minimum amount of work needed to transform one distribution into another. 
- - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - - Returns: - float: Average EMD score - """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - # Calculate EMD (1D Wasserstein distance) - emd = wasserstein_distance(actual_expanded, pred_values) - - return emd - - @staticmethod - def _calculate_sd( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_pEMDiv( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_pearson( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - """ - Calculate Pearson correlation coefficient between actual and predicted values. - This measures the linear correlation between predictions and actual values. - - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - - Returns: - float: Pearson correlation coefficient - """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - # Calculate Pearson correlation - correlation, _ = pearsonr(actual_expanded, pred_values) - return correlation - - @staticmethod - def _calculate_variogram( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - """ - Calculate the variogram score between actual and predicted values. - This measures the spatial/temporal correlation structure. - - Args: - matched_actual (pd.DataFrame): DataFrame containing actual values - matched_pred (pd.DataFrame): DataFrame containing predictions - target (str): The target column name - - Returns: - float: Variogram score - """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - # Calculate empirical variogram for actual values - n = len(actual_expanded) - actual_variogram = np.zeros(n - 1) - for i in range(n - 1): - actual_variogram[i] = np.mean( - (actual_expanded[i + 1 :] - actual_expanded[i]) ** 2 - ) - - # Calculate empirical variogram for predicted values - pred_variogram = np.zeros(n - 1) - for i in range(n - 1): - pred_variogram[i] = np.mean((pred_values[i + 1 :] - pred_values[i]) ** 2) - - # Calculate mean squared difference between variograms - variogram_score = np.mean((actual_variogram - pred_variogram) ** 2) - - return variogram_score + self.point_metric_functions = POINT_METRIC_FUNCTIONS + self.uncertainty_metric_functions = UNCERTAINTY_METRIC_FUNCTIONS @staticmethod def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: @@ -436,6 +215,7 @@ def step_wise_evaluation( target: str, steps: List[int], is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions step-wise and calculates the specified metrics. 
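# Illustrative sketch (not part of the applied diff): the **kwargs added to
# step_wise_evaluation above (and to the other *_wise_evaluation methods and
# evaluate() below) are forwarded verbatim to whichever metric functions are
# selected from POINT_METRIC_FUNCTIONS / UNCERTAINTY_METRIC_FUNCTIONS, so
# metric-specific parameters such as the MIS significance level can be set at
# the evaluate() call site. The index layout used here (month_id/country_id
# levels, months 100-103) is an assumption modelled on the quickstart example,
# and only metrics that accept the forwarded keyword should be requested
# together: adding e.g. "CRPS", which takes no extra keyword, would raise a
# TypeError once alpha is passed down.
import pandas as pd

from views_evaluation.evaluation.evaluation_manager import EvaluationManager

index = pd.MultiIndex.from_product(
    [[100, 101, 102, 103], [1, 2]], names=["month_id", "country_id"]
)
index_0 = pd.MultiIndex.from_product(
    [[100, 101], [1, 2]], names=["month_id", "country_id"]
)
index_1 = pd.MultiIndex.from_product(
    [[101, 102], [1, 2]], names=["month_id", "country_id"]
)

# Minimal actual frame (target column only) and two uncertainty forecasts,
# each prediction row holding a list of samples.
df_actual = pd.DataFrame({"lr_target": [0, 1, 1, 2, 2, 3, 3, 4]}, index=index)
dfs_uncertainty = [
    pd.DataFrame(
        {"pred_lr_target": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0
    ),
    pd.DataFrame(
        {"pred_lr_target": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1
    ),
]

# alpha travels evaluate() -> month/time-series/step-wise evaluation -> MIS,
# so the same central 90% prediction interval is scored in all three breakdowns.
manager = EvaluationManager(["MIS"])
results = manager.evaluate(
    df_actual, dfs_uncertainty, target="lr_target", steps=[1, 2], alpha=0.1
)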
@@ -463,7 +243,6 @@ def step_wise_evaluation( ) metric_functions = self.point_metric_functions - step_metrics = {} result_dfs = EvaluationManager._split_dfs_by_step(predictions) for metric in self.metrics_list: @@ -475,7 +254,7 @@ def step_wise_evaluation( ) evaluation_dict[f"step{str(step).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target), + metric_functions[metric](matched_actual, matched_pred, target, **kwargs), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -491,6 +270,7 @@ def time_series_wise_evaluation( predictions: List[pd.DataFrame], target: str, is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions time series-wise and calculates the specified metrics. @@ -527,7 +307,7 @@ def time_series_wise_evaluation( ) evaluation_dict[f"ts{str(i).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target), + metric_functions[metric](matched_actual, matched_pred, target, **kwargs), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -543,6 +323,7 @@ def month_wise_evaluation( predictions: List[pd.DataFrame], target: str, is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions month-wise and calculates the specified metrics. @@ -588,6 +369,7 @@ def month_wise_evaluation( matched_actual.loc[df.index, [target]], matched_pred.loc[df.index, [f"pred_{target}"]], target, + **kwargs, ) ) @@ -609,6 +391,7 @@ def evaluate( predictions: List[pd.DataFrame], target: str, steps: List[int], + **kwargs, ): """ Evaluates the predictions and calculates the specified point metrics. @@ -635,10 +418,10 @@ def evaluate( evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( - actual, predictions, target, is_uncertainty + actual, predictions, target, is_uncertainty, **kwargs ) evaluation_results["time_series"] = self.time_series_wise_evaluation( - actual, predictions, target, is_uncertainty + actual, predictions, target, is_uncertainty, **kwargs ) evaluation_results["step"] = self.step_wise_evaluation( actual, @@ -646,6 +429,7 @@ def evaluate( target, steps, is_uncertainty, + **kwargs, ) return evaluation_results From 5081dce0c4be070b9a414de6b0257ace1c4ca92b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:15:17 +0200 Subject: [PATCH 11/15] update test --- tests/test_evaluation_manager.py | 6 +- tests/test_metric_calculators.py | 150 +++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 tests/test_metric_calculators.py diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 9fa0c31..46aec9c 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -388,7 +388,8 @@ def test_calculate_ap_point_predictions(): matched_actual = pd.DataFrame(actual_data) matched_pred = pd.DataFrame(pred_data) - ap_score = EvaluationManager._calculate_ap(matched_actual, matched_pred, 'target', threshold) + from views_evaluation.evaluation.metric_calculators import calculate_ap + ap_score = calculate_ap(matched_actual, matched_pred, 'target', threshold) actual_binary = [1, 0, 1, 0] # 40>30, 20<30, 35>30, 25<30 pred_binary = [1, 1, 0, 0] # 35>30, 30=30, 20<30, 15<30 @@ -412,7 +413,8 @@ def test_calculate_ap_uncertainty_predictions(): matched_actual = pd.DataFrame(actual_data) matched_pred = pd.DataFrame(pred_data) - ap_score = 
EvaluationManager._calculate_ap(matched_actual, matched_pred, 'target', threshold) + from views_evaluation.evaluation.metric_calculators import calculate_ap + ap_score = calculate_ap(matched_actual, matched_pred, 'target', threshold) pred_values = [35, 40, 45, 30, 35, 40, 20, 25, 30, 15, 20, 25] actual_values = [40, 40, 40, 20, 20, 20, 35, 35, 35, 25, 25, 25] diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py new file mode 100644 index 0000000..0baa5b0 --- /dev/null +++ b/tests/test_metric_calculators.py @@ -0,0 +1,150 @@ +import pytest +import pandas as pd +import numpy as np +from views_evaluation.evaluation.metric_calculators import ( + calculate_rmsle, + calculate_crps, + calculate_ap, + calculate_emd, + calculate_pearson, + calculate_variogram, + calculate_ignorance_score, + calculate_mean_interval_score, + POINT_METRIC_FUNCTIONS, + UNCERTAINTY_METRIC_FUNCTIONS, +) + + +@pytest.fixture +def sample_data(): + """Create sample data for testing.""" + actual = pd.DataFrame({ + 'target': [[1.0], [2.0], [3.0], [4.0]] + }) + pred = pd.DataFrame({ + 'pred_target': [[1.1], [1.9], [3.1], [3.9]] + }) + return actual, pred + + +@pytest.fixture +def sample_uncertainty_data(): + """Create sample uncertainty data for testing.""" + actual = pd.DataFrame({ + 'target': [[1.0], [2.0], [3.0], [4.0]] + }) + pred = pd.DataFrame({ + 'pred_target': [[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]] + }) + return actual, pred + + +def test_calculate_rmsle(sample_data): + """Test RMSLE calculation.""" + actual, pred = sample_data + result = calculate_rmsle(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_crps(sample_uncertainty_data): + """Test CRPS calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_crps(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_ap(sample_data): + """Test Average Precision calculation.""" + actual, pred = sample_data + result = calculate_ap(actual, pred, 'target', threshold=2.5) + assert isinstance(result, float) + assert 0 <= result <= 1 + + +def test_calculate_emd(sample_data): + """Test Earth Mover's Distance calculation.""" + actual, pred = sample_data + result = calculate_emd(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_pearson(sample_data): + """Test Pearson correlation calculation.""" + actual, pred = sample_data + result = calculate_pearson(actual, pred, 'target') + assert isinstance(result, float) + assert -1 <= result <= 1 + + +def test_calculate_variogram(sample_data): + """Test Variogram calculation.""" + actual, pred = sample_data + result = calculate_variogram(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_ignorance_score(sample_uncertainty_data): + """Test Ignorance Score calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_ignorance_score(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_mis(sample_uncertainty_data): + """Test Mean Interval Score calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_mean_interval_score(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_point_metric_functions(): + """Test that all point metric functions are available.""" + expected_metrics = [ + "RMSLE", "CRPS", "AP", "Brier", "Jeffreys", + 
"Coverage", "EMD", "SD", "pEMDiv", "Pearson", "Variogram" + ] + + for metric in expected_metrics: + assert metric in POINT_METRIC_FUNCTIONS + assert callable(POINT_METRIC_FUNCTIONS[metric]) + + +def test_uncertainty_metric_functions(): + """Test that all uncertainty metric functions are available.""" + expected_metrics = ["CRPS"] + + for metric in expected_metrics: + assert metric in UNCERTAINTY_METRIC_FUNCTIONS + assert callable(UNCERTAINTY_METRIC_FUNCTIONS[metric]) + + +def test_not_implemented_metrics(): + """Test that unimplemented metrics raise NotImplementedError.""" + actual = pd.DataFrame({'target': [[1.0]]}) + pred = pd.DataFrame({'pred_target': [[1.0]]}) + + from views_evaluation.evaluation.metric_calculators import ( + calculate_brier, + calculate_jeffreys, + calculate_coverage, + calculate_sd, + calculate_pEMDiv, + ) + + unimplemented_functions = [ + calculate_brier, + calculate_jeffreys, + calculate_coverage, + calculate_sd, + calculate_pEMDiv, + ] + + for func in unimplemented_functions: + with pytest.raises(NotImplementedError): + func(actual, pred, 'target') \ No newline at end of file From 01cddd6bc4c76194b8663ccbdb1dc344a48900ec Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:15:27 +0200 Subject: [PATCH 12/15] update quick start --- examples/quickstart.ipynb | 103 ++++++++++++++++++++++++++++++-------- 1 file changed, 82 insertions(+), 21 deletions(-) diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index 2184465..084dba4 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -93,14 +93,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Actual data\n", "df_actual = pd.DataFrame(\n", " {\n", - " \"target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", + " \"lr_target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", " \"covariate_1\": [3, 2, 4, 5, 2, 6, 8, 5],\n", " },\n", " index=index,\n", @@ -109,21 +109,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Point predictions\n", - "df1_point = pd.DataFrame({\"pred_target\": [1, 3, 5, 7]}, index=index_0)\n", - "df2_point = pd.DataFrame({\"pred_target\": [2, 4, 6, 8]}, index=index_1)\n", + "df1_point = pd.DataFrame({\"pred_lr_target\": [1, 3, 5, 7]}, index=index_0)\n", + "df2_point = pd.DataFrame({\"pred_lr_target\": [2, 4, 6, 8]}, index=index_1)\n", "dfs_point = [df1_point, df2_point]\n", "\n", "# Uncertainty\n", "df1_uncertainty = pd.DataFrame(\n", - " {\"pred_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", + " {\"pred_lr_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", ")\n", "df2_uncertainty = pd.DataFrame(\n", - " {\"pred_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", + " {\"pred_lr_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", ")\n", "dfs_uncertainty = [df1_uncertainty, df2_uncertainty]" ] @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "metrics_list = ['RMSLE', 'CRPS'] # Add other metrics as needed\n", + "metrics_list = ['RMSLE', 'CRPS', 'MIS'] # Add other metrics as needed\n", "evaluation_manager = EvaluationManager(metrics_list)" ] }, @@ -162,17 +162,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 
19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metric MIS is not a default metric, skipping...\n", + "Metric MIS is not a default metric, skipping...\n", + "Metric MIS is not a default metric, skipping...\n" + ] + } + ], "source": [ "steps = [1, 2]\n", - "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='target', steps=steps)" + "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', steps=steps)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -190,7 +200,7 @@ " ts01 0.420849 2.0)" ] }, - "execution_count": 36, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -208,21 +218,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Metric RMSLE is not a default metric, skipping...\n", + "Metric RMSLE is not a default metric, skipping...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "Metric RMSLE is not a default metric, skipping...\n", "Metric RMSLE is not a default metric, skipping...\n" ] } ], "source": [ - "uncertainty_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='target', steps=steps)" + "uncertainty_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', steps=steps)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "( CRPS MIS\n", + " month100 0.555556 3.90\n", + " month101 2.333333 65.85\n", + " month102 4.111111 127.80,\n", + " CRPS MIS\n", + " step01 1.833333 45.85\n", + " step02 2.833333 85.85,\n", + " CRPS MIS\n", + " ts00 1.055556 23.9\n", + " ts01 3.611111 107.8)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uncertainty_evaluation_results['month'][1], uncertainty_evaluation_results['step'][1], uncertainty_evaluation_results['time_series'][1]" ] }, { @@ -234,18 +279,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Get the evaluation type, i.e., uncertainty or point\n", - "is_uncertainty = EvaluationManager.get_evaluation_type(dfs_point)\n", - "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(df_actual, dfs_point, target='target', is_uncertainty=is_uncertainty)" + "actual = EvaluationManager.transform_data(\n", + " EvaluationManager.convert_to_arrays(df_actual), 'lr_target'\n", + " )\n", + "predictions = [\n", + " EvaluationManager.transform_data(\n", + " EvaluationManager.convert_to_arrays(pred), f\"pred_lr_target\"\n", + " )\n", + " for pred in dfs_point\n", + "]\n", + "is_uncertainty = EvaluationManager.get_evaluation_type(predictions)\n", + "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(actual, predictions, target='lr_target', is_uncertainty=is_uncertainty)" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -262,6 +316,13 @@ "source": [ "print(month_point_evaluation_results[1])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 50f8c5c4b0b25e998469f79279fbbd29fe836234 Mon Sep 17 00:00:00 2001 From: 
xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 1 Jul 2025 14:19:01 +0200 Subject: [PATCH 13/15] fix test --- tests/test_metric_calculators.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 0baa5b0..03ec89f 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -79,14 +79,6 @@ def test_calculate_pearson(sample_data): assert -1 <= result <= 1 -def test_calculate_variogram(sample_data): - """Test Variogram calculation.""" - actual, pred = sample_data - result = calculate_variogram(actual, pred, 'target') - assert isinstance(result, float) - assert result >= 0 - - def test_calculate_ignorance_score(sample_uncertainty_data): """Test Ignorance Score calculation.""" actual, pred = sample_uncertainty_data @@ -106,8 +98,7 @@ def test_calculate_mis(sample_uncertainty_data): def test_point_metric_functions(): """Test that all point metric functions are available.""" expected_metrics = [ - "RMSLE", "CRPS", "AP", "Brier", "Jeffreys", - "Coverage", "EMD", "SD", "pEMDiv", "Pearson", "Variogram" + "RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", "Variogram" ] for metric in expected_metrics: @@ -117,7 +108,7 @@ def test_point_metric_functions(): def test_uncertainty_metric_functions(): """Test that all uncertainty metric functions are available.""" - expected_metrics = ["CRPS"] + expected_metrics = ["CRPS", "MIS", "Ignorance", "Brier", "Jeffreys", "Coverage"] for metric in expected_metrics: assert metric in UNCERTAINTY_METRIC_FUNCTIONS From d921c17e311de91598bb871695a47b266f866f0a Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 3 Jul 2025 13:00:10 +0200 Subject: [PATCH 14/15] add coverage --- .../evaluation/metric_calculators.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 7c2d67b..02d775f 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -104,16 +104,12 @@ def calculate_emd( Returns: float: Average EMD score """ - actual_values = np.concatenate(matched_actual[target].values) - pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) - - actual_expanded = np.repeat( - actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] - ) - - emd = wasserstein_distance(actual_expanded, pred_values) - - return emd + emd_list = [] + for actual, preds in zip(matched_actual[target], matched_pred[f"pred_{target}"]): + actual_val = np.asarray(actual) + preds_arr = np.asarray(preds) + emd_list.append(wasserstein_distance(preds_arr, actual_val)) + return np.mean(emd_list) def calculate_sd( @@ -241,7 +237,7 @@ def calculate_jeffreys( def calculate_coverage( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, alpha=0.1 ) -> float: """ Calculate Coverage (Histograms) for probabilistic predictions. 
@@ -252,11 +248,23 @@ def calculate_coverage( matched_actual (pd.DataFrame): DataFrame containing actual values matched_pred (pd.DataFrame): DataFrame containing predictions target (str): The target column name - + alpha (float): Significance level for the interval (default: 0.1) Returns: float: Coverage score """ - raise NotImplementedError("Coverage calculation not yet implemented") + y_true = matched_actual[target].values + y_pred_samples = matched_pred[f"pred_{target}"].values + + lower_q = alpha / 2 + upper_q = 1 - alpha / 2 + + covered = [] + for yt, pred_list in zip(y_true, y_pred_samples): + lower = np.quantile(pred_list, lower_q) + upper = np.quantile(pred_list, upper_q) + covered.append(lower <= yt <= upper) + + return np.mean(covered) def calculate_mean_interval_score( @@ -283,7 +291,7 @@ def calculate_mean_interval_score( upper = np.array( [np.quantile(row, q=1 - (alpha / 2)) for row in matched_pred[f"pred_{target}"]] ) - actuals = np.array( + actuals = np. array( [ row[0] if isinstance(row, (np.ndarray, list)) else row for row in matched_actual[target] From ed28df6be76326bc2ceb7e54abe93c9e5c1215d5 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 3 Jul 2025 13:00:18 +0200 Subject: [PATCH 15/15] add test --- tests/test_metric_calculators.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 03ec89f..1ee54f1 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -7,7 +7,7 @@ calculate_ap, calculate_emd, calculate_pearson, - calculate_variogram, + calculate_coverage, calculate_ignorance_score, calculate_mean_interval_score, POINT_METRIC_FUNCTIONS, @@ -79,6 +79,14 @@ def test_calculate_pearson(sample_data): assert -1 <= result <= 1 +def test_calculate_coverage(sample_uncertainty_data): + """Test Coverage calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_coverage(actual, pred, 'target') + assert isinstance(result, float) + assert 0 <= result <= 1 + + def test_calculate_ignorance_score(sample_uncertainty_data): """Test Ignorance Score calculation.""" actual, pred = sample_uncertainty_data @@ -123,17 +131,17 @@ def test_not_implemented_metrics(): from views_evaluation.evaluation.metric_calculators import ( calculate_brier, calculate_jeffreys, - calculate_coverage, calculate_sd, calculate_pEMDiv, + calculate_variogram, ) unimplemented_functions = [ calculate_brier, calculate_jeffreys, - calculate_coverage, calculate_sd, calculate_pEMDiv, + calculate_variogram, ] for func in unimplemented_functions:
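As a quick sanity check on the two interval-based metrics touched by the last patches, here is a minimal sketch of how calculate_coverage and calculate_mean_interval_score behave on hand-built matched frames. The data below is purely illustrative; the import path and function signatures are the ones introduced in this patch series, and alpha=0.1 corresponds to a central 90% prediction interval.

import numpy as np
import pandas as pd

from views_evaluation.evaluation.metric_calculators import (
    calculate_coverage,
    calculate_mean_interval_score,
)

# Four observations, each with the same 100-sample predictive distribution 1..100.
samples = list(np.arange(1.0, 101.0))
matched_pred = pd.DataFrame({"pred_target": [samples] * 4})
# Two actuals fall inside the central 90% interval of the samples, two outside.
matched_actual = pd.DataFrame({"target": [[50.0], [3.0], [99.0], [70.0]]})

coverage = calculate_coverage(matched_actual, matched_pred, "target", alpha=0.1)
mis = calculate_mean_interval_score(matched_actual, matched_pred, "target", alpha=0.1)

print(coverage)  # 0.5 -- two of the four actuals fall outside the 90% interval
print(mis)       # average interval width plus the penalties for the two misses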