diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index 2184465..084dba4 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -93,14 +93,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Actual data\n", "df_actual = pd.DataFrame(\n", " {\n", - " \"target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", + " \"lr_target\": [0, 1, 1, 2, 2, 3, 3, 4],\n", " \"covariate_1\": [3, 2, 4, 5, 2, 6, 8, 5],\n", " },\n", " index=index,\n", @@ -109,21 +109,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Point predictions\n", - "df1_point = pd.DataFrame({\"pred_target\": [1, 3, 5, 7]}, index=index_0)\n", - "df2_point = pd.DataFrame({\"pred_target\": [2, 4, 6, 8]}, index=index_1)\n", + "df1_point = pd.DataFrame({\"pred_lr_target\": [1, 3, 5, 7]}, index=index_0)\n", + "df2_point = pd.DataFrame({\"pred_lr_target\": [2, 4, 6, 8]}, index=index_1)\n", "dfs_point = [df1_point, df2_point]\n", "\n", "# Uncertainty\n", "df1_uncertainty = pd.DataFrame(\n", - " {\"pred_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", + " {\"pred_lr_target\": [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6]]}, index=index_0\n", ")\n", "df2_uncertainty = pd.DataFrame(\n", - " {\"pred_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", + " {\"pred_lr_target\": [[4, 6, 8], [5, 7, 9], [6, 8, 10], [7, 9, 11]]}, index=index_1\n", ")\n", "dfs_uncertainty = [df1_uncertainty, df2_uncertainty]" ] @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "metrics_list = ['RMSLE', 'CRPS'] # Add other metrics as needed\n", + "metrics_list = ['RMSLE', 'CRPS', 'MIS'] # Add other metrics as needed\n", "evaluation_manager = EvaluationManager(metrics_list)" ] }, @@ -162,17 +162,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metric MIS is not a default metric, skipping...\n", + "Metric MIS is not a default metric, skipping...\n", + "Metric MIS is not a default metric, skipping...\n" + ] + } + ], "source": [ "steps = [1, 2]\n", - "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='target', steps=steps)" + "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', steps=steps)" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -190,7 +200,7 @@ " ts01 0.420849 2.0)" ] }, - "execution_count": 36, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -208,21 +218,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Metric RMSLE is not a default metric, skipping...\n", + "Metric RMSLE is not a default metric, skipping...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ "Metric RMSLE is not a default metric, skipping...\n", "Metric RMSLE is not a default metric, skipping...\n" ] } ], "source": [ - "uncertainty_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='target', steps=steps)" + "uncertainty_evaluation_results = 
evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', steps=steps)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "( CRPS MIS\n", + " month100 0.555556 3.90\n", + " month101 2.333333 65.85\n", + " month102 4.111111 127.80,\n", + " CRPS MIS\n", + " step01 1.833333 45.85\n", + " step02 2.833333 85.85,\n", + " CRPS MIS\n", + " ts00 1.055556 23.9\n", + " ts01 3.611111 107.8)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "uncertainty_evaluation_results['month'][1], uncertainty_evaluation_results['step'][1], uncertainty_evaluation_results['time_series'][1]" ] }, { @@ -234,18 +279,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Get the evaluation type, i.e., uncertainty or point\n", - "is_uncertainty = EvaluationManager.get_evaluation_type(dfs_point)\n", - "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(df_actual, dfs_point, target='target', is_uncertainty=is_uncertainty)" + "actual = EvaluationManager.transform_data(\n", + " EvaluationManager.convert_to_arrays(df_actual), 'lr_target'\n", + " )\n", + "predictions = [\n", + " EvaluationManager.transform_data(\n", + " EvaluationManager.convert_to_arrays(pred), f\"pred_lr_target\"\n", + " )\n", + " for pred in dfs_point\n", + "]\n", + "is_uncertainty = EvaluationManager.get_evaluation_type(predictions)\n", + "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(actual, predictions, target='lr_target', is_uncertainty=is_uncertainty)" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -262,6 +316,13 @@ "source": [ "print(month_point_evaluation_results[1])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/pyproject.toml b/pyproject.toml index bbb847d..5b66657 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "views_evaluation" -version = "0.2.0" +version = "0.4.0" description = "" authors = [ "Xiaolong Sun ", diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 7498b75..46aec9c 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -55,19 +55,19 @@ def mock_actual(): ) df = pd.DataFrame( { - "target": [0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6], - "covariate_1": [3, 2, 4, 5, 2, 6, 8, 5, 3, 2, 9, 4], + "target": [0.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 6.0], + "covariate_1": [3.0, 2.0, 4.0, 5.0, 2.0, 6.0, 8.0, 5.0, 3.0, 2.0, 9.0, 4.0], }, index=index, ) - return df + return EvaluationManager.convert_to_arrays(df) @pytest.fixture def mock_point_predictions(mock_index): - df1 = pd.DataFrame({"pred_target": [1, 3, 5, 7, 9, 7]}, index=mock_index[0]) - df2 = pd.DataFrame({"pred_target": [2, 4, 6, 8, 10, 8]}, index=mock_index[1]) - return [df1, df2] + df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) + df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) + return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] @pytest.fixture @@ -75,12 +75,12 @@ def mock_uncertainty_predictions(mock_index): df1 = pd.DataFrame( { "pred_target": [ - [1, 2, 3], - [2, 3, 4], - [3, 4, 5], - [4, 5, 6], - [5, 6, 
7], - [6, 7, 8], + [1.0, 2.0, 3.0], + [2.0, 3.0, 4.0], + [3.0, 4.0, 5.0], + [4.0, 5.0, 6.0], + [5.0, 6.0, 7.0], + [6.0, 7.0, 8.0], ] }, index=mock_index[0], @@ -88,74 +88,59 @@ def mock_uncertainty_predictions(mock_index): df2 = pd.DataFrame( { "pred_target": [ - [4, 6, 8], - [5, 7, 9], - [6, 8, 10], - [7, 9, 11], - [8, 10, 12], - [9, 11, 13], + [4.0, 6.0, 8.0], + [5.0, 7.0, 9.0], + [6.0, 8.0, 10.0], + [7.0, 9.0, 11.0], + [8.0, 10.0, 12.0], + [9.0, 11.0, 13.0], ] }, index=mock_index[1], ) - return [df1, df2] + return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] def test_validate_dataframes_valid_type(mock_point_predictions): with pytest.raises(TypeError): EvaluationManager.validate_predictions( - mock_point_predictions[0], "target", is_uncertainty=False + mock_point_predictions[0], "target" ) def test_validate_dataframes_valid_columns(mock_point_predictions): with pytest.raises(ValueError): EvaluationManager.validate_predictions( - mock_point_predictions, "y", is_uncertainty=False + mock_point_predictions, "y" ) - -def test_validate_dataframes_valid_point(mock_uncertainty_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_uncertainty_predictions, "target", is_uncertainty=False - ) - - -def test_validate_dataframes_valid_uncertainty(mock_point_predictions): - with pytest.raises(ValueError): - EvaluationManager.validate_predictions( - mock_point_predictions, "devpar", is_uncertainty=True - ) - - def test_get_evaluation_type(): # Test case 1: All DataFrames for uncertainty evaluation predictions_uncertainty = [ - pd.DataFrame({'pred_target': [[1, 2], [3, 4]]}), - pd.DataFrame({'pred_target': [[5, 6], [7, 8]]}), + pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), + pd.DataFrame({'pred_target': [[5.0, 6.0], [7.0, 8.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_uncertainty) == True # Test case 2: All DataFrames for point evaluation predictions_point = [ - pd.DataFrame({'pred_target': [1.0, 2.0]}), - pd.DataFrame({'pred_target': [3.0, 4.0]}), + pd.DataFrame({'pred_target': [[1.0], [2.0]]}), + pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_point) == False # Test case 3: Mixed evaluation types predictions_mixed = [ - pd.DataFrame({'pred_target': [[1, 2], [3, 4]]}), - pd.DataFrame({'pred_target': [5.0, 6.0]}), + pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), + pd.DataFrame({'pred_target': [[5.0], [6.0]]}), ] with pytest.raises(ValueError): EvaluationManager.get_evaluation_type(predictions_mixed) # Test case 4: Single element lists predictions_single_element = [ - pd.DataFrame({'pred_target': [[1], [2]]}), - pd.DataFrame({'pred_target': [[3], [4]]}), + pd.DataFrame({'pred_target': [[1.0], [2.0]]}), + pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] assert EvaluationManager.get_evaluation_type(predictions_single_element) == False @@ -164,8 +149,8 @@ def test_match_actual_pred_point( mock_actual, mock_point_predictions, mock_uncertainty_predictions, mock_index ): df_matched = [ - pd.DataFrame({"target": [1, 2, 2, 3, 3, 4]}, index=mock_index[0]), - pd.DataFrame({"target": [2, 3, 3, 4, 4, 5]}, index=mock_index[1]), + pd.DataFrame({"target": [[1.0], [2.0], [2.0], [3.0], [3.0], [4.0]]}, index=mock_index[0]), + pd.DataFrame({"target": [[2.0], [3.0], [3.0], [4.0], [4.0], [5.0]]}, index=mock_index[1]), ] for i in range(len(df_matched)): df_matched_actual_point, df_matched_point = ( @@ -186,44 +171,44 @@ def 
test_match_actual_pred_point( def test_split_dfs_by_step(mock_point_predictions, mock_uncertainty_predictions): df_splitted_point = [ - pd.DataFrame( - {"pred_target": [1, 3, 2, 4]}, + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [5, 7, 6, 8]}, + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [9, 7, 10, 8]}, + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - ), + )), ] df_splitted_uncertainty = [ - pd.DataFrame( - {"pred_target": [[1, 2, 3], [2, 3, 4], [4, 6, 8], [5, 7, 9]]}, + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [[3, 4, 5], [4, 5, 6], [6, 8, 10], [7, 9, 11]]}, + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - ), - pd.DataFrame( - {"pred_target": [[5, 6, 7], [6, 7, 8], [8, 10, 12], [9, 11, 13]]}, + )), + EvaluationManager.convert_to_arrays(pd.DataFrame( + {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - ), + )), ] df_splitted_point_test = EvaluationManager._split_dfs_by_step( mock_point_predictions @@ -393,3 +378,54 @@ def test_month_wise_evaluation_uncertainty(mock_actual, mock_uncertainty_predict evaluation_dict.keys() ) assert np.allclose(df_evaluation, df_evaluation_test, atol=0.000001) + + +def test_calculate_ap_point_predictions(): + actual_data = {'target': [[40], [20], [35], [25]]} + pred_data = {'pred_target': [[35], [30], [20], [15]]} + threshold=30 + + matched_actual = pd.DataFrame(actual_data) + matched_pred = pd.DataFrame(pred_data) + + from views_evaluation.evaluation.metric_calculators import calculate_ap + ap_score = calculate_ap(matched_actual, matched_pred, 'target', threshold) + + actual_binary = [1, 0, 1, 0] # 40>30, 20<30, 35>30, 25<30 + pred_binary = [1, 1, 0, 0] # 35>30, 30=30, 20<30, 15<30 + from sklearn.metrics import average_precision_score + expected_ap = average_precision_score(actual_binary, pred_binary) + + assert abs(ap_score - expected_ap) < 0.01 + + +def test_calculate_ap_uncertainty_predictions(): + actual_data = {'target': [[40], [20], [35], [25]]} + pred_data = { + 'pred_target': [ + [35, 40, 45], + [30, 35, 40], + [20, 25, 30], + [15, 20, 25] + ] + } + threshold=30 + matched_actual = pd.DataFrame(actual_data) + matched_pred = pd.DataFrame(pred_data) + + from views_evaluation.evaluation.metric_calculators import calculate_ap + ap_score = calculate_ap(matched_actual, matched_pred, 'target', threshold) + + pred_values = [35, 40, 45, 30, 35, 40, 20, 25, 30, 15, 20, 25] + actual_values = [40, 40, 40, 20, 20, 20, 
35, 35, 35, 25, 25, 25] + actual_binary = [1 if x > threshold else 0 for x in actual_values] + pred_binary = [1 if x >= threshold else 0 for x in pred_values] + + from sklearn.metrics import average_precision_score + expected_ap = average_precision_score(actual_binary, pred_binary) + + assert abs(ap_score - expected_ap) < 0.01 + + + + diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py new file mode 100644 index 0000000..1ee54f1 --- /dev/null +++ b/tests/test_metric_calculators.py @@ -0,0 +1,149 @@ +import pytest +import pandas as pd +import numpy as np +from views_evaluation.evaluation.metric_calculators import ( + calculate_rmsle, + calculate_crps, + calculate_ap, + calculate_emd, + calculate_pearson, + calculate_coverage, + calculate_ignorance_score, + calculate_mean_interval_score, + POINT_METRIC_FUNCTIONS, + UNCERTAINTY_METRIC_FUNCTIONS, +) + + +@pytest.fixture +def sample_data(): + """Create sample data for testing.""" + actual = pd.DataFrame({ + 'target': [[1.0], [2.0], [3.0], [4.0]] + }) + pred = pd.DataFrame({ + 'pred_target': [[1.1], [1.9], [3.1], [3.9]] + }) + return actual, pred + + +@pytest.fixture +def sample_uncertainty_data(): + """Create sample uncertainty data for testing.""" + actual = pd.DataFrame({ + 'target': [[1.0], [2.0], [3.0], [4.0]] + }) + pred = pd.DataFrame({ + 'pred_target': [[1.0, 1.1, 1.2], [1.8, 2.0, 2.2], [2.9, 3.0, 3.1], [3.8, 4.0, 4.2]] + }) + return actual, pred + + +def test_calculate_rmsle(sample_data): + """Test RMSLE calculation.""" + actual, pred = sample_data + result = calculate_rmsle(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_crps(sample_uncertainty_data): + """Test CRPS calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_crps(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_ap(sample_data): + """Test Average Precision calculation.""" + actual, pred = sample_data + result = calculate_ap(actual, pred, 'target', threshold=2.5) + assert isinstance(result, float) + assert 0 <= result <= 1 + + +def test_calculate_emd(sample_data): + """Test Earth Mover's Distance calculation.""" + actual, pred = sample_data + result = calculate_emd(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_pearson(sample_data): + """Test Pearson correlation calculation.""" + actual, pred = sample_data + result = calculate_pearson(actual, pred, 'target') + assert isinstance(result, float) + assert -1 <= result <= 1 + + +def test_calculate_coverage(sample_uncertainty_data): + """Test Coverage calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_coverage(actual, pred, 'target') + assert isinstance(result, float) + assert 0 <= result <= 1 + + +def test_calculate_ignorance_score(sample_uncertainty_data): + """Test Ignorance Score calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_ignorance_score(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_calculate_mis(sample_uncertainty_data): + """Test Mean Interval Score calculation.""" + actual, pred = sample_uncertainty_data + result = calculate_mean_interval_score(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + + +def test_point_metric_functions(): + """Test that all point metric functions are available.""" + expected_metrics = [ + "RMSLE", "CRPS", "AP", "EMD", "SD", "pEMDiv", "Pearson", 
"Variogram" + ] + + for metric in expected_metrics: + assert metric in POINT_METRIC_FUNCTIONS + assert callable(POINT_METRIC_FUNCTIONS[metric]) + + +def test_uncertainty_metric_functions(): + """Test that all uncertainty metric functions are available.""" + expected_metrics = ["CRPS", "MIS", "Ignorance", "Brier", "Jeffreys", "Coverage"] + + for metric in expected_metrics: + assert metric in UNCERTAINTY_METRIC_FUNCTIONS + assert callable(UNCERTAINTY_METRIC_FUNCTIONS[metric]) + + +def test_not_implemented_metrics(): + """Test that unimplemented metrics raise NotImplementedError.""" + actual = pd.DataFrame({'target': [[1.0]]}) + pred = pd.DataFrame({'pred_target': [[1.0]]}) + + from views_evaluation.evaluation.metric_calculators import ( + calculate_brier, + calculate_jeffreys, + calculate_sd, + calculate_pEMDiv, + calculate_variogram, + ) + + unimplemented_functions = [ + calculate_brier, + calculate_jeffreys, + calculate_sd, + calculate_pEMDiv, + calculate_variogram, + ] + + for func in unimplemented_functions: + with pytest.raises(NotImplementedError): + func(actual, pred, 'target') \ No newline at end of file diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 0f5074a..9b0f859 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -1,17 +1,15 @@ from typing import List, Dict, Tuple, Optional +import logging import pandas as pd import numpy as np -import properscoring as ps -from sklearn.metrics import ( - root_mean_squared_error, - root_mean_squared_log_error, - average_precision_score, -) from views_evaluation.evaluation.metrics import ( PointEvaluationMetrics, UncertaintyEvaluationMetrics, ) -import logging +from views_evaluation.evaluation.metric_calculators import ( + POINT_METRIC_FUNCTIONS, + UNCERTAINTY_METRIC_FUNCTIONS, +) logger = logging.getLogger(__name__) @@ -31,107 +29,55 @@ def __init__(self, metrics_list: list): """ self.metrics_list = metrics_list - self.point_metric_functions = { - "RMSLE": self._calculate_rmsle, - "CRPS": self._calculate_crps, - "AP": self._calculate_ap, - "Brier": self._calculate_brier, - "Jeffreys": self._calculate_jeffreys, - "Coverage": self._calculate_coverage, - "EMD": self._calculate_emd, - "SD": self._calculate_sd, - "pEMDiv": self._calculate_pEMDiv, - "Pearson": self._calculate_pearson, - "Variogram": self._calculate_variogram, - } - self.uncertainty_metric_functions = { - "CRPS": self._calculate_crps, - } - - @staticmethod - def _calculate_rmsle( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - return ( - root_mean_squared_error(matched_actual, matched_pred) - if target.startswith("ln") - else root_mean_squared_log_error(matched_actual, matched_pred) - ) - - @staticmethod - def _calculate_crps( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - return np.mean( - [ - ps.crps_ensemble(actual, np.array(pred)) - for actual, pred in zip( - matched_actual[target], matched_pred[f"pred_{target}"] - ) - ] - ) + self.point_metric_functions = POINT_METRIC_FUNCTIONS + self.uncertainty_metric_functions = UNCERTAINTY_METRIC_FUNCTIONS @staticmethod - def _calculate_ap( - matched_actual: pd.DataFrame, - matched_pred: pd.DataFrame, - target: str, - threshold=0.01, - ) -> float: + def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: """ - Calculate Average Precision (AP) for binary predictions with a threshold. 
+ Transform log-transformed targets (ln/lx prefixes) back to the original scale; lr targets are returned unchanged. """ - matched_pred_binary = (matched_pred >= threshold).astype(int) - matched_actual_binary = (matched_actual > 0).astype(int) - return average_precision_score(matched_actual_binary, matched_pred_binary) - - @staticmethod - def _calculate_brier( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_jeffreys( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_coverage( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_emd( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass - - @staticmethod - def _calculate_sd( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass + if target.startswith("ln") or target.startswith("pred_ln"): + df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - 1) + elif target.startswith("lx") or target.startswith("pred_lx"): + df[[target]] = df[[target]].applymap(lambda x: np.exp(x) - np.exp(100)) + elif target.startswith("lr") or target.startswith("pred_lr"): + pass + else: + raise ValueError(f"Target {target} is not a valid target") + return df @staticmethod - def _calculate_pEMDiv( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass + def convert_to_arrays(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert columns in a DataFrame to numpy arrays. - @staticmethod - def _calculate_pearson( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass + Args: + df (pd.DataFrame): The input DataFrame with columns that may contain lists. - @staticmethod - def _calculate_variogram( - matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str - ) -> float: - pass + Returns: + pd.DataFrame: A new DataFrame with columns converted to numpy arrays. + """ + converted = df.copy() + for col in converted.columns: + converted[col] = converted[col].apply( + lambda x: np.array(x) if isinstance(x, list) else np.array([x]) + ) + return converted @staticmethod def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: @@ -144,47 +90,55 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: Returns: bool: True if all DataFrames are for uncertainty evaluation, - False if any DataFrame is suitable for point evaluation. + False if all DataFrames are for point evaluation. Raises: - ValueError: If there is a mix of results (some DataFrames for uncertainty and others for point evaluation). + ValueError: If there is a mix of single and multiple values in the lists, + or if uncertainty lists have different lengths. 
""" - all_uncertainty = True - all_point = True + is_uncertainty = False + is_point = False + uncertainty_length = None for df in predictions: - if all( - isinstance(value, list) and len(value) >= 2 - for value in df.values.flatten() - ): - all_point = False - else: - all_uncertainty = False - - if all_uncertainty and not all_point: - return True - elif all_point and not all_uncertainty: - return False - else: + for value in df.values.flatten(): + if not (isinstance(value, np.ndarray) or isinstance(value, list)): + raise ValueError( + "All values must be lists or numpy arrays. Convert the data." + ) + + if len(value) > 1: + is_uncertainty = True + # For uncertainty evaluation, check that all lists have the same length + if uncertainty_length is None: + uncertainty_length = len(value) + elif len(value) != uncertainty_length: + raise ValueError( + f"Inconsistent list lengths in uncertainty evaluation. " + f"Found lengths {uncertainty_length} and {len(value)}" + ) + elif len(value) == 1: + is_point = True + else: + raise ValueError("Empty lists are not allowed") + + if is_uncertainty and is_point: raise ValueError( - "Mix of evaluation types detected: some DataFrames are for uncertainty, others for point evaluation." - "Please ensure all DataFrames are consistent in their evaluation type" + "Mix of evaluation types detected: some rows contain single values, others contain multiple values. " + "Please ensure all rows are consistent in their evaluation type" ) + return is_uncertainty + @staticmethod - def validate_predictions( - predictions: List[pd.DataFrame], target: str, is_uncertainty: bool - ): + def validate_predictions(predictions: List[pd.DataFrame], target: str): """ Checks if the predictions are valid DataFrames. - Each DataFrame must have exactly one column named `pred_column_name`. - - If is_uncertainty is True, all elements in the column must be lists. - - If is_uncertainty is False, all elements in the column must be floats. Args: predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. target (str): The target column in the actual DataFrame. - is_uncertainty (bool): Flag to indicate if the evaluation is for uncertainty. """ pred_column_name = f"pred_{target}" if not isinstance(predictions, list): @@ -195,20 +149,10 @@ def validate_predictions( raise TypeError(f"Predictions[{i}] must be a DataFrame.") if df.empty: raise ValueError(f"Predictions[{i}] must not be empty.") - if df.columns.tolist() != [pred_column_name]: + if pred_column_name not in df.columns: raise ValueError( - f"Predictions[{i}] must contain only one column named '{pred_column_name}'." + f"Predictions[{i}] must contain the column named '{pred_column_name}'." ) - if ( - is_uncertainty - and not df.applymap(lambda x: isinstance(x, list)).all().all() - ): - raise ValueError("Each row in the predictions must be a list.") - if ( - not is_uncertainty - and not df.applymap(lambda x: isinstance(x, (int, float))).all().all() - ): - raise ValueError("Each row in the predictions must be a float.") @staticmethod def _match_actual_pred( @@ -271,6 +215,7 @@ def step_wise_evaluation( target: str, steps: List[int], is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions step-wise and calculates the specified metrics. 
@@ -298,7 +243,6 @@ def step_wise_evaluation( ) metric_functions = self.point_metric_functions - step_metrics = {} result_dfs = EvaluationManager._split_dfs_by_step(predictions) for metric in self.metrics_list: @@ -310,7 +254,7 @@ def step_wise_evaluation( ) evaluation_dict[f"step{str(step).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target), + metric_functions[metric](matched_actual, matched_pred, target, **kwargs), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -326,6 +270,7 @@ def time_series_wise_evaluation( predictions: List[pd.DataFrame], target: str, is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions time series-wise and calculates the specified metrics. @@ -362,7 +307,7 @@ def time_series_wise_evaluation( ) evaluation_dict[f"ts{str(i).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target), + metric_functions[metric](matched_actual, matched_pred, target, **kwargs), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -378,6 +323,7 @@ def month_wise_evaluation( predictions: List[pd.DataFrame], target: str, is_uncertainty: bool, + **kwargs, ): """ Evaluates the predictions month-wise and calculates the specified metrics. @@ -395,7 +341,7 @@ def month_wise_evaluation( month_range = pred_concat.index.get_level_values(0).unique() month_start = month_range.min() month_end = month_range.max() - + if is_uncertainty: evaluation_dict = ( UncertaintyEvaluationMetrics.make_month_wise_evaluation_dict( @@ -423,6 +369,7 @@ def month_wise_evaluation( matched_actual.loc[df.index, [target]], matched_pred.loc[df.index, [f"pred_{target}"]], target, + **kwargs, ) ) @@ -444,6 +391,7 @@ def evaluate( predictions: List[pd.DataFrame], target: str, steps: List[int], + **kwargs, ): """ Evaluates the predictions and calculates the specified point metrics. @@ -455,18 +403,33 @@ def evaluate( steps (List[int]): The steps to evaluate. 
""" + + EvaluationManager.validate_predictions(predictions, target) + actual = EvaluationManager.transform_data( + EvaluationManager.convert_to_arrays(actual), target + ) + predictions = [ + EvaluationManager.transform_data( + EvaluationManager.convert_to_arrays(pred), f"pred_{target}" + ) + for pred in predictions + ] is_uncertainty = EvaluationManager.get_evaluation_type(predictions) - EvaluationManager.validate_predictions(predictions, target, is_uncertainty) evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( - actual, predictions, target, is_uncertainty + actual, predictions, target, is_uncertainty, **kwargs ) evaluation_results["time_series"] = self.time_series_wise_evaluation( - actual, predictions, target, is_uncertainty + actual, predictions, target, is_uncertainty, **kwargs ) evaluation_results["step"] = self.step_wise_evaluation( - actual, predictions, target, steps, is_uncertainty, + actual, + predictions, + target, + steps, + is_uncertainty, + **kwargs, ) return evaluation_results diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py new file mode 100644 index 0000000..02d775f --- /dev/null +++ b/views_evaluation/evaluation/metric_calculators.py @@ -0,0 +1,380 @@ +from typing import List, Dict, Tuple, Optional +from collections import Counter +import pandas as pd +import numpy as np +import properscoring as ps +from sklearn.metrics import ( + root_mean_squared_log_error, + average_precision_score, +) +from scipy.stats import wasserstein_distance, pearsonr + + +def calculate_rmsle( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Root Mean Squared Logarithmic Error (RMSLE) for each prediction. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average RMSLE score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + return root_mean_squared_log_error(actual_expanded, pred_values) + + +def calculate_crps( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Continuous Ranked Probability Score (CRPS) for each prediction. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average CRPS score + """ + return np.mean( + [ + ps.crps_ensemble(actual[0], np.array(pred)) + for actual, pred in zip( + matched_actual[target], matched_pred[f"pred_{target}"] + ) + ] + ) + + +def calculate_ap( + matched_actual: pd.DataFrame, + matched_pred: pd.DataFrame, + target: str, + threshold=25, +) -> float: + """ + Calculate Average Precision (AP) for binary predictions with a threshold. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + threshold (float): Threshold to convert predictions to binary values + + Returns: + float: Average Precision score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + actual_binary = (actual_expanded > threshold).astype(int) + pred_binary = (pred_values >= threshold).astype(int) + + return average_precision_score(actual_binary, pred_binary) + + +def calculate_emd( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Earth Mover's Distance (EMD) between predicted and actual distributions. + EMD measures the minimum amount of work needed to transform one distribution into another. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average EMD score + """ + emd_list = [] + for actual, preds in zip(matched_actual[target], matched_pred[f"pred_{target}"]): + actual_val = np.asarray(actual) + preds_arr = np.asarray(preds) + emd_list.append(wasserstein_distance(preds_arr, actual_val)) + return np.mean(emd_list) + + +def calculate_sd( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Sinkhorn Distance between predicted and actual distributions. + + Sinkhorn Distance is a regularized version of the Earth Mover's Distance + that is computationally more efficient. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Sinkhorn Distance score + """ + raise NotImplementedError("Sinkhorn Distance calculation not yet implemented") + + +def calculate_pEMDiv( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate pseudo-Earth Mover Divergence between predicted and actual distributions. + + pEMDiv is a computationally efficient approximation of the Earth Mover's Distance. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: pEMDiv score + """ + raise NotImplementedError("pEMDiv calculation not yet implemented") + + +def calculate_pearson( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Pearson correlation coefficient between actual and predicted values. + This measures the linear correlation between predictions and actual values. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Pearson correlation coefficient + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + correlation, _ = pearsonr(actual_expanded, pred_values) + return correlation + + +def calculate_variogram( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + !! How to account for time and location? + Calculate the variogram score between actual and predicted values. + This measures the spatial/temporal correlation structure. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Variogram score + """ + raise NotImplementedError("Variogram calculation not yet implemented") + + +def calculate_brier( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Brier Score for probabilistic predictions. + + The Brier Score measures the accuracy of probabilistic predictions. + Lower values indicate better predictions. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Brier Score + """ + raise NotImplementedError("Brier Score calculation not yet implemented") + + +def calculate_jeffreys( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Jeffreys Divergence between predicted and actual distributions. + + Jeffreys Divergence is a symmetric measure of the difference between + two probability distributions. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Jeffreys Divergence score + """ + raise NotImplementedError("Jeffreys Divergence calculation not yet implemented") + + +def calculate_coverage( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, alpha=0.1 +) -> float: + """ + Calculate Coverage (Histograms) for probabilistic predictions. + + Coverage measures how well the predicted distribution covers the actual values. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + alpha (float): Significance level for the interval (default: 0.1) + Returns: + float: Coverage score + """ + y_true = matched_actual[target].values + y_pred_samples = matched_pred[f"pred_{target}"].values + + lower_q = alpha / 2 + upper_q = 1 - alpha / 2 + + covered = [] + for yt, pred_list in zip(y_true, y_pred_samples): + lower = np.quantile(pred_list, lower_q) + upper = np.quantile(pred_list, upper_q) + covered.append(lower <= yt <= upper) + + return np.mean(covered) + + +def calculate_mean_interval_score( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str, alpha=0.05 +): + """ + Calculate the Mean Interval Score (MIS) for probabilistic predictions. 
+ + The Mean Interval Score measures the average width of prediction intervals + and the coverage of the actual values. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + alpha (float): Significance level for the interval (default: 0.05) + + Returns: + float: Mean Interval Score + """ + lower = np.array( + [np.quantile(row, q=alpha / 2) for row in matched_pred[f"pred_{target}"]] + ) + upper = np.array( + [np.quantile(row, q=1 - (alpha / 2)) for row in matched_pred[f"pred_{target}"]] + ) + actuals = np.array( + [ + row[0] if isinstance(row, (np.ndarray, list)) else row + for row in matched_actual[target] + ] + ) + + interval_width = upper - lower + lower_coverage = (2 / alpha) * (lower - actuals) * (actuals < lower) + upper_coverage = (2 / alpha) * (actuals - upper) * (actuals > upper) + interval_score = interval_width + lower_coverage + upper_coverage + + return np.mean(interval_score) + + +def calculate_ignorance_score( + matched_actual: pd.DataFrame, + matched_pred: pd.DataFrame, + target: str, + bins=[0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5], + low_bin=0, + high_bin=10000, +): + """ + !! Note: unfinished. Bins need to be fixed because in the competition we evaluate over log values, but not here. + Adapted from https://github.com/prio-data/prediction_competition_2023/tree/main + Compute Binned Ignorance Score for predictions and observations. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + bins (list): List of bins for the histogram + low_bin (float): The lower bound of the bins + high_bin (float): The upper bound of the bins + + Returns: + float: Mean ignorance score. 
+ """ + + def digitize_minus_one(x, edges): + return np.digitize(x, edges, right=False) - 1 + + def _calculate_ignorance_score(predictions, observed, n): + c = Counter(predictions) + prob = c[observed] / n + return -np.log2(prob) + + scores = [] + for row_p, row_o in zip(matched_pred[f"pred_{target}"], matched_actual[target]): + preds = np.asarray(row_p) + truth = float(np.asarray(row_o).squeeze()) + + edges = np.histogram_bin_edges(preds, bins=bins, range=(low_bin, high_bin)) + + binned_preds = digitize_minus_one(preds, edges) + binned_obs = digitize_minus_one([truth], edges)[0] + + synthetic = np.arange(len(edges) - 1) + binned_preds = np.concatenate([binned_preds, synthetic]) + + n = len(binned_preds) + score = _calculate_ignorance_score(binned_preds, binned_obs, n) + scores.append(score) + + return np.mean(scores) + + +POINT_METRIC_FUNCTIONS = { + "RMSLE": calculate_rmsle, + "CRPS": calculate_crps, + "AP": calculate_ap, + "EMD": calculate_emd, + "SD": calculate_sd, + "pEMDiv": calculate_pEMDiv, + "Pearson": calculate_pearson, + "Variogram": calculate_variogram, +} + +UNCERTAINTY_METRIC_FUNCTIONS = { + "CRPS": calculate_crps, + "MIS": calculate_mean_interval_score, + "Ignorance": calculate_ignorance_score, + "Brier": calculate_brier, + "Jeffreys": calculate_jeffreys, + "Coverage": calculate_coverage, +} diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 70f158e..36b2cb5 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -121,9 +121,6 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): RMSLE: Optional[float] = None CRPS: Optional[float] = None AP: Optional[float] = None - Brier: Optional[float] = None - Jeffreys: Optional[float] = None - Coverage: Optional[float] = None EMD: Optional[float] = None SD: Optional[float] = None pEMDiv: Optional[float] = None @@ -140,4 +137,10 @@ class UncertaintyEvaluationMetrics(BaseEvaluationMetrics): CRPS (Optional[float]): Continuous Ranked Probability Score. """ - CRPS: Optional[float] = None \ No newline at end of file + CRPS: Optional[float] = None + MIS: Optional[float] = None + Ignorance: Optional[float] = None + Brier: Optional[float] = None + Jeffreys: Optional[float] = None + Coverage: Optional[float] = None + \ No newline at end of file