From 48fcb9cbb24ce6f7e87307c6542f63a21955e9f3 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 10 Jul 2025 14:19:21 +0200 Subject: [PATCH 01/31] fix np.array error --- .../evaluation/evaluation_manager.py | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 9b0f859..f7372fa 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -62,7 +62,7 @@ def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: return df @staticmethod - def convert_to_arrays(df: pd.DataFrame) -> pd.DataFrame: + def convert_to_array(df: pd.DataFrame, target: str) -> pd.DataFrame: """ Convert columns in a DataFrame to numpy arrays. @@ -73,12 +73,22 @@ def convert_to_arrays(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame: A new DataFrame with columns converted to numpy arrays. """ converted = df.copy() - for col in converted.columns: - converted[col] = converted[col].apply( - lambda x: np.array(x) if isinstance(x, list) else np.array([x]) - ) + converted[target] = converted[target].apply( + lambda x: x if isinstance(x, np.ndarray) else (np.array(x) if isinstance(x, list) else np.array([x])) + ) return converted + @staticmethod + def convert_to_scalar(df: pd.DataFrame, target: str) -> pd.DataFrame: + """ + Convert columns in a DataFrame to scalar values by taking the mean of the list. + """ + converted = df.copy() + converted[target] = converted[target].apply( + lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x + ) + return converted + @staticmethod def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: """ @@ -106,7 +116,7 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: raise ValueError( "All values must be lists or numpy arrays. Convert the data." 
) - + if len(value) > 1: is_uncertainty = True # For uncertainty evaluation, check that all lists have the same length @@ -254,7 +264,9 @@ def step_wise_evaluation( ) evaluation_dict[f"step{str(step).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target, **kwargs), + metric_functions[metric]( + matched_actual, matched_pred, target, **kwargs + ), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -307,7 +319,9 @@ def time_series_wise_evaluation( ) evaluation_dict[f"ts{str(i).zfill(2)}"].__setattr__( metric, - metric_functions[metric](matched_actual, matched_pred, target, **kwargs), + metric_functions[metric]( + matched_actual, matched_pred, target, **kwargs + ), ) else: logger.warning(f"Metric {metric} is not a default metric, skipping...") @@ -359,6 +373,8 @@ def month_wise_evaluation( actual, pred_concat, target ) # matched_concat = pd.merge(matched_actual, matched_pred, left_index=True, right_index=True) + print(matched_actual.head()) + print(matched_pred.head()) for metric in self.metrics_list: if metric in metric_functions: @@ -406,11 +422,11 @@ def evaluate( EvaluationManager.validate_predictions(predictions, target) actual = EvaluationManager.transform_data( - EvaluationManager.convert_to_arrays(actual), target + EvaluationManager.convert_to_array(actual, target), target ) predictions = [ EvaluationManager.transform_data( - EvaluationManager.convert_to_arrays(pred), f"pred_{target}" + EvaluationManager.convert_to_array(pred, f"pred_{target}"), f"pred_{target}" ) for pred in predictions ] From ad5814b94de13c72d25a3d5b22882f869861a101 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 10 Jul 2025 14:19:41 +0200 Subject: [PATCH 02/31] fix test --- tests/test_evaluation_manager.py | 30 +++++++++++++++--------------- tests/test_metric_calculators.py | 17 ++++++++++++----- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 46aec9c..30f703b 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -60,14 +60,14 @@ def mock_actual(): }, index=index, ) - return EvaluationManager.convert_to_arrays(df) + return EvaluationManager.convert_to_array(df, "target") @pytest.fixture def mock_point_predictions(mock_index): df1 = pd.DataFrame({"pred_target": [1.0, 3.0, 5.0, 7.0, 9.0, 7.0]}, index=mock_index[0]) df2 = pd.DataFrame({"pred_target": [2.0, 4.0, 6.0, 8.0, 10.0, 8.0]}, index=mock_index[1]) - return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] + return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] @pytest.fixture @@ -98,7 +98,7 @@ def mock_uncertainty_predictions(mock_index): }, index=mock_index[1], ) - return [EvaluationManager.convert_to_arrays(df1), EvaluationManager.convert_to_arrays(df2)] + return [EvaluationManager.convert_to_array(df1, "pred_target"), EvaluationManager.convert_to_array(df2, "pred_target")] def test_validate_dataframes_valid_type(mock_point_predictions): @@ -171,44 +171,44 @@ def test_match_actual_pred_point( def test_split_dfs_by_step(mock_point_predictions, mock_uncertainty_predictions): df_splitted_point = [ - EvaluationManager.convert_to_arrays(pd.DataFrame( + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[1.0], [3.0], [2.0], [4.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 
2)], names=["month", "country"] ), - )), - EvaluationManager.convert_to_arrays(pd.DataFrame( + ), "pred_target"), + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[5.0], [7.0], [6.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - )), - EvaluationManager.convert_to_arrays(pd.DataFrame( + ), "pred_target"), + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[9.0], [7.0], [10.0], [8.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - )), + ), "pred_target"), ] df_splitted_uncertainty = [ - EvaluationManager.convert_to_arrays(pd.DataFrame( + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [4.0, 6.0, 8.0], [5.0, 7.0, 9.0]]}, index=pd.MultiIndex.from_tuples( [(100, 1), (100, 2), (101, 1), (101, 2)], names=["month", "country"] ), - )), - EvaluationManager.convert_to_arrays(pd.DataFrame( + ), "pred_target"), + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[3.0, 4.0, 5.0], [4.0, 5.0, 6.0], [6.0, 8.0, 10.0], [7.0, 9.0, 11.0]]}, index=pd.MultiIndex.from_tuples( [(101, 1), (101, 2), (102, 1), (102, 2)], names=["month", "country"] ), - )), - EvaluationManager.convert_to_arrays(pd.DataFrame( + ), "pred_target"), + EvaluationManager.convert_to_array(pd.DataFrame( {"pred_target": [[5.0, 6.0, 7.0], [6.0, 7.0, 8.0], [8.0, 10.0, 12.0], [9.0, 11.0, 13.0]]}, index=pd.MultiIndex.from_tuples( [(102, 1), (102, 2), (103, 1), (103, 2)], names=["month", "country"] ), - )), + ), "pred_target"), ] df_splitted_point_test = EvaluationManager._split_dfs_by_step( mock_point_predictions diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 1ee54f1..91d1996 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -39,15 +39,22 @@ def sample_uncertainty_data(): return actual, pred -def test_calculate_rmsle(sample_data): +def test_calculate_rmsle_point(sample_data): """Test RMSLE calculation.""" actual, pred = sample_data result = calculate_rmsle(actual, pred, 'target') assert isinstance(result, float) assert result >= 0 +def test_calculate_crps_point(sample_data): + """Test CRPS calculation.""" + actual, pred = sample_data + result = calculate_crps(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + -def test_calculate_crps(sample_uncertainty_data): +def test_calculate_crps_uncertainty(sample_uncertainty_data): """Test CRPS calculation.""" actual, pred = sample_uncertainty_data result = calculate_crps(actual, pred, 'target') @@ -79,7 +86,7 @@ def test_calculate_pearson(sample_data): assert -1 <= result <= 1 -def test_calculate_coverage(sample_uncertainty_data): +def test_calculate_coverage_uncertainty(sample_uncertainty_data): """Test Coverage calculation.""" actual, pred = sample_uncertainty_data result = calculate_coverage(actual, pred, 'target') @@ -87,7 +94,7 @@ def test_calculate_coverage(sample_uncertainty_data): assert 0 <= result <= 1 -def test_calculate_ignorance_score(sample_uncertainty_data): +def test_calculate_ignorance_score_uncertainty(sample_uncertainty_data): """Test Ignorance Score calculation.""" actual, pred = sample_uncertainty_data result = calculate_ignorance_score(actual, pred, 'target') @@ -95,7 +102,7 @@ def test_calculate_ignorance_score(sample_uncertainty_data): assert result >= 0 -def test_calculate_mis(sample_uncertainty_data): +def 
test_calculate_mis_uncertainty(sample_uncertainty_data): """Test Mean Interval Score calculation.""" actual, pred = sample_uncertainty_data result = calculate_mean_interval_score(actual, pred, 'target') From 0527822f60b5a253434c993e9c746bcc9a86bff2 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 10 Jul 2025 14:26:14 +0200 Subject: [PATCH 03/31] remove print --- views_evaluation/evaluation/evaluation_manager.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index f7372fa..2810324 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -373,8 +373,6 @@ def month_wise_evaluation( actual, pred_concat, target ) # matched_concat = pd.merge(matched_actual, matched_pred, left_index=True, right_index=True) - print(matched_actual.head()) - print(matched_pred.head()) for metric in self.metrics_list: if metric in metric_functions: From 6bfd9b5eef9e3723d1da37f5d45615a4c1c48b61 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 11 Jul 2025 12:43:06 +0200 Subject: [PATCH 04/31] allow transform multiple targets --- .../evaluation/evaluation_manager.py | 54 +++++++++++-------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 2810324..df65daa 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -33,36 +33,39 @@ def __init__(self, metrics_list: list): self.uncertainty_metric_functions = UNCERTAINTY_METRIC_FUNCTIONS @staticmethod - def transform_data(df: pd.DataFrame, target: str) -> pd.DataFrame: + def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: """ Transform the data to normal distribution. """ - if target.startswith("ln") or target.startswith("pred_ln"): - df[[target]] = df[[target]].applymap( + if isinstance(target, str): + target = [target] + for t in target: + if t.startswith("ln") or t.startswith("pred_ln"): + df[[t]] = df[[t]].applymap( lambda x: ( np.exp(x) - 1 if isinstance(x, (list, np.ndarray)) else np.exp(x) - 1 ) ) - elif target.startswith("lx") or target.startswith("pred_lx"): - df[[target]] = df[[target]].applymap( + elif t.startswith("lx") or t.startswith("pred_lx"): + df[[t]] = df[[t]].applymap( lambda x: ( np.exp(x) - np.exp(100) if isinstance(x, (list, np.ndarray)) else np.exp(x) - np.exp(100) ) ) - elif target.startswith("lr") or target.startswith("pred_lr"): - df[[target]] = df[[target]].applymap( - lambda x: x if isinstance(x, (list, np.ndarray)) else x - ) - else: - raise ValueError(f"Target {target} is not a valid target") + elif t.startswith("lr") or t.startswith("pred_lr"): + df[[t]] = df[[t]].applymap( + lambda x: x if isinstance(x, (list, np.ndarray)) else x + ) + else: + raise ValueError(f"Target {t} is not a valid target") return df @staticmethod - def convert_to_array(df: pd.DataFrame, target: str) -> pd.DataFrame: + def convert_to_array(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: """ Convert columns in a DataFrame to numpy arrays. @@ -73,20 +76,27 @@ def convert_to_array(df: pd.DataFrame, target: str) -> pd.DataFrame: pd.DataFrame: A new DataFrame with columns converted to numpy arrays. 
""" converted = df.copy() - converted[target] = converted[target].apply( - lambda x: x if isinstance(x, np.ndarray) else (np.array(x) if isinstance(x, list) else np.array([x])) - ) + if isinstance(target, str): + target = [target] + + for t in target: + converted[t] = converted[t].apply( + lambda x: x if isinstance(x, np.ndarray) else (np.array(x) if isinstance(x, list) else np.array([x])) + ) return converted @staticmethod - def convert_to_scalar(df: pd.DataFrame, target: str) -> pd.DataFrame: + def convert_to_scalar(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: """ Convert columns in a DataFrame to scalar values by taking the mean of the list. """ converted = df.copy() - converted[target] = converted[target].apply( - lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x - ) + if isinstance(target, str): + target = [target] + for t in target: + converted[t] = converted[t].apply( + lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x + ) return converted @staticmethod @@ -181,7 +191,7 @@ def _match_actual_pred( - matched_pred: pd.DataFrame aligned with actual. """ actual_target = actual[[target]] - aligned_actual, aligned_pred = actual_target.align(pred, join="inner") + aligned_actual, aligned_pred = actual_target.align(pred, join="inner") # type: ignore matched_actual = aligned_actual.reindex(index=aligned_pred.index) matched_actual[[target]] = actual_target @@ -353,8 +363,8 @@ def month_wise_evaluation( """ pred_concat = pd.concat(predictions) month_range = pred_concat.index.get_level_values(0).unique() - month_start = month_range.min() - month_end = month_range.max() + month_start = int(month_range.min()) # type: ignore + month_end = int(month_range.max()) # type: ignore if is_uncertainty: evaluation_dict = ( From 419c20e498b143d20457198d5ef686e76662f4a7 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 15 Jul 2025 16:59:08 +0200 Subject: [PATCH 05/31] remove comments --- views_evaluation/evaluation/evaluation_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index df65daa..4665e00 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -191,7 +191,7 @@ def _match_actual_pred( - matched_pred: pd.DataFrame aligned with actual. 
""" actual_target = actual[[target]] - aligned_actual, aligned_pred = actual_target.align(pred, join="inner") # type: ignore + aligned_actual, aligned_pred = actual_target.align(pred, join="inner") matched_actual = aligned_actual.reindex(index=aligned_pred.index) matched_actual[[target]] = actual_target From cf76d480b988afdaa7419ba6d852398766e127d0 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 15 Jul 2025 16:59:21 +0200 Subject: [PATCH 06/31] add mse --- tests/test_metric_calculators.py | 8 +++++ .../evaluation/metric_calculators.py | 36 ++++++++++++++++--- views_evaluation/evaluation/metrics.py | 1 + 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/tests/test_metric_calculators.py b/tests/test_metric_calculators.py index 91d1996..31872a2 100644 --- a/tests/test_metric_calculators.py +++ b/tests/test_metric_calculators.py @@ -2,6 +2,7 @@ import pandas as pd import numpy as np from views_evaluation.evaluation.metric_calculators import ( + calculate_mse, calculate_rmsle, calculate_crps, calculate_ap, @@ -39,6 +40,13 @@ def sample_uncertainty_data(): return actual, pred +def test_calculate_mse(sample_data): + """Test MSE calculation.""" + actual, pred = sample_data + result = calculate_mse(actual, pred, 'target') + assert isinstance(result, float) + assert result >= 0 + def test_calculate_rmsle_point(sample_data): """Test RMSLE calculation.""" actual, pred = sample_data diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 02d775f..eff3f28 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -1,15 +1,39 @@ -from typing import List, Dict, Tuple, Optional from collections import Counter import pandas as pd import numpy as np import properscoring as ps from sklearn.metrics import ( root_mean_squared_log_error, + mean_squared_error, average_precision_score, ) from scipy.stats import wasserstein_distance, pearsonr +def calculate_mse( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Mean Square Error for each prediction. 
+ + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average MSE score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + + return mean_squared_error(actual_expanded, pred_values) + + def calculate_rmsle( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str ) -> float: @@ -334,9 +358,11 @@ def calculate_ignorance_score( def digitize_minus_one(x, edges): return np.digitize(x, edges, right=False) - 1 - def _calculate_ignorance_score(predictions, observed, n): - c = Counter(predictions) - prob = c[observed] / n + def _calculate_ignorance_score(predictions, observed, n, all_bins): + # Initialize each bin with 1 (Laplace smoothing) + c = Counter({bin_idx: 1 for bin_idx in all_bins}) + c.update(predictions) + prob = c[observed] / sum(c.values()) return -np.log2(prob) scores = [] @@ -353,7 +379,7 @@ def _calculate_ignorance_score(predictions, observed, n): binned_preds = np.concatenate([binned_preds, synthetic]) n = len(binned_preds) - score = _calculate_ignorance_score(binned_preds, binned_obs, n) + score = _calculate_ignorance_score(binned_preds, binned_obs, n, synthetic) scores.append(score) return np.mean(scores) diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 36b2cb5..49f1ec0 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -119,6 +119,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): """ RMSLE: Optional[float] = None + MSE: Optional[float] = None CRPS: Optional[float] = None AP: Optional[float] = None EMD: Optional[float] = None From 762be86e67c9915ae01d57777d02e5b27d17d49b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Tue, 15 Jul 2025 16:59:33 +0200 Subject: [PATCH 07/31] update functions --- examples/quickstart.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index 084dba4..1b7323e 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -279,17 +279,17 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Get the evaluation type, i.e., uncertainty or point\n", "actual = EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_arrays(df_actual), 'lr_target'\n", + " EvaluationManager.convert_to_array(df_actual, \"lr_target\"), 'lr_target'\n", " )\n", "predictions = [\n", " EvaluationManager.transform_data(\n", - " EvaluationManager.convert_to_arrays(pred), f\"pred_lr_target\"\n", + " EvaluationManager.convert_to_array(pred, f\"pred_lr_target\"), f\"pred_lr_target\"\n", " )\n", " for pred in dfs_point\n", "]\n", From bbb4addb644590d49babcc879eae600c950ac24d Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Wed, 16 Jul 2025 10:35:34 +0200 Subject: [PATCH 08/31] remove comment --- views_evaluation/evaluation/evaluation_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 4665e00..6d0131f 100644 --- 
a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -363,8 +363,8 @@ def month_wise_evaluation( """ pred_concat = pd.concat(predictions) month_range = pred_concat.index.get_level_values(0).unique() - month_start = int(month_range.min()) # type: ignore - month_end = int(month_range.max()) # type: ignore + month_start = int(month_range.min()) + month_end = int(month_range.max()) if is_uncertainty: evaluation_dict = ( @@ -408,7 +408,7 @@ def month_wise_evaluation( evaluation_dict, PointEvaluationMetrics.evaluation_dict_to_dataframe(evaluation_dict), ) - + def evaluate( self, actual: pd.DataFrame, From 026a5c7ef013987ea575622f9037a4098c29efb2 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Wed, 16 Jul 2025 10:36:06 +0200 Subject: [PATCH 09/31] more metrics to be done --- views_evaluation/evaluation/metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 49f1ec0..001873c 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -118,8 +118,8 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): Variogram (Optional[float]): Variogram. """ - RMSLE: Optional[float] = None MSE: Optional[float] = None + RMSLE: Optional[float] = None CRPS: Optional[float] = None AP: Optional[float] = None EMD: Optional[float] = None @@ -141,7 +141,9 @@ class UncertaintyEvaluationMetrics(BaseEvaluationMetrics): CRPS: Optional[float] = None MIS: Optional[float] = None Ignorance: Optional[float] = None + Coverage: Optional[float] = None + pEMDiv: Optional[float] = None Brier: Optional[float] = None Jeffreys: Optional[float] = None - Coverage: Optional[float] = None + \ No newline at end of file From e82889daa2f7cf9154823a36bae0c34d69c1f287 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 17 Jul 2025 10:17:18 +0200 Subject: [PATCH 10/31] add adr 004 evaluation input schema --- .../ADRs/004_evaluation_input_schema.md | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 documentation/ADRs/004_evaluation_input_schema.md diff --git a/documentation/ADRs/004_evaluation_input_schema.md b/documentation/ADRs/004_evaluation_input_schema.md new file mode 100644 index 0000000..5c9e0f8 --- /dev/null +++ b/documentation/ADRs/004_evaluation_input_schema.md @@ -0,0 +1,56 @@ +# Evaluation Input Schema + +| ADR Info | Details | +|---------------------|-------------------| +| Subject | Evaluation Input Schema | +| ADR Number | 004 | +| Status | Proposed | +| Author | Xiaolong | +| Date | 16.06.2025 | + +## Context +In our modeling pipeline, a consistent and flexible evaluation framework is essential to compare model performance across different runs, time steps, and geographic units. + + +## Decision + +We adopt the `views-evaluation` package to standardize the evaluation of model predictions. The core component of this package is the `EvaluationManager` class, which is initialized with a **list of evaluation metrics**. + +The `evaluate` method accepts the following inputs: + +1. A DataFrame of actual values, +2. A list of prediction DataFrames, +3. The target variable name, +4. The forecast steps to evaluate. + +Both the actual and prediction DataFrames must use a multi-index of `(month_id, country_id/priogrid_gid)` and contain a column for the target variable. 
In the actuals DataFrame, this column must be named exactly as the target. In each prediction DataFrame, the predicted column must be named `f'pred_{target}'`. + +The number of prediction DataFrames is flexible. However, the standard practice is to evaluate **12 sequences**. When more than two predictions are provided, the evaluation will behave similarly to a **rolling origin evaluation** with a **fixed holdout size of 1**. For further reference, see the [ADR 002](https://github.com/views-platform/views-evaluation/blob/main/documentation/ADRs/002_evaluation_strategy.md) on rolling origin methodology. + +The class automatically determines the evaluation type (point or uncertainty) and aligns `month_id` values between the actuals and each prediction. By default, the evaluation is performed **month-wise**, **step-wise**, **time-series-wise** (more information in [ADR 003](https://github.com/views-platform/views-evaluation/blob/main/documentation/ADRs/003_metric_calculation.md)) + + +## Consequences + +**Positive Effects:** + +- Standardized evaluation across all models. + +**Negative Effects:** + +- Requires strict adherence to index and column naming conventions. + +## Rationale + +Using the `views-evaluation` package enforces consistency and reproducibility in model evaluation. The built-in support for rolling origin evaluation reflects a realistic scenario for time-series forecasting where the model is updated or evaluated sequentially. Its flexible design aligns with our workflow, where multiple prediction sets across multiple horizons are common. + + +### Considerations + +- Other evaluation types, such as correlation matrices, may be requested in the future. These might not be compatible with the current architecture or evaluation strategy of the `views-evaluation` package. + +- Consider accepting `config` as input instead of separate `target` and `steps` arguments. This would improve consistency because these parameters are already defined in config. It would allow for more flexible or partial evaluation workflows (e.g., when only one or two evaluation strategies are desired). 
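To make the schema above concrete, a minimal sketch of valid inputs is shown below. This is an illustration, not code from the repository: the month and country identifiers are invented, and `lr_target` is used only because the package's transform logic recognises the `lr`/`ln`/`lx` prefixes. Point predictions carry one value per cell, while uncertainty predictions carry equal-length sample arrays per cell, which is how the evaluation type is detected.

```python
import pandas as pd

# Shared (month_id, country_id) multi-index for toy actuals and predictions.
index = pd.MultiIndex.from_product(
    [[529, 530], [57, 79]], names=["month_id", "country_id"]
)

# Actuals: a column named exactly like the target.
actual = pd.DataFrame({"lr_target": [0.0, 1.2, 0.3, 2.1]}, index=index)

# Point prediction: one value per cell -> treated as point evaluation.
point_pred = pd.DataFrame({"pred_lr_target": [0.1, 1.0, 0.2, 2.3]}, index=index)

# Uncertainty prediction: equal-length sample vectors per cell -> uncertainty evaluation.
uncertainty_pred = pd.DataFrame(
    {"pred_lr_target": [[0.0, 0.1, 0.3], [0.9, 1.1, 1.4], [0.1, 0.2, 0.4], [1.8, 2.2, 2.6]]},
    index=index,
)
```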
+ +## Feedback and Suggestions +Any feedback or suggestion is welcomed + From 43f10c85124015930b2fc36ef4e37cf97dfc3394 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:02:14 +0200 Subject: [PATCH 11/31] add mse to default metric list --- tests/test_evaluation_manager.py | 8 +++---- .../evaluation/evaluation_manager.py | 22 +++++++++---------- .../evaluation/metric_calculators.py | 2 ++ 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/test_evaluation_manager.py b/tests/test_evaluation_manager.py index 30f703b..3c3f807 100644 --- a/tests/test_evaluation_manager.py +++ b/tests/test_evaluation_manager.py @@ -120,14 +120,14 @@ def test_get_evaluation_type(): pd.DataFrame({'pred_target': [[1.0, 2.0], [3.0, 4.0]]}), pd.DataFrame({'pred_target': [[5.0, 6.0], [7.0, 8.0]]}), ] - assert EvaluationManager.get_evaluation_type(predictions_uncertainty) == True + assert EvaluationManager.get_evaluation_type(predictions_uncertainty, "pred_target") == True # Test case 2: All DataFrames for point evaluation predictions_point = [ pd.DataFrame({'pred_target': [[1.0], [2.0]]}), pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] - assert EvaluationManager.get_evaluation_type(predictions_point) == False + assert EvaluationManager.get_evaluation_type(predictions_point, "pred_target") == False # Test case 3: Mixed evaluation types predictions_mixed = [ @@ -135,14 +135,14 @@ def test_get_evaluation_type(): pd.DataFrame({'pred_target': [[5.0], [6.0]]}), ] with pytest.raises(ValueError): - EvaluationManager.get_evaluation_type(predictions_mixed) + EvaluationManager.get_evaluation_type(predictions_mixed, "pred_target") # Test case 4: Single element lists predictions_single_element = [ pd.DataFrame({'pred_target': [[1.0], [2.0]]}), pd.DataFrame({'pred_target': [[3.0], [4.0]]}), ] - assert EvaluationManager.get_evaluation_type(predictions_single_element) == False + assert EvaluationManager.get_evaluation_type(predictions_single_element, "pred_target") == False def test_match_actual_pred_point( diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 6d0131f..02cf715 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -100,7 +100,7 @@ def convert_to_scalar(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame return converted @staticmethod - def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: + def get_evaluation_type(predictions: List[pd.DataFrame], target: str) -> bool: """ Validates the values in each DataFrame in the list. The return value indicates whether all DataFrames are for uncertainty evaluation. @@ -120,8 +120,9 @@ def get_evaluation_type(predictions: List[pd.DataFrame]) -> bool: is_point = False uncertainty_length = None + for df in predictions: - for value in df.values.flatten(): + for value in df[target].values.flatten(): if not (isinstance(value, np.ndarray) or isinstance(value, list)): raise ValueError( "All values must be lists or numpy arrays. Convert the data." @@ -414,7 +415,7 @@ def evaluate( actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str, - steps: List[int], + config: dict, **kwargs, ): """ @@ -424,10 +425,8 @@ def evaluate( actual (pd.DataFrame): The actual values. predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. target (str): The target column in the actual DataFrame. 
- steps (List[int]): The steps to evaluate. - + config (dict): The configuration dictionary. """ - EvaluationManager.validate_predictions(predictions, target) actual = EvaluationManager.transform_data( EvaluationManager.convert_to_array(actual, target), target @@ -438,22 +437,23 @@ def evaluate( ) for pred in predictions ] - is_uncertainty = EvaluationManager.get_evaluation_type(predictions) + self.is_uncertainty = EvaluationManager.get_evaluation_type(predictions, f"pred_{target}") evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( - actual, predictions, target, is_uncertainty, **kwargs + actual, predictions, target, self.is_uncertainty, **kwargs ) evaluation_results["time_series"] = self.time_series_wise_evaluation( - actual, predictions, target, is_uncertainty, **kwargs + actual, predictions, target, self.is_uncertainty, **kwargs ) evaluation_results["step"] = self.step_wise_evaluation( actual, predictions, target, - steps, - is_uncertainty, + config["steps"], + self.is_uncertainty, **kwargs, ) return evaluation_results + diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index eff3f28..5b5af16 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -386,6 +386,7 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): POINT_METRIC_FUNCTIONS = { + "MSE": calculate_mse, "RMSLE": calculate_rmsle, "CRPS": calculate_crps, "AP": calculate_ap, @@ -403,4 +404,5 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): "Brier": calculate_brier, "Jeffreys": calculate_jeffreys, "Coverage": calculate_coverage, + "pEMDiv": calculate_pEMDiv, } From 01962a833264ab351c97dac2ee2508b4de5e4cbf Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 18 Jul 2025 09:09:30 +0200 Subject: [PATCH 12/31] no config --- .../evaluation/evaluation_manager.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 02cf715..4e062af 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -35,7 +35,7 @@ def __init__(self, metrics_list: list): @staticmethod def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: """ - Transform the data to normal distribution. + Transform the data. """ if isinstance(target, str): target = [target] @@ -229,6 +229,21 @@ def _split_dfs_by_step(dfs: list) -> list: return result_dfs + def _process_data(self, actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str): + """ + Process the data for evaluation. + """ + actual = EvaluationManager.transform_data( + EvaluationManager.convert_to_array(actual, target), target + ) + predictions = [ + EvaluationManager.transform_data( + EvaluationManager.convert_to_array(pred, f"pred_{target}"), f"pred_{target}" + ) + for pred in predictions + ] + return actual, predictions + def step_wise_evaluation( self, actual: pd.DataFrame, @@ -415,7 +430,7 @@ def evaluate( actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str, - config: dict, + steps: List[int], **kwargs, ): """ @@ -425,18 +440,10 @@ def evaluate( actual (pd.DataFrame): The actual values. predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. 
target (str): The target column in the actual DataFrame. - config (dict): The configuration dictionary. + steps (List[int]): The steps to evaluate. """ EvaluationManager.validate_predictions(predictions, target) - actual = EvaluationManager.transform_data( - EvaluationManager.convert_to_array(actual, target), target - ) - predictions = [ - EvaluationManager.transform_data( - EvaluationManager.convert_to_array(pred, f"pred_{target}"), f"pred_{target}" - ) - for pred in predictions - ] + actual, predictions = self._process_data(actual, predictions, target) self.is_uncertainty = EvaluationManager.get_evaluation_type(predictions, f"pred_{target}") evaluation_results = {} @@ -450,7 +457,7 @@ def evaluate( actual, predictions, target, - config["steps"], + steps, self.is_uncertainty, **kwargs, ) From 541c3380679b244f1f9f102897a6e52f49bdb2b6 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 18 Jul 2025 09:53:07 +0200 Subject: [PATCH 13/31] add adr 004 --- documentation/ADRs/004_evaluation_input_schema.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/documentation/ADRs/004_evaluation_input_schema.md b/documentation/ADRs/004_evaluation_input_schema.md index 5c9e0f8..9115096 100644 --- a/documentation/ADRs/004_evaluation_input_schema.md +++ b/documentation/ADRs/004_evaluation_input_schema.md @@ -9,7 +9,7 @@ | Date | 16.06.2025 | ## Context -In our modeling pipeline, a consistent and flexible evaluation framework is essential to compare model performance across different runs, time steps, and geographic units. +In our modeling pipeline, a consistent and flexible evaluation framework is essential to compare model performance. ## Decision @@ -17,11 +17,10 @@ In our modeling pipeline, a consistent and flexible evaluation framework is esse We adopt the `views-evaluation` package to standardize the evaluation of model predictions. The core component of this package is the `EvaluationManager` class, which is initialized with a **list of evaluation metrics**. The `evaluate` method accepts the following inputs: - 1. A DataFrame of actual values, 2. A list of prediction DataFrames, 3. The target variable name, -4. The forecast steps to evaluate. +4. The steps. Both the actual and prediction DataFrames must use a multi-index of `(month_id, country_id/priogrid_gid)` and contain a column for the target variable. In the actuals DataFrame, this column must be named exactly as the target. In each prediction DataFrame, the predicted column must be named `f'pred_{target}'`. 
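For orientation, a minimal end-to-end call following the input schema above might look like the sketch below. It is an assumption-laden illustration rather than repository code: the metric list, config contents, and toy values are invented, and the `evaluate` signature follows the config-based form this ADR revision describes (elsewhere in the series the same method is defined with a plain `steps` list instead).

```python
import pandas as pd
from views_evaluation.evaluation.evaluation_manager import EvaluationManager


def make_frame(months, column, values):
    """Build a toy (month_id, country_id) frame with a single column."""
    index = pd.MultiIndex.from_product(
        [months, [57, 79]], names=["month_id", "country_id"]
    )
    return pd.DataFrame({column: values}, index=index)


# Actuals cover every month that any prediction sequence touches.
actual = make_frame([529, 530, 531], "lr_target", [0.0, 1.2, 0.3, 2.1, 0.5, 0.9])

# Two rolling-origin point-forecast sequences, each shifted by one month.
predictions = [
    make_frame([529, 530], "pred_lr_target", [0.1, 1.0, 0.2, 2.3]),
    make_frame([530, 531], "pred_lr_target", [0.4, 1.9, 0.6, 1.1]),
]

config = {"steps": [1, 2]}  # assumed minimal config; real model configs carry more keys
manager = EvaluationManager(metrics_list=["MSE", "RMSLE"])
results = manager.evaluate(actual, predictions, "lr_target", config)
# results holds the month-wise, time-series-wise and step-wise metric tables.
```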
From aecb943f8a798d3b0952aabde67a27e6f9e24f51 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 18 Jul 2025 09:53:17 +0200 Subject: [PATCH 14/31] add adr 005 --- .../ADRs/005_evaluation_output_schema.md | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 documentation/ADRs/005_evaluation_output_schema.md diff --git a/documentation/ADRs/005_evaluation_output_schema.md b/documentation/ADRs/005_evaluation_output_schema.md new file mode 100644 index 0000000..e1dffbb --- /dev/null +++ b/documentation/ADRs/005_evaluation_output_schema.md @@ -0,0 +1,110 @@ +# Evaluation Output Schema + +| ADR Info | Details | +|---------------------|-------------------| +| Subject | Evaluation Output Schema | +| ADR Number | 005 | +| Status | Proposed | +| Author | Xiaolong | +| Date | 16.06.2025 | + +## Context +As part of our model evaluation workflow, we generate comprehensive reports summarizing model performance across a range of metrics and time periods. These reports are intended primarily for comparing ensemble models against their constituent models and baselines. + +We use the `views-evaluation` package for computing evaluation metrics, while the report generation logic resides in the `views-pipeline-core` package. This separation of concerns avoids circular dependency, but requires a well-defined schema for passing data between the two components. + +## Decision + +We define a standard output schema for model evaluation reports using two formats: + +1. **JSON file** – machine-readable output storing structured evaluation data. +2. **HTML file** – human-readable report with charts, tables, and summaries. + +These files are stored in the `reports/` directory for each model within `views-models`. + +To prevent a circular dependency between `views-evaluation` and `views-pipeline-core`, the `views-evaluation` package **does not** generate reports directly. Instead, it outputs intermediate results. These are then consumed by the reporting module in `views-pipeline-core` to generate final report files. + +### Schema Overview (JSON) +Each report follows a standardized JSON structure that includes: +```` +{ + "Target": "target", + "Forecast Type": "point", + "Level of Analysis": "cm", + "Data Partition": "validation", + "Training Period": [121,492], + "Testing Period": [493,540], + "Forecast Horizon": 36, + "Number of Rolling Origins": 12, + "Evaluation Results": [ + { + "Type": "Ensemble", + "Model Name": "ensemble_model", + "MSE": mse_e, + "MSLE": msle_e, + "mean prediction": mp_e + }, + { + "Type": "Constituent", + "Model Name": "constitute_a", + "MSE": mse_a, + "MSLE": msle_a, + "mean prediction": mp_a + }, + { + "Type": "Constituent", + "Model Name": "constitute_b", + "MSE": mse_b, + "MSLE": msle_b, + "mean prediction": mp_b + } + ... + ] +} +```` +The output file is name with the following name convention: +``` +eval_validation_{conflict_type}_{timestamp}.json +``` + + +## Consequences + +**Positive Effects:** + +- Avoids circular dependency between `views-evaluation` and `views-pipeline-core`. + +- Provides consistent input for both HTML rendering and potential downstream systems (e.g., dashboards, APIs) + +- Facilitates modularity and separation of concerns. 
+ + +**Negative Effects:** + +- Requires tight coordination between both packages to maintain schema compatibility + +- Some redundancy between evaluation and report generation may occur + +- May require schema migrations as new report sections are added + + + +## Rationale + +Generating reports within `views-pipeline-core` ensures full control over rendering, formatting, and contextual customization (e.g., comparing different model families). By letting `views-evaluation` focus strictly on metrics and alignment logic, we maintain cleaner package boundaries. + + +### Considerations + +- This schema may evolve as we introduce new types of evaluation (e.g., correlation matrix). + + +- Reports are currently only generated for **ensemble models**, as comparison against constituent models is the primary use case. + +-Future extensibility (e.g., visual version diffs) should be considered when evolving the format. + + + +## Feedback and Suggestions +Any feedback or suggestion is welcomed + From c7961843e4609839ddae32e5879d185fc43e5dad Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:10:48 +0200 Subject: [PATCH 15/31] update ADR 005 --- documentation/ADRs/005_evaluation_output_schema.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/documentation/ADRs/005_evaluation_output_schema.md b/documentation/ADRs/005_evaluation_output_schema.md index e1dffbb..b74ee40 100644 --- a/documentation/ADRs/005_evaluation_output_schema.md +++ b/documentation/ADRs/005_evaluation_output_schema.md @@ -62,12 +62,15 @@ Each report follows a standardized JSON structure that includes: ] } ```` +Here, the + The output file is name with the following name convention: ``` eval_validation_{conflict_type}_{timestamp}.json ``` + ## Consequences **Positive Effects:** From 79ac74f1ac7e3196826b334c6b3273e32e44c7fc Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:10:58 +0200 Subject: [PATCH 16/31] update ADR 002 --- documentation/ADRs/002_evaluation_strategy.md | 92 +++++++++++-------- 1 file changed, 55 insertions(+), 37 deletions(-) diff --git a/documentation/ADRs/002_evaluation_strategy.md b/documentation/ADRs/002_evaluation_strategy.md index 0990d84..e693ded 100644 --- a/documentation/ADRs/002_evaluation_strategy.md +++ b/documentation/ADRs/002_evaluation_strategy.md @@ -4,81 +4,99 @@ |---------------------|-------------------| | Subject | Evaluation Strategy | | ADR Number | 002 | -| Status | Accepted| -| Author | Mihai, Xiaolong| -| Date | 31.10.2024 | +| Status | Proposed | +| Author | Xiaolong, Mihai| +| Date | 16.07.2025 | ## Context -The primary output of VIEWS is a panel of forecasts, which consist of temporal sequences of predictions for each observation at the corresponding level of analysis (LOA). These forecasts span a defined forecasting window and can represent values such as predicted fatalities, probabilities, quantiles, or sample vectors. +To ensure reliable and realistic model performance assessment, our forecasting framework supports both **offline** and **online** evaluation strategies. These strategies serve complementary purposes: offline evaluation simulates the forecasting process retrospectively, while online evaluation assesses actual deployed forecasts against observed data. 
-The machine learning models generating these predictions are trained on historical time-series data, supplemented by covariates that may be either known in advance (future features) or only available for training data (shifted features). Given the variety of models used, evaluation routines must remain model- and paradigm-agnostic to ensure consistency across different methodologies. +Both strategies are designed to work with time-series predictions and support multi-step forecast horizons, ensuring robustness across temporal scales and use cases. ## Decision -The evaluation strategy must be structured to assess predictive performance comprehensively. +We adopt a dual evaluation approach consisting of: +1. **Offline Evaluation:** Evaluating a model's performance on historical data, before deployment. + +2. **Online Evaluation:** The ongoing process of evaluating a deployed model's performance as new, real-world data becomes available. + ### Points of Definition: -1. *Time*: All time points and horizons mentioned below are in **outcome space**, also known as $Y$-space : this means that they refer to the time point of the (forecasted or observed) outcome. This is especially important for such models where the feature-space and outcome-space are shifted and refer to different time points. +- **Rolling-Origin Holdout:** A robust backtesting strategy that simulates a real-world forecasting scenario by generating forecasts from multiple, rolling time origins. -2. *Temporal resolution*: The temporal resolution of VIEWS is the calendar-month. These are referred in VIEWS by an ordinal (Julian) month identifier (`month_id`) which is a serial numeric identifier with a reference epoch (month 0) of December 1979. For control purposes, January 2024 is month 529. VIEWS does not define behavior and does not have the ability to store data prior to the reference epoch (with negative `month_id`). Conflict history data, which marks the earliest possible meaningful start of the training time-series, is available from `month_id==109`. +- **Forecast Steps:** The time increment between predictions within a \textbf{sequence} of forecasts (further referred to as steps). -3. *Forecasting Steps* (further referred to as steps) is defined as the 1-indexed number of months from the start of a forecast time-series. +- **Sequence:** An ordered set of data points indexed by time. -### General Evaluation Strategy +### Diagram ![path](../img/approach.png) -The general evaluation strategy involves *training* one model on a time-series that goes up to the training horizon $H_0$. This sequence is then used to predict a number of sequences (time-series). The first such sequence goes from $H_{0+1}$ to $H_{0+36}$, thus containing 36 forecasted values -- i.e. 36 months. The next one goes from $H_{0+2}$ to $H_{0+37}$. This is repeated until we reach a constant stop-point $k$ such that the last sequence forecasted is $H_{0+k+1}$ to $H_{0+k+36}$. +### Offline Evaluation +We adopt a **rolling-origin holdout evaluation strategy** for all offline (backtesting) evaluations. -Normally, it is up to the modeller whether the model performs *expanding window* or *rolling window* evaluation, since *how* prediction is carried out all evaluations are of the *expanding window forecasting* type, i.e. the training window. +The offline evaluation strategy involves +1. **A single model** is trained on historical data up to training cutoff $H_0$. +2. 
Using this trained model object, a forecast is generated for the next **36 months**: + - Sequence 1: $H_{0+1}$ -> $H_{0+36}$ +3. The origin is then rolled forward by one month, and another forecast is generated: + - Sequence 2: $H_{0+2}$ -> $H_{0+37}$ +4. This process continues until a fixed number of sequences **k** is reached. +5. In our standardized offline evaluation, **12 forecast sequences** are used (i.e., k = 12). -#### Live evaluation +It is important to note that **offline evaluation is not a true forecast**. Instead it is a simulation using historical data from the **Validation Partition** to approximate forecasting performance under realistic, rolling deployment conditions. (See [ADR TBD] for data partitioning strategy.) -For **live** evaluation, we suggest doing this in the same way as has been done for VIEWS2020/FCDO (_confirm with HH and/or Mike_), i.e. predict to k=12, resulting in *12* time series over a prediction window of *48* months. We call this the evaluation partition end $H_{e,live}$. This gives a prediction horizon of 48 months, thus $H_{47}$ in our notation. -Note that this is **not** the final version of online evaluation. +### Online Evaluation +Online evaluation reflects **true forecasting** and is based on the **Forecasting Partition** -#### Offline evaluation +Suppose the latest available data point is $H_{36}$. Over time, the system would have generated the following forecast sequences: +- Sequence 1: forecast for $H_{1}$ → $H_{36}$, generated at time **t = 0** +- Sequence 2: forecast for $H_{2}$ → $H_{37}$, generated at **t = 1** +- ... +- Sequence 36: forecast for $H_{36}$ → $H_{71}$, generated at **t = 35** -For **offline** model evaluation, we suggest doing this in a way that simulates production over a longer time-span. For this, a new model is trained at every **twelve** months interval, thus resetting $H_0$ at months $H_{0+0}, H_{0+12}, H_{0+24}, \dots H_{0+12r}$ where $12r=H_e$. +At time $H_{36}$, we evaluate all forecasts made for $H_{36}$, i.e., the predictions from each of these 36 sequences are compared to the true value observed at $H_{36}$. -The default way is to set $H_{e_eval}$ to 48 months, meaning we only train the model once at $H_0$. This will result in **12** time series. We call it **standard** evaluation. +This provides a comprehensive view of how well the deployed model performs across multiple forecast origins and steps. -We also propose the following practical approaches: +## Consequences -1. A **long** evaluation where we set $H_{e_eval}$ to 72 months. This will result in *36* predicted time-series. - -2. A **complete** evaluation system, the longest one, where we set $H_0$ at 36 months of data (157 for models depending on UCDP GED), and iterate until the end of data (currently, the final $H_0$ will be 529). +**Positive Effects:** +- Reflects realistic deployment and monitoring conditions. -For comparability and abstraction of seasonality (which is inherent in both the DGP as well as the conflict data we rely on, due to their definition), $H_0$ should always be December or June (this also adds convenience). +- Allows for evaluation across multiple forecast origins and time horizons. -The three approaches have trade-offs besides increasing computational complexity. Since conflict is not a stationary process, evaluation carried for long time-periods will prefer models that predict whatever stationary components exist in the DGP (and thus in the time-series). 
For example these may include salient factors such GDP, HDI, infant mortality etc.. Evaluation on such very long time-spans may substantially penalize models that predict more current event, due shorter term causes that were not so salient in the past. Examples of these may be the change in the taboo on inter-state war after 2014 and 2022 with Russia invading Ukraine. +- Improves robustness by capturing temporal variation in model performance. +**Negative Effects:** +- Requires careful alignment of sequences and forecast windows. -## Consequences +- May introduce computational overhead due to repeated evaluation across multiple origins. -**Positive Effects:** -- Standardized evaluation across models, ensuring comparability. +- Models must be capable of generalizing across slightly shifted input windows. -- Clear separation of live and offline evaluation, facilitating both operational monitoring and research validation. -**Negative Effects:** -- Increased computational demands for long and complete evaluations. +## Rationale +The dual evaluation setup strikes a balance between experimentation and real-world monitoring: -- Potential complexity in managing multiple evaluation strategies. +- **Offline evaluation** provides a controlled and reproducible environment for backtesting. +- **Online evaluation** reflects actual model behavior in production. + +For further technical details: +- See [ADR 004 – Evaluation Input Schema](https://github.com/views-platform/views-evaluation/blob/main/documentation/ADRs/004_evaluation_input_schema.md) +- See [ADR 003 – Metric Calculation](https://github.com/views-platform/views-evaluation/blob/main/documentation/ADRs/003_metric_calculation.md) -- Additional infrastructure requirements. -## Rationale -By structuring evaluation routines to be agnostic of the modeling approach, the framework ensures consistency in assessing predictive performance. Using multiple evaluation methodologies balances computational feasibility with robustness in performance assessment. ### Considerations -- Computational cost vs. granularity of evaluation results. +- Sequence length (currently 36 months) may need to be adjusted for different use cases (e.g., quarterly or annual models). + +- The number of sequences (k) can be tuned depending on evaluation budget or forecast range. -- Trade-offs between short-term and long-term predictive performance. +- Consider future support for probabilistic or uncertainty-aware forecasts in the same rolling evaluation framework. -- Ensuring reproducibility and scalability of evaluation routines. ## Feedback and Suggestions From 3291a8d2e2f296fbe49c3bd34d117f1db9b2e2f5 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:55:04 +0200 Subject: [PATCH 17/31] update adr --- documentation/ADRs/005_evaluation_output_schema.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/documentation/ADRs/005_evaluation_output_schema.md b/documentation/ADRs/005_evaluation_output_schema.md index b74ee40..e9d463c 100644 --- a/documentation/ADRs/005_evaluation_output_schema.md +++ b/documentation/ADRs/005_evaluation_output_schema.md @@ -11,8 +11,6 @@ ## Context As part of our model evaluation workflow, we generate comprehensive reports summarizing model performance across a range of metrics and time periods. These reports are intended primarily for comparing ensemble models against their constituent models and baselines. 
-We use the `views-evaluation` package for computing evaluation metrics, while the report generation logic resides in the `views-pipeline-core` package. This separation of concerns avoids circular dependency, but requires a well-defined schema for passing data between the two components. - ## Decision We define a standard output schema for model evaluation reports using two formats: @@ -22,7 +20,7 @@ We define a standard output schema for model evaluation reports using two format These files are stored in the `reports/` directory for each model within `views-models`. -To prevent a circular dependency between `views-evaluation` and `views-pipeline-core`, the `views-evaluation` package **does not** generate reports directly. Instead, it outputs intermediate results. These are then consumed by the reporting module in `views-pipeline-core` to generate final report files. +To prevent a circular dependency between `views-evaluation` and `views-pipeline-core`, the `views-evaluation` package returns the evaluation dictionary, and then `views-pipeline-core` continues saving it as a json file. ### Schema Overview (JSON) Each report follows a standardized JSON structure that includes: @@ -94,14 +92,13 @@ eval_validation_{conflict_type}_{timestamp}.json ## Rationale -Generating reports within `views-pipeline-core` ensures full control over rendering, formatting, and contextual customization (e.g., comparing different model families). By letting `views-evaluation` focus strictly on metrics and alignment logic, we maintain cleaner package boundaries. +Saving reports within `views-pipeline-core` ensures full control over rendering, formatting, and contextual customization (e.g., comparing different model families). By letting `views-evaluation` focus strictly on metrics and alignment logic, we maintain cleaner package boundaries. ### Considerations - This schema may evolve as we introduce new types of evaluation (e.g., correlation matrix). - - Reports are currently only generated for **ensemble models**, as comparison against constituent models is the primary use case. -Future extensibility (e.g., visual version diffs) should be considered when evolving the format. 
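To connect the output schema above with the pipeline side, the sketch below shows one way the returned report dictionary could be written to the `reports/` directory using the `eval_validation_{conflict_type}_{timestamp}.json` naming convention from ADR 005. The helper and its arguments are hypothetical, not an existing `views-pipeline-core` API.

```python
import json
from datetime import datetime
from pathlib import Path


def save_eval_report(report: dict, reports_dir: Path, conflict_type: str) -> Path:
    """Persist an evaluation report dict as JSON (illustrative helper, not pipeline code)."""
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    reports_dir.mkdir(parents=True, exist_ok=True)
    path = reports_dir / f"eval_validation_{conflict_type}_{timestamp}.json"
    # default=float converts numpy scalars produced by the metric functions.
    with open(path, "w") as f:
        json.dump(report, f, indent=2, default=float)
    return path
```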
From fe6d9710d7138177f8006789cfef8da6e7ac7a28 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:56:24 +0200 Subject: [PATCH 18/31] add eval dictionary generator --- views_evaluation/evaluation/__init__.py | 0 .../evaluation/evaluation_manager.py | 39 +++++---- views_evaluation/reports/__init__.py | 0 views_evaluation/reports/generator.py | 85 +++++++++++++++++++ 4 files changed, 106 insertions(+), 18 deletions(-) create mode 100644 views_evaluation/evaluation/__init__.py create mode 100644 views_evaluation/reports/__init__.py create mode 100644 views_evaluation/reports/generator.py diff --git a/views_evaluation/evaluation/__init__.py b/views_evaluation/evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 4e062af..8ab7b1c 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -10,6 +10,7 @@ POINT_METRIC_FUNCTIONS, UNCERTAINTY_METRIC_FUNCTIONS, ) +from views_evaluation.reports.generator import EvalReportGenerator logger = logging.getLogger(__name__) @@ -31,6 +32,7 @@ def __init__(self, metrics_list: list): self.metrics_list = metrics_list self.point_metric_functions = POINT_METRIC_FUNCTIONS self.uncertainty_metric_functions = UNCERTAINTY_METRIC_FUNCTIONS + self.evaluation_results = {} @staticmethod def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: @@ -430,7 +432,7 @@ def evaluate( actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str, - steps: List[int], + config: dict, **kwargs, ): """ @@ -440,27 +442,28 @@ def evaluate( actual (pd.DataFrame): The actual values. predictions (List[pd.DataFrame]): A list of DataFrames containing the predictions. target (str): The target column in the actual DataFrame. - steps (List[int]): The steps to evaluate. + config (dict): The configuration dictionary. """ EvaluationManager.validate_predictions(predictions, target) - actual, predictions = self._process_data(actual, predictions, target) - self.is_uncertainty = EvaluationManager.get_evaluation_type(predictions, f"pred_{target}") - - evaluation_results = {} - evaluation_results["month"] = self.month_wise_evaluation( - actual, predictions, target, self.is_uncertainty, **kwargs + self.actual, self.predictions = self._process_data(actual, predictions, target) + self.is_uncertainty = EvaluationManager.get_evaluation_type(self.predictions, f"pred_{target}") + + self.evaluation_results["month"] = self.month_wise_evaluation( + self.actual, self.predictions, target, self.is_uncertainty, **kwargs ) - evaluation_results["time_series"] = self.time_series_wise_evaluation( - actual, predictions, target, self.is_uncertainty, **kwargs + self.evaluation_results["time_series"] = self.time_series_wise_evaluation( + self.actual, self.predictions, target, self.is_uncertainty, **kwargs ) - evaluation_results["step"] = self.step_wise_evaluation( - actual, - predictions, - target, - steps, - self.is_uncertainty, - **kwargs, + self.evaluation_results["step"] = self.step_wise_evaluation( + self.actual, self.predictions, target, config["steps"], self.is_uncertainty, **kwargs, ) - return evaluation_results + return self.evaluation_results + + def generate_dict_report(self, config: dict, target: str, conflict_type: str): + """ + Generate a report of the evaluation results. 
+ """ + report_generator = EvalReportGenerator(config, target, conflict_type) + return report_generator.generate_eval_report_dict(self.predictions, self.evaluation_results["time_series"][1]) diff --git a/views_evaluation/reports/__init__.py b/views_evaluation/reports/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/views_evaluation/reports/generator.py b/views_evaluation/reports/generator.py new file mode 100644 index 0000000..1422f77 --- /dev/null +++ b/views_evaluation/reports/generator.py @@ -0,0 +1,85 @@ +import numpy as np +import pandas as pd + + +class EvalReportGenerator: + """Generate evaluation reports for ensemble or single model forecasts.""" + + def __init__(self, config: dict, target: str, conflict_type: str): + self.config = config + self.target = target + self.conflict_type = conflict_type + self.level = config.get("level") + self.run_type = config.get("run_type") + self.eval_type = config.get("eval_type") + self.is_ensemble = True if "models" in config else False + + def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): + """Return a dictionary with evaluation report data.""" + eval_report = { + "Target": self.target, + "Forecast Type": self._forecast_type(df_preds), + "Level of Analysis": self.level, + "Data Partition": self.run_type, + "Training Period": self._partition("train"), + "Testing Period": self._partition("test"), + "Forecast Horizon": len(self.config.get("steps", [])), + "Number of Rolling Origins": len(df_preds), + "Evaluation Results": [] + } + + eval_report["Evaluation Results"].append( + self._single_result( + "Ensemble" if self.is_ensemble else "Model", + self.config["name"], + df_eval_ts, + df_preds + ) + ) + + if self.is_ensemble: + from views_pipeline_core.managers.model import ModelPathManager + for model_name in self.config["models"]: + pm = ModelPathManager(model_name) + eval_report["Evaluation Results"].append( + self._single_result( + "Constituent", + model_name, + self._eval_ts(pm), + self._preds(pm, rolling_origin_number=len(df_preds)) + ) + ) + return eval_report + + def _forecast_type(self, df_preds: list[pd.DataFrame]): + from views_evaluation.evaluation.evaluation_manager import EvaluationManager + arr = [EvaluationManager.convert_to_array(df_pred, f"pred_{self.target}") for df_pred in df_preds] + return "point" if not EvaluationManager.get_evaluation_type(arr, f"pred_{self.target}") else "uncertainty" + + def _partition(self, key: str): + return self.config[self.run_type][key] + + def _eval_ts(self, pm): + from views_pipeline_core.files.utils import read_dataframe + path = pm._get_eval_file_paths(self.run_type, self.conflict_type)[0] + return read_dataframe(path) + + def _preds(self, pm, rolling_origin_number: int): + from views_pipeline_core.files.utils import read_dataframe + paths = pm._get_generated_predictions_data_file_paths(self.run_type)[:rolling_origin_number] + return [read_dataframe(path) for path in paths] + + def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame, df_preds: list[pd.DataFrame]): + # mse = df_eval_ts["MSE"].mean() # Add back after publishing latest version of views-evaluation + msle = np.sqrt(df_eval_ts["RMSLE"]).mean() + mean_pred = np.mean([df_pred[f"pred_{self.target}"].mean() for df_pred in df_preds]) + + return { + "Type": model_type, + "Model Name": model_name, + # "MSE": mse, + "MSLE": msle, + "mean prediction": mean_pred + } + + From a49b90ebded4ffe99f505c3c286389a523a7c5f5 Mon Sep 17 00:00:00 2001 From: xiaolongsun 
<95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:13:16 +0200 Subject: [PATCH 19/31] update adr --- documentation/ADRs/004_evaluation_input_schema.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentation/ADRs/004_evaluation_input_schema.md b/documentation/ADRs/004_evaluation_input_schema.md index 9115096..52499d7 100644 --- a/documentation/ADRs/004_evaluation_input_schema.md +++ b/documentation/ADRs/004_evaluation_input_schema.md @@ -20,7 +20,7 @@ The `evaluate` method accepts the following inputs: 1. A DataFrame of actual values, 2. A list of prediction DataFrames, 3. The target variable name, -4. The steps. +4. The model config. Both the actual and prediction DataFrames must use a multi-index of `(month_id, country_id/priogrid_gid)` and contain a column for the target variable. In the actuals DataFrame, this column must be named exactly as the target. In each prediction DataFrame, the predicted column must be named `f'pred_{target}'`. From 803198db144b52835c93524e796383890b833a8b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:13:51 +0200 Subject: [PATCH 20/31] revert --- .../evaluation/evaluation_manager.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 8ab7b1c..08a3b2b 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -32,7 +32,6 @@ def __init__(self, metrics_list: list): self.metrics_list = metrics_list self.point_metric_functions = POINT_METRIC_FUNCTIONS self.uncertainty_metric_functions = UNCERTAINTY_METRIC_FUNCTIONS - self.evaluation_results = {} @staticmethod def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: @@ -448,22 +447,16 @@ def evaluate( self.actual, self.predictions = self._process_data(actual, predictions, target) self.is_uncertainty = EvaluationManager.get_evaluation_type(self.predictions, f"pred_{target}") - self.evaluation_results["month"] = self.month_wise_evaluation( + evaluation_results = {} + evaluation_results["month"] = self.month_wise_evaluation( self.actual, self.predictions, target, self.is_uncertainty, **kwargs ) - self.evaluation_results["time_series"] = self.time_series_wise_evaluation( + evaluation_results["time_series"] = self.time_series_wise_evaluation( self.actual, self.predictions, target, self.is_uncertainty, **kwargs ) - self.evaluation_results["step"] = self.step_wise_evaluation( + evaluation_results["step"] = self.step_wise_evaluation( self.actual, self.predictions, target, config["steps"], self.is_uncertainty, **kwargs, ) - return self.evaluation_results - - def generate_dict_report(self, config: dict, target: str, conflict_type: str): - """ - Generate a report of the evaluation results. 
- """ - report_generator = EvalReportGenerator(config, target, conflict_type) - return report_generator.generate_eval_report_dict(self.predictions, self.evaluation_results["time_series"][1]) - + return evaluation_results + \ No newline at end of file From f08d90c0aa7ec2bdad0b5acb3be26ba39e891508 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:14:03 +0200 Subject: [PATCH 21/31] add msle --- .../evaluation/metric_calculators.py | 26 ++++++++++++++++++- views_evaluation/evaluation/metrics.py | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 5b5af16..842ecea 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -5,6 +5,7 @@ from sklearn.metrics import ( root_mean_squared_log_error, mean_squared_error, + mean_squared_log_error, average_precision_score, ) from scipy.stats import wasserstein_distance, pearsonr @@ -32,7 +33,29 @@ def calculate_mse( ) return mean_squared_error(actual_expanded, pred_values) - + + +def calculate_msle( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate Mean Squared Logarithmic Error (MSLE) for each prediction. + + Args: + matched_actual (pd.DataFrame): DataFrame containing actual values + matched_pred (pd.DataFrame): DataFrame containing predictions + target (str): The target column name + + Returns: + float: Average MSLE score + """ + actual_values = np.concatenate(matched_actual[target].values) + pred_values = np.concatenate(matched_pred[f"pred_{target}"].values) + actual_expanded = np.repeat( + actual_values, [len(x) for x in matched_pred[f"pred_{target}"]] + ) + return mean_squared_log_error(actual_expanded, pred_values) + def calculate_rmsle( matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str @@ -387,6 +410,7 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): POINT_METRIC_FUNCTIONS = { "MSE": calculate_mse, + "MSLE": calculate_msle, "RMSLE": calculate_rmsle, "CRPS": calculate_crps, "AP": calculate_ap, diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 001873c..77a799a 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -119,6 +119,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): """ MSE: Optional[float] = None + MSLE: Optional[float] = None RMSLE: Optional[float] = None CRPS: Optional[float] = None AP: Optional[float] = None From ff1b415e0c5cff64eb8e55593e53487e97c3b285 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 21 Jul 2025 15:14:24 +0200 Subject: [PATCH 22/31] update generator --- views_evaluation/reports/generator.py | 57 +++++++++++++-------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/views_evaluation/reports/generator.py b/views_evaluation/reports/generator.py index 1422f77..ba46894 100644 --- a/views_evaluation/reports/generator.py +++ b/views_evaluation/reports/generator.py @@ -13,10 +13,11 @@ def __init__(self, config: dict, target: str, conflict_type: str): self.run_type = config.get("run_type") self.eval_type = config.get("eval_type") self.is_ensemble = True if "models" in config else False + self.eval_report = {} def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): """Return a 
dictionary with evaluation report data.""" - eval_report = { + self.eval_report = { "Target": self.target, "Forecast Type": self._forecast_type(df_preds), "Level of Analysis": self.level, @@ -28,7 +29,7 @@ def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd "Evaluation Results": [] } - eval_report["Evaluation Results"].append( + self.eval_report["Evaluation Results"].append( self._single_result( "Ensemble" if self.is_ensemble else "Model", self.config["name"], @@ -36,20 +37,18 @@ def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd df_preds ) ) + return self.eval_report - if self.is_ensemble: - from views_pipeline_core.managers.model import ModelPathManager - for model_name in self.config["models"]: - pm = ModelPathManager(model_name) - eval_report["Evaluation Results"].append( - self._single_result( - "Constituent", - model_name, - self._eval_ts(pm), - self._preds(pm, rolling_origin_number=len(df_preds)) - ) - ) - return eval_report + def update_ensemble_eval_report(self, model_name, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): + self.eval_report["Evaluation Results"].append( + self._single_result( + "Constituent", + model_name, + df_eval_ts, + df_preds + ) + ) + return self.eval_report def _forecast_type(self, df_preds: list[pd.DataFrame]): from views_evaluation.evaluation.evaluation_manager import EvaluationManager @@ -59,27 +58,25 @@ def _forecast_type(self, df_preds: list[pd.DataFrame]): def _partition(self, key: str): return self.config[self.run_type][key] - def _eval_ts(self, pm): - from views_pipeline_core.files.utils import read_dataframe - path = pm._get_eval_file_paths(self.run_type, self.conflict_type)[0] - return read_dataframe(path) - - def _preds(self, pm, rolling_origin_number: int): - from views_pipeline_core.files.utils import read_dataframe - paths = pm._get_generated_predictions_data_file_paths(self.run_type)[:rolling_origin_number] - return [read_dataframe(path) for path in paths] - def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame, df_preds: list[pd.DataFrame]): - # mse = df_eval_ts["MSE"].mean() # Add back after publishing latest version of views-evaluation - msle = np.sqrt(df_eval_ts["RMSLE"]).mean() - mean_pred = np.mean([df_pred[f"pred_{self.target}"].mean() for df_pred in df_preds]) + from views_evaluation.evaluation.evaluation_manager import EvaluationManager + df_preds = [ + EvaluationManager.transform_data( + EvaluationManager.convert_to_array(df_pred, f"pred_{self.target}"), f"pred_{self.target}" + ) + for df_pred in df_preds + ] + mse = df_eval_ts["MSE"].mean() + msle = df_eval_ts["MSLE"].mean() + all_preds = np.concatenate([np.asarray(v).flatten() for df_pred in df_preds for v in df_pred[f"pred_{self.target}"]]) + mean_pred = np.mean(all_preds) return { "Type": model_type, "Model Name": model_name, - # "MSE": mse, + "MSE": mse, "MSLE": msle, - "mean prediction": mean_pred + r"$\bar{\hat{y}}$": mean_pred } From 72568de9eb34e35f442f85d38f1daafd6626e42e Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 28 Jul 2025 11:59:06 +0200 Subject: [PATCH 23/31] update logic --- .../evaluation/evaluation_manager.py | 67 ++++++++++++------- views_evaluation/reports/generator.py | 19 ++++-- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 08a3b2b..92fa4c5 100644 --- 
a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -43,20 +43,20 @@ def transform_data(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: for t in target: if t.startswith("ln") or t.startswith("pred_ln"): df[[t]] = df[[t]].applymap( - lambda x: ( - np.exp(x) - 1 - if isinstance(x, (list, np.ndarray)) - else np.exp(x) - 1 + lambda x: ( + np.exp(x) - 1 + if isinstance(x, (list, np.ndarray)) + else np.exp(x) - 1 + ) ) - ) elif t.startswith("lx") or t.startswith("pred_lx"): df[[t]] = df[[t]].applymap( - lambda x: ( - np.exp(x) - np.exp(100) - if isinstance(x, (list, np.ndarray)) - else np.exp(x) - np.exp(100) + lambda x: ( + np.exp(x) - np.exp(100) + if isinstance(x, (list, np.ndarray)) + else np.exp(x) - np.exp(100) + ) ) - ) elif t.startswith("lr") or t.startswith("pred_lr"): df[[t]] = df[[t]].applymap( lambda x: x if isinstance(x, (list, np.ndarray)) else x @@ -79,10 +79,14 @@ def convert_to_array(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame: converted = df.copy() if isinstance(target, str): target = [target] - + for t in target: converted[t] = converted[t].apply( - lambda x: x if isinstance(x, np.ndarray) else (np.array(x) if isinstance(x, list) else np.array([x])) + lambda x: ( + x + if isinstance(x, np.ndarray) + else (np.array(x) if isinstance(x, list) else np.array([x])) + ) ) return converted @@ -99,7 +103,7 @@ def convert_to_scalar(df: pd.DataFrame, target: str | list[str]) -> pd.DataFrame lambda x: np.mean(x) if isinstance(x, (list, np.ndarray)) else x ) return converted - + @staticmethod def get_evaluation_type(predictions: List[pd.DataFrame], target: str) -> bool: """ @@ -121,7 +125,6 @@ def get_evaluation_type(predictions: List[pd.DataFrame], target: str) -> bool: is_point = False uncertainty_length = None - for df in predictions: for value in df[target].values.flatten(): if not (isinstance(value, np.ndarray) or isinstance(value, list)): @@ -193,7 +196,7 @@ def _match_actual_pred( - matched_pred: pd.DataFrame aligned with actual. """ actual_target = actual[[target]] - aligned_actual, aligned_pred = actual_target.align(pred, join="inner") + aligned_actual, aligned_pred = actual_target.align(pred, join="inner") matched_actual = aligned_actual.reindex(index=aligned_pred.index) matched_actual[[target]] = actual_target @@ -230,7 +233,9 @@ def _split_dfs_by_step(dfs: list) -> list: return result_dfs - def _process_data(self, actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str): + def _process_data( + self, actual: pd.DataFrame, predictions: List[pd.DataFrame], target: str + ): """ Process the data for evaluation. """ @@ -239,7 +244,8 @@ def _process_data(self, actual: pd.DataFrame, predictions: List[pd.DataFrame], t ) predictions = [ EvaluationManager.transform_data( - EvaluationManager.convert_to_array(pred, f"pred_{target}"), f"pred_{target}" + EvaluationManager.convert_to_array(pred, f"pred_{target}"), + f"pred_{target}", ) for pred in predictions ] @@ -380,7 +386,7 @@ def month_wise_evaluation( """ pred_concat = pd.concat(predictions) month_range = pred_concat.index.get_level_values(0).unique() - month_start = int(month_range.min()) + month_start = int(month_range.min()) month_end = int(month_range.max()) if is_uncertainty: @@ -426,6 +432,13 @@ def month_wise_evaluation( PointEvaluationMetrics.evaluation_dict_to_dataframe(evaluation_dict), ) + def calculate_mean_prediction(self, predictions: List[pd.DataFrame], target: str, **kwargs): + """ + Calculate the mean prediction. 
+ """ + all_preds = np.concatenate([np.asarray(v).flatten() for df_pred in predictions for v in df_pred[f"pred_{target}"]]) + return np.mean(all_preds) + def evaluate( self, actual: pd.DataFrame, @@ -445,8 +458,10 @@ def evaluate( """ EvaluationManager.validate_predictions(predictions, target) self.actual, self.predictions = self._process_data(actual, predictions, target) - self.is_uncertainty = EvaluationManager.get_evaluation_type(self.predictions, f"pred_{target}") - + self.is_uncertainty = EvaluationManager.get_evaluation_type( + self.predictions, f"pred_{target}" + ) + evaluation_results = {} evaluation_results["month"] = self.month_wise_evaluation( self.actual, self.predictions, target, self.is_uncertainty, **kwargs @@ -455,8 +470,14 @@ def evaluate( self.actual, self.predictions, target, self.is_uncertainty, **kwargs ) evaluation_results["step"] = self.step_wise_evaluation( - self.actual, self.predictions, target, config["steps"], self.is_uncertainty, **kwargs, + self.actual, + self.predictions, + target, + config["steps"], + self.is_uncertainty, + **kwargs, + ) + evaluation_results["mean_prediction"] = self.calculate_mean_prediction( + self.predictions, target, **kwargs ) - return evaluation_results - \ No newline at end of file diff --git a/views_evaluation/reports/generator.py b/views_evaluation/reports/generator.py index ba46894..6f42c64 100644 --- a/views_evaluation/reports/generator.py +++ b/views_evaluation/reports/generator.py @@ -15,7 +15,7 @@ def __init__(self, config: dict, target: str, conflict_type: str): self.is_ensemble = True if "models" in config else False self.eval_report = {} - def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): + def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame, mean_prediction: float=None): """Return a dictionary with evaluation report data.""" self.eval_report = { "Target": self.target, @@ -34,18 +34,20 @@ def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd "Ensemble" if self.is_ensemble else "Model", self.config["name"], df_eval_ts, - df_preds + df_preds, + mean_prediction ) ) return self.eval_report - def update_ensemble_eval_report(self, model_name, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): + def update_ensemble_eval_report(self, model_name, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame, mean_prediction: float=None): self.eval_report["Evaluation Results"].append( self._single_result( "Constituent", model_name, df_eval_ts, - df_preds + df_preds, + mean_prediction ) ) return self.eval_report @@ -58,7 +60,7 @@ def _forecast_type(self, df_preds: list[pd.DataFrame]): def _partition(self, key: str): return self.config[self.run_type][key] - def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame, df_preds: list[pd.DataFrame]): + def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame, df_preds: list[pd.DataFrame], mean_prediction: float=None): from views_evaluation.evaluation.evaluation_manager import EvaluationManager df_preds = [ EvaluationManager.transform_data( @@ -68,8 +70,11 @@ def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFr ] mse = df_eval_ts["MSE"].mean() msle = df_eval_ts["MSLE"].mean() - all_preds = np.concatenate([np.asarray(v).flatten() for df_pred in df_preds for v in df_pred[f"pred_{self.target}"]]) - mean_pred = np.mean(all_preds) + if mean_prediction is None: + all_preds = 
np.concatenate([np.asarray(v).flatten() for df_pred in df_preds for v in df_pred[f"pred_{self.target}"]]) + mean_pred = np.mean(all_preds) + else: + mean_pred = mean_prediction return { "Type": model_type, From 36d2eb7f7a0eebeac751c808f2354fd2b7d839a3 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 28 Jul 2025 13:35:28 +0200 Subject: [PATCH 24/31] move mean prediction to metric --- .../evaluation/evaluation_manager.py | 11 +-------- .../evaluation/metric_calculators.py | 11 +++++++++ views_evaluation/evaluation/metrics.py | 3 ++- views_evaluation/reports/generator.py | 23 ++++--------------- 4 files changed, 18 insertions(+), 30 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 92fa4c5..95938b2 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -431,13 +431,6 @@ def month_wise_evaluation( evaluation_dict, PointEvaluationMetrics.evaluation_dict_to_dataframe(evaluation_dict), ) - - def calculate_mean_prediction(self, predictions: List[pd.DataFrame], target: str, **kwargs): - """ - Calculate the mean prediction. - """ - all_preds = np.concatenate([np.asarray(v).flatten() for df_pred in predictions for v in df_pred[f"pred_{target}"]]) - return np.mean(all_preds) def evaluate( self, @@ -477,7 +470,5 @@ def evaluate( self.is_uncertainty, **kwargs, ) - evaluation_results["mean_prediction"] = self.calculate_mean_prediction( - self.predictions, target, **kwargs - ) + return evaluation_results diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 842ecea..37d6fa1 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -408,6 +408,15 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): return np.mean(scores) +def calculate_mean_prediction( + matched_actual: pd.DataFrame, matched_pred: pd.DataFrame, target: str +) -> float: + """ + Calculate the mean prediction. 
+ """ + all_preds = np.concatenate([np.asarray(v).flatten() for v in matched_pred[f"pred_{target}"]]) + return np.mean(all_preds) + POINT_METRIC_FUNCTIONS = { "MSE": calculate_mse, "MSLE": calculate_msle, @@ -419,6 +428,7 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): "pEMDiv": calculate_pEMDiv, "Pearson": calculate_pearson, "Variogram": calculate_variogram, + "Mean_Prediction": calculate_mean_prediction, } UNCERTAINTY_METRIC_FUNCTIONS = { @@ -429,4 +439,5 @@ def _calculate_ignorance_score(predictions, observed, n, all_bins): "Jeffreys": calculate_jeffreys, "Coverage": calculate_coverage, "pEMDiv": calculate_pEMDiv, + "Mean_Prediction": calculate_mean_prediction, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 77a799a..6988348 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -128,6 +128,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): pEMDiv: Optional[float] = None Pearson: Optional[float] = None Variogram: Optional[float] = None + Mean_Prediction: Optional[float] = None @dataclass @@ -146,5 +147,5 @@ class UncertaintyEvaluationMetrics(BaseEvaluationMetrics): pEMDiv: Optional[float] = None Brier: Optional[float] = None Jeffreys: Optional[float] = None - + Mean_Prediction: Optional[float] = None \ No newline at end of file diff --git a/views_evaluation/reports/generator.py b/views_evaluation/reports/generator.py index 6f42c64..b84dde8 100644 --- a/views_evaluation/reports/generator.py +++ b/views_evaluation/reports/generator.py @@ -15,7 +15,7 @@ def __init__(self, config: dict, target: str, conflict_type: str): self.is_ensemble = True if "models" in config else False self.eval_report = {} - def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame, mean_prediction: float=None): + def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame): """Return a dictionary with evaluation report data.""" self.eval_report = { "Target": self.target, @@ -34,20 +34,16 @@ def generate_eval_report_dict(self, df_preds: list[pd.DataFrame], df_eval_ts: pd "Ensemble" if self.is_ensemble else "Model", self.config["name"], df_eval_ts, - df_preds, - mean_prediction ) ) return self.eval_report - def update_ensemble_eval_report(self, model_name, df_preds: list[pd.DataFrame], df_eval_ts: pd.DataFrame, mean_prediction: float=None): + def update_ensemble_eval_report(self, model_name, df_eval_ts: pd.DataFrame): self.eval_report["Evaluation Results"].append( self._single_result( "Constituent", model_name, df_eval_ts, - df_preds, - mean_prediction ) ) return self.eval_report @@ -60,21 +56,10 @@ def _forecast_type(self, df_preds: list[pd.DataFrame]): def _partition(self, key: str): return self.config[self.run_type][key] - def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame, df_preds: list[pd.DataFrame], mean_prediction: float=None): - from views_evaluation.evaluation.evaluation_manager import EvaluationManager - df_preds = [ - EvaluationManager.transform_data( - EvaluationManager.convert_to_array(df_pred, f"pred_{self.target}"), f"pred_{self.target}" - ) - for df_pred in df_preds - ] + def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame): mse = df_eval_ts["MSE"].mean() msle = df_eval_ts["MSLE"].mean() - if mean_prediction is None: - all_preds = np.concatenate([np.asarray(v).flatten() for df_pred in df_preds for v in df_pred[f"pred_{self.target}"]]) - mean_pred = 
np.mean(all_preds) - else: - mean_pred = mean_prediction + mean_pred = df_eval_ts["Mean_Prediction"].mean() return { "Type": model_type, From 1028a390a02ab807613aaad85c5d418d9a30918f Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 28 Jul 2025 14:20:05 +0200 Subject: [PATCH 25/31] rename to y_hat_bar --- views_evaluation/evaluation/evaluation_manager.py | 1 - views_evaluation/evaluation/metric_calculators.py | 4 ++-- views_evaluation/evaluation/metrics.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 95938b2..3bf27e2 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -10,7 +10,6 @@ POINT_METRIC_FUNCTIONS, UNCERTAINTY_METRIC_FUNCTIONS, ) -from views_evaluation.reports.generator import EvalReportGenerator logger = logging.getLogger(__name__) diff --git a/views_evaluation/evaluation/metric_calculators.py b/views_evaluation/evaluation/metric_calculators.py index 37d6fa1..1bc7523 100644 --- a/views_evaluation/evaluation/metric_calculators.py +++ b/views_evaluation/evaluation/metric_calculators.py @@ -428,7 +428,7 @@ def calculate_mean_prediction( "pEMDiv": calculate_pEMDiv, "Pearson": calculate_pearson, "Variogram": calculate_variogram, - "Mean_Prediction": calculate_mean_prediction, + "y_hat_bar": calculate_mean_prediction, } UNCERTAINTY_METRIC_FUNCTIONS = { @@ -439,5 +439,5 @@ def calculate_mean_prediction( "Jeffreys": calculate_jeffreys, "Coverage": calculate_coverage, "pEMDiv": calculate_pEMDiv, - "Mean_Prediction": calculate_mean_prediction, + "y_hat_bar": calculate_mean_prediction, } diff --git a/views_evaluation/evaluation/metrics.py b/views_evaluation/evaluation/metrics.py index 6988348..a7dcf33 100644 --- a/views_evaluation/evaluation/metrics.py +++ b/views_evaluation/evaluation/metrics.py @@ -128,7 +128,7 @@ class PointEvaluationMetrics(BaseEvaluationMetrics): pEMDiv: Optional[float] = None Pearson: Optional[float] = None Variogram: Optional[float] = None - Mean_Prediction: Optional[float] = None + y_hat_bar: Optional[float] = None @dataclass @@ -147,5 +147,5 @@ class UncertaintyEvaluationMetrics(BaseEvaluationMetrics): pEMDiv: Optional[float] = None Brier: Optional[float] = None Jeffreys: Optional[float] = None - Mean_Prediction: Optional[float] = None + y_hat_bar: Optional[float] = None \ No newline at end of file From f9829a406eb733a67b07c32401567a8b051d752b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:01:26 +0200 Subject: [PATCH 26/31] update index querying --- views_evaluation/evaluation/evaluation_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 3bf27e2..0a714cf 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -412,8 +412,8 @@ def month_wise_evaluation( level=matched_pred.index.names[0] ).apply( lambda df: metric_functions[metric]( - matched_actual.loc[df.index, [target]], - matched_pred.loc[df.index, [f"pred_{target}"]], + matched_actual.loc[df.index.unique(), [target]], + matched_pred.loc[df.index.unique(), [f"pred_{target}"]], target, **kwargs, ) From a192141ae62c8cc34f81d7fe661d9751c826e971 Mon Sep 17 00:00:00 2001 From: xiaolongsun 
<95378566+xiaolong0728@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:01:40 +0200 Subject: [PATCH 27/31] update name --- views_evaluation/reports/generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/views_evaluation/reports/generator.py b/views_evaluation/reports/generator.py index b84dde8..30feeab 100644 --- a/views_evaluation/reports/generator.py +++ b/views_evaluation/reports/generator.py @@ -59,7 +59,7 @@ def _partition(self, key: str): def _single_result(self, model_type: str, model_name: str, df_eval_ts: pd.DataFrame): mse = df_eval_ts["MSE"].mean() msle = df_eval_ts["MSLE"].mean() - mean_pred = df_eval_ts["Mean_Prediction"].mean() + mean_pred = df_eval_ts["y_hat_bar"].mean() return { "Type": model_type, From cc9637bfcd3894aeb6f9ed74cfa4331b8ec0628b Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Wed, 30 Jul 2025 11:45:41 +0200 Subject: [PATCH 28/31] update quickstart --- examples/quickstart.ipynb | 75 +++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/examples/quickstart.ipynb b/examples/quickstart.ipynb index 1b7323e..c7246f1 100644 --- a/examples/quickstart.ipynb +++ b/examples/quickstart.ipynb @@ -53,13 +53,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", - "from views_evaluation.evaluation.evaluation_manager import EvaluationManager" + "from views_evaluation.evaluation.evaluation_manager import EvaluationManager\n" ] }, { @@ -73,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -176,13 +176,13 @@ } ], "source": [ - "steps = [1, 2]\n", - "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', steps=steps)" + "config = {\"steps\": [1, 2]}\n", + "point_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_point, target='lr_target', config=config)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -200,7 +200,7 @@ " ts01 0.420849 2.0)" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -218,32 +218,26 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "Metric RMSLE is not a default metric, skipping...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "Metric RMSLE is not a default metric, skipping...\n", "Metric RMSLE is not a default metric, skipping...\n", "Metric RMSLE is not a default metric, skipping...\n" ] } ], "source": [ - "uncertainty_evaluation_results = evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', steps=steps)" + "uncertainty_evaluation_results = 
evaluation_manager.evaluate(df_actual, dfs_uncertainty, target='lr_target', config=config)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -261,7 +255,7 @@ " ts01 3.611111 107.8)" ] }, - "execution_count": 24, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -279,9 +273,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Metric MIS is not a default metric, skipping...\n" + ] + } + ], "source": [ "# Get the evaluation type, i.e., uncertainty or point\n", "actual = EvaluationManager.transform_data(\n", @@ -293,13 +295,13 @@ " )\n", " for pred in dfs_point\n", "]\n", - "is_uncertainty = EvaluationManager.get_evaluation_type(predictions)\n", + "is_uncertainty = EvaluationManager.get_evaluation_type(predictions, 'pred_lr_target')\n", "month_point_evaluation_results = evaluation_manager.month_wise_evaluation(actual, predictions, target='lr_target', is_uncertainty=is_uncertainty)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -317,6 +319,27 @@ "print(month_point_evaluation_results[1])" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'step01': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.18203984406117593, CRPS=0.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None),\n", + " 'step02': PointEvaluationMetrics(MSE=None, MSLE=None, RMSLE=0.636311445241193, CRPS=3.5, AP=None, EMD=None, SD=None, pEMDiv=None, Pearson=None, Variogram=None, y_hat_bar=None)}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "point_evaluation_results['step'][0]" + ] + }, { "cell_type": "code", "execution_count": null, From 76343a9d042d3f77c48b1f5f1a3f2d9c518d7590 Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Wed, 30 Jul 2025 11:45:59 +0200 Subject: [PATCH 29/31] add post analysis --- .../evaluation/evaluation_manager.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 0a714cf..39afe63 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -3,6 +3,7 @@ import pandas as pd import numpy as np from views_evaluation.evaluation.metrics import ( + BaseEvaluationMetrics, PointEvaluationMetrics, UncertaintyEvaluationMetrics, ) @@ -471,3 +472,120 @@ def evaluate( ) return evaluation_results + + @staticmethod + def filter_step_wise_evaluation( + step_wise_evaluation_results: dict, + filter_steps: list[int] = [1, 3, 6, 12, 36], + ): + """ + Filter step-wise evaluation results to include only specific steps. + + Args: + step_wise_evaluation_results (dict): The step-wise evaluation results containing evaluation dict and DataFrame. + filter_steps (list[int]): List of step numbers to include in the filtered results. Defaults to [1, 3, 6, 12, 36]. + + Returns: + dict: A dictionary containing the filtered evaluation dictionary and DataFrame for the selected steps. 
+ """ + step_wise_evaluation_dict = step_wise_evaluation_results[0] + step_wise_evaluation_df = step_wise_evaluation_results[1] + + selected_keys = [f"step{str(step).zfill(2)}" for step in filter_steps] + + filtered_evaluation_dict = { + key: step_wise_evaluation_dict[key] + for key in selected_keys + if key in step_wise_evaluation_dict + } + + filtered_evaluation_df = step_wise_evaluation_df.loc[ + step_wise_evaluation_df.index.isin(selected_keys) + ] + + return (filtered_evaluation_dict, filtered_evaluation_df) + + @staticmethod + def aggregate_month_wise_evaluation( + month_wise_evaluation_results: dict, + aggregation_period: int = 6, + aggregation_type: str = "mean", + ): + """ + Aggregate month-wise evaluation results by grouping months into periods and applying aggregation. + + Args: + month_wise_evaluation_results (dict): The month-wise evaluation results containing evaluation dict and DataFrame. + aggregation_period (int): Number of months to group together for aggregation. + aggregation_type (str): Type of aggregation to apply. + Returns: + dict: A dictionary containing the aggregated evaluation dictionary and DataFrame. + """ + month_wise_evaluation_dict = month_wise_evaluation_results[0] + month_wise_evaluation_df = month_wise_evaluation_results[1] + + available_months = [ + int(month.replace("month", "")) for month in month_wise_evaluation_df.index + ] + available_months.sort() + + if len(available_months) < aggregation_period: + raise ValueError( + f"Not enough months to aggregate. Available months: {available_months}, aggregation period: {aggregation_period}" + ) + + aggregated_dict = {} + aggregated_data = [] + + for i in range(0, len(available_months), aggregation_period): + period_months = available_months[i : i + aggregation_period] + period_start = period_months[0] + period_end = period_months[-1] + period_key = f"month_{period_start}_{period_end}" + + period_metrics = [] + for month in period_months: + month_key = f"month{month}" + if month_key in month_wise_evaluation_dict: + period_metrics.append(month_wise_evaluation_dict[month_key]) + + if period_metrics: + aggregated_metrics = {} + for metric_name in period_metrics[0].__annotations__.keys(): + metric_values = [ + getattr(metric, metric_name) + for metric in period_metrics + if getattr(metric, metric_name) is not None + ] + + if metric_values: + if aggregation_type == "mean": + aggregated_value = np.mean(metric_values) + elif aggregation_type == "median": + aggregated_value = np.median(metric_values) + else: + raise ValueError( + f"Unsupported aggregation type: {aggregation_type}" + ) + + aggregated_metrics[metric_name] = aggregated_value + else: + aggregated_metrics[metric_name] = None + + if hasattr(period_metrics[0], "__class__"): + aggregated_eval_metrics = period_metrics[0].__class__( + **aggregated_metrics + ) + else: + aggregated_eval_metrics = aggregated_metrics + + aggregated_dict[period_key] = aggregated_eval_metrics + + aggregated_data.append({"month_id": period_key, **aggregated_metrics}) + + if aggregated_data: + aggregated_df = BaseEvaluationMetrics.evaluation_dict_to_dataframe( + aggregated_dict + ) + + return (aggregated_dict, aggregated_df) From 43e9f75ccbadc67883ada80bbf1b501d3f33717c Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:31:18 +0200 Subject: [PATCH 30/31] update match_actual_pred to deal with missing countries and duplicated --- views_evaluation/evaluation/evaluation_manager.py | 11 +++++++---- 1 file changed, 7 
insertions(+), 4 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 39afe63..7a33607 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -196,11 +196,14 @@ def _match_actual_pred( - matched_pred: pd.DataFrame aligned with actual. """ actual_target = actual[[target]] - aligned_actual, aligned_pred = actual_target.align(pred, join="inner") - matched_actual = aligned_actual.reindex(index=aligned_pred.index) - matched_actual[[target]] = actual_target + # Get indices from pred that exist in actual_target, preserving duplicates + mask = pred.index.isin(actual_target.index) + common_indices = pred.index[mask] + matched_actual = actual_target.reindex(common_indices).sort_index() + matched_pred = pred.reindex(common_indices).sort_index() + + return matched_actual, matched_pred - return matched_actual.sort_index(), pred.sort_index() @staticmethod def _split_dfs_by_step(dfs: list) -> list: From 7270606996d61428be0853d5505ab360221128dd Mon Sep 17 00:00:00 2001 From: xiaolongsun <95378566+xiaolong0728@users.noreply.github.com> Date: Fri, 1 Aug 2025 11:01:20 +0200 Subject: [PATCH 31/31] refactor index matching in EvaluationManager to improve handling of actual and predicted data alignment --- views_evaluation/evaluation/evaluation_manager.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/views_evaluation/evaluation/evaluation_manager.py b/views_evaluation/evaluation/evaluation_manager.py index 7a33607..6f8371d 100644 --- a/views_evaluation/evaluation/evaluation_manager.py +++ b/views_evaluation/evaluation/evaluation_manager.py @@ -196,12 +196,16 @@ def _match_actual_pred( - matched_pred: pd.DataFrame aligned with actual. """ actual_target = actual[[target]] - # Get indices from pred that exist in actual_target, preserving duplicates - mask = pred.index.isin(actual_target.index) - common_indices = pred.index[mask] - matched_actual = actual_target.reindex(common_indices).sort_index() - matched_pred = pred.reindex(common_indices).sort_index() + common_indices = actual_target.index.intersection(pred.index) + matched_pred = pred[pred.index.isin(common_indices)].copy() + # Create matched_actual by reindexing actual_target to match pred's index structure + # This will duplicate rows in actual where pred has duplicate indices + matched_actual = actual_target.reindex(matched_pred.index) + + matched_actual = matched_actual.sort_index() + matched_pred = matched_pred.sort_index() + return matched_actual, matched_pred
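A small usage sketch of the behavior introduced by the final patch: the refactored _match_actual_pred keeps only indices present in the actuals and preserves duplicated prediction rows. The toy values and the direct call to the private helper (via an EvaluationManager instance constructed with an arbitrary metrics_list) are for illustration only.

import pandas as pd

from views_evaluation.evaluation.evaluation_manager import EvaluationManager

# Toy (month_id, country_id) data: country 3 has a prediction but no actual,
# and (101, 1) is duplicated on the prediction side.
actual = pd.DataFrame(
    {"target": [0.0, 1.0, 2.0]},
    index=pd.MultiIndex.from_tuples(
        [(100, 1), (100, 2), (101, 1)], names=["month_id", "country_id"]
    ),
)
pred = pd.DataFrame(
    {"pred_target": [0.1, 1.1, 2.1, 2.2, 9.9]},
    index=pd.MultiIndex.from_tuples(
        [(100, 1), (100, 2), (101, 1), (101, 1), (101, 3)],
        names=["month_id", "country_id"],
    ),
)

em = EvaluationManager(metrics_list=["MSE"])
matched_actual, matched_pred = em._match_actual_pred(actual, pred, "target")
# (101, 3) is dropped because it has no actual value; (101, 1) appears twice in
# both frames, so actuals and predictions stay row-aligned after sorting.
print(matched_actual)
print(matched_pred)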