electronsandstuff · electronsandstuff · Feb 20, 2025 · Feb 18, 2025 · Feb 18, 2025 · Feb 18, 2025
diff --git a/src/paretobench/analyze_metrics.py b/src/paretobench/analyze_metrics.py
@@ -84,14 +84,30 @@ def apply_feval_cutoff(df: pd.DataFrame, max_feval: int = 300) -> pd.DataFrame:
 
     Parameters
     ----------
-    df : Dataframe
+    df : DataFrame
         The metric data
     max_feval : int, optional
         Cutoff of how many function evaluations are allowed, by default 300
+
+    Returns
+    -------
+    DataFrame
+        Filtered DataFrame containing one row per group with the largest fevals value not exceeding max_feval
     """
+    # Filter to rows within max_feval
     df = df[df["fevals"] <= max_feval]
-    idx = df.groupby(by=["run_idx", "exp_idx", "problem"])["fevals"].idxmax()
-    return df.loc[idx].reset_index(drop=True)
+
+    # Grab the row with the largest value of `fevals` within each group
+    df = df[df.groupby(["run_idx", "exp_idx", "problem"])["fevals"].transform("max") == df["fevals"]]
+
+    # Check for duplicate maximum values within groups
+    counts = df.groupby(["run_idx", "exp_idx", "problem"]).size()
+    if (counts > 1).any():
+        raise ValueError("At least one evaluation has a duplicate value for `fevals`")
+
+    # Fix the index
+    df = df.reset_index(drop=True)
+    return df
 
 
 def aggregate_metrics_feval_budget(
@@ -151,49 +167,53 @@ def aggregate_metrics_feval_budget(
     df["problem"] = df.apply(lambda x: normalize_problem_name(x["problem"]), axis=1)
     df = apply_feval_cutoff(df, max_feval)
 
-    def get_wilcoxon_comparison(x, metric):
+    def get_wilcoxon_comparison(our_vals, metric):
         """
         Given the grouped evaluations for this problem and run, compare ourself with the "reference" run
         """
         # If we are the reference run don't perform the comparison against ourself
-        if df.loc[x.index[0]]["exp_idx"] == wilcoxon_idx:
+        if df.loc[our_vals.index[0]]["exp_idx"] == wilcoxon_idx:
             return ""
 
         # Get the problem name
-        problem = df.loc[x.index[0]]["problem"]
+        problem = df.loc[our_vals.index[0]]["problem"]
 
         # Get the values for the metric on this problem from the container we are comparing agianst
-        y = df.loc[(df["exp_idx"] == wilcoxon_idx) & (df["problem"] == problem)][metric]
+        other_vals = df[(df["exp_idx"] == wilcoxon_idx) & (df["problem"] == problem)][metric]
 
         # Use the stats test to compare values
-        if ranksums(x.to_numpy(), y.to_numpy(), "less")[1] < wilcoxon_p:
+        if ranksums(our_vals.to_numpy(), other_vals.to_numpy(), "less")[1] < wilcoxon_p:
             return {"-": "+", "+": "-"}[directions[metric]]
-        if ranksums(x.to_numpy(), y.to_numpy(), "greater")[1] < wilcoxon_p:
+        if ranksums(our_vals.to_numpy(), other_vals.to_numpy(), "greater")[1] < wilcoxon_p:
             return directions[metric]
         return "="
 
-    def is_best(x, metric):
+    def is_best(our_vals, metric):
         """
         Given one of the groupby objects, is this collection of evaluations one of the best in the table for this problem.
         """
         # Get the problem name
-        problem = df.loc[x.index[0]]["problem"]
+        problem = df.loc[our_vals.index[0]]["problem"]
 
-        # Go through each container figuring out if we are the best
-        for n in df["exp_idx"].unique().tolist():
-            y = df.loc[(df["exp_idx"] == n) & (df["problem"] == problem)][metric]
+        # Compare against every other container
+        for exp_idx in df["exp_idx"].unique().tolist():
+            # Get all of the values of this metric for the container we are comparing against and this problem
+            other_vals = df[(df["exp_idx"] == exp_idx) & (df["problem"] == problem)][metric]
 
-            if not len(y):
+            # Perform the rank-sum test and if the other is better than us, we are not the best
+            if not len(other_vals):
                 continue
             if (
                 ranksums(
-                    x.to_numpy(),
-                    y.to_numpy(),
+                    our_vals.to_numpy(),
+                    other_vals.to_numpy(),
                     {"-": "greater", "+": "less"}[directions[metric]],
                 )[1]
                 < wilcoxon_p
             ):
                 return False
+
+        # We could not find a container which beat us, we are among the best
         return True
 
     # Get metric names and validate
@@ -220,7 +240,9 @@ def is_best(x, metric):
 
     # Apply the aggregation and return
     by = ["problem", "exp_idx"]
-    by.append("fname")
+
+    if not df["fname"].isna().any():
+        by.append("fname")
     if "exp_name" in df.columns:
         by.append("exp_name")
     return df.groupby(by).agg(agg_funs)
@@ -503,14 +525,20 @@ def comparison_table_to_latex(df: pd.DataFrame) -> str:
     latex_str = df.to_latex(multirow=True, escape=False, index=True).replace("multirow[t]", "multirow")
 
     # Count the number of each comparison for the columns and construct the summary to go at the bottom
-    summary_str = r" \multicolumn{1}{c}{%d/%d/%d} "
+    # Do this by counting the characters at the end of each of the cells.
+    def val_counts_to_summary_str(counts):  # counts: one column's tally of trailing "+"/"-"/"=" markers
+        n_plus = int(counts.get("+", 0))
+        n_minus = int(counts.get("-", 0))
+        n_equal = int(counts.get("=", 0))
+        if (n_minus + n_plus + n_equal) == 0:  # no markers counted for this column: leave the cell blank
+            return " "
+        return r" \multicolumn{1}{c}{%d/%d/%d} " % (n_plus, n_minus, n_equal)  # raw string: "\m" is not a valid escape
+
     comparisons = df.map(lambda x: (x[-1] if len(x) > 4 else "")).apply(pd.Series.value_counts).fillna(0)
-    comparisons = comparisons.apply(
-        lambda x: summary_str % (int(x.get("+", 0)), int(x.get("-", 0)), int(x.get("=", 0)))
-    )
+    comparisons = comparisons.apply(val_counts_to_summary_str)
 
     # Construct text for the final row
-    comparison_cells = [(r" \multicolumn{%d}{c}{+/-/=} " % df.index.nlevels)] + comparisons.values[:-1].tolist() + [" "]
+    comparison_cells = [(r" \multicolumn{%d}{c}{+/-/=} " % df.index.nlevels)] + comparisons.to_list()
     summary_line = "&".join(comparison_cells) + r"\\"
 
     # Bold and center the header
@@ -543,5 +571,12 @@ def comparison_table_to_latex(df: pd.DataFrame) -> str:
         lines.append(ln)
     latex_str = "\n".join(lines)
 
-    # Return it
-    return latex_str.replace("=", r"$\approx$")
+    # Keep the first 3 lines unchanged and replace "=" only in the remaining lines, to avoid
+    # problems with experiment names that contain an equals character
+    lines = latex_str.split("\n")
+    for i in range(len(lines)):
+        if i >= 3:  # Only process lines after the third line
+            lines[i] = lines[i].replace("=", r"$\approx$")
+
+    # Join the lines back together
+    return "\n".join(lines)
diff --git a/tests/test_analyze_metrics.py b/tests/test_analyze_metrics.py
@@ -107,12 +107,15 @@ def test_aggregate_metrics_stats_test():
             assert agg.loc[prob_norm, exp_idx].iloc[0][("test", "wilcoxon_best")] == (loc == min(run_locs[prob]))
 
 
-def test_feval_cutoff():
+@pytest.mark.parametrize(
+    "index_pattern",
+    ["sequential", "shuffled", "duplicated"],
+)
+def test_feval_cutoff_index_variants(index_pattern):
     """
     Test the cutoff function by making a table of values with a test metric equalling 1.0 only at fevals=7. The cutoff is made
     for fevals=7 and we check all the metric values.
     """
-    # Create a test dataframe (big nasty nested loop)
     rows = []
     for run_idx in range(16):
         for exp_idx in range(4):
@@ -129,9 +132,22 @@ def test_feval_cutoff():
                     )
     df = pd.DataFrame(rows)
 
-    # Run the cutoff function
+    if index_pattern == "sequential":
+        pass
+    elif index_pattern == "shuffled":
+        n_rows = len(df)
+        shuffled_idx = np.random.permutation(n_rows)
+        df.index = shuffled_idx
+    elif index_pattern == "duplicated":
+        n_rows = len(df)
+        duplicate_idx = np.repeat(range(n_rows // 2), 2)[:n_rows]
+        df.index = duplicate_idx
+
     df_cutoff = apply_feval_cutoff(df, max_feval=7)
+
     assert (df_cutoff["test"] == 1.0).all()
+    assert len(df_cutoff) == len(df[df["fevals"] == 7])
+    assert all(col in df_cutoff.columns for col in ["run_idx", "exp_idx", "problem", "fevals", "test"])
 
 
 @pytest.mark.parametrize(