Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 60 additions & 25 deletions src/paretobench/analyze_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,30 @@ def apply_feval_cutoff(df: pd.DataFrame, max_feval: int = 300) -> pd.DataFrame:

Parameters
----------
df : Dataframe
df : DataFrame
The metric data
max_feval : int, optional
Cutoff of how many function evaluations are allowed, by default 300

Returns
-------
DataFrame
Filtered DataFrame containing one row per group with the largest fevals value not exceeding max_feval
"""
# Filter to rows within max_feval
df = df[df["fevals"] <= max_feval]
idx = df.groupby(by=["run_idx", "exp_idx", "problem"])["fevals"].idxmax()
return df.loc[idx].reset_index(drop=True)

# Grab the row with the largest value of `fevals` within each group
df = df[df.groupby(["run_idx", "exp_idx", "problem"])["fevals"].transform("max") == df["fevals"]]

# Check for duplicate maximum values within groups
counts = df.groupby(["run_idx", "exp_idx", "problem"]).size()
if (counts > 1).any():
raise ValueError("At least one evaluation has a duplicate value for `fevals`")

# Fix the index
df = df.reset_index(drop=True)
return df


def aggregate_metrics_feval_budget(
Expand Down Expand Up @@ -151,49 +167,53 @@ def aggregate_metrics_feval_budget(
df["problem"] = df.apply(lambda x: normalize_problem_name(x["problem"]), axis=1)
df = apply_feval_cutoff(df, max_feval)

def get_wilcoxon_comparison(x, metric):
def get_wilcoxon_comparison(our_vals, metric):
"""
Given the grouped evaluations for this problem and run, compare ourself with the "reference" run
"""
# If we are the reference run don't perform the comparison against ourself
if df.loc[x.index[0]]["exp_idx"] == wilcoxon_idx:
if df.loc[our_vals.index[0]]["exp_idx"] == wilcoxon_idx:
return ""

# Get the problem name
problem = df.loc[x.index[0]]["problem"]
problem = df.loc[our_vals.index[0]]["problem"]

# Get the values for the metric on this problem from the container we are comparing against
y = df.loc[(df["exp_idx"] == wilcoxon_idx) & (df["problem"] == problem)][metric]
other_vals = df[(df["exp_idx"] == wilcoxon_idx) & (df["problem"] == problem)][metric]

# Use the stats test to compare values
if ranksums(x.to_numpy(), y.to_numpy(), "less")[1] < wilcoxon_p:
if ranksums(our_vals.to_numpy(), other_vals.to_numpy(), "less")[1] < wilcoxon_p:
return {"-": "+", "+": "-"}[directions[metric]]
if ranksums(x.to_numpy(), y.to_numpy(), "greater")[1] < wilcoxon_p:
if ranksums(our_vals.to_numpy(), other_vals.to_numpy(), "greater")[1] < wilcoxon_p:
return directions[metric]
return "="

def is_best(x, metric):
def is_best(our_vals, metric):
"""
Given one of the groupby objects, is this collection of evaluations one of the best in the table for this problem.
"""
# Get the problem name
problem = df.loc[x.index[0]]["problem"]
problem = df.loc[our_vals.index[0]]["problem"]

# Go through each container figuring out if we are the best
for n in df["exp_idx"].unique().tolist():
y = df.loc[(df["exp_idx"] == n) & (df["problem"] == problem)][metric]
# Compare against every other container
for exp_idx in df["exp_idx"].unique().tolist():
# Get all of the values of this metric for the container we are comparing against and this problem
other_vals = df[(df["exp_idx"] == exp_idx) & (df["problem"] == problem)][metric]

if not len(y):
# Perform the rank-sum test and if the other is better than us, we are not the best
if not len(other_vals):
continue
if (
ranksums(
x.to_numpy(),
y.to_numpy(),
our_vals.to_numpy(),
other_vals.to_numpy(),
{"-": "greater", "+": "less"}[directions[metric]],
)[1]
< wilcoxon_p
):
return False

# We could not find a container which beat us, we are among the best
return True

# Get metric names and validate
Expand All @@ -220,7 +240,9 @@ def is_best(x, metric):

# Apply the aggregation and return
by = ["problem", "exp_idx"]
by.append("fname")

if not df["fname"].isna().any():
by.append("fname")
if "exp_name" in df.columns:
by.append("exp_name")
return df.groupby(by).agg(agg_funs)
Expand Down Expand Up @@ -503,14 +525,20 @@ def comparison_table_to_latex(df: pd.DataFrame) -> str:
latex_str = df.to_latex(multirow=True, escape=False, index=True).replace("multirow[t]", "multirow")

# Count the number of each comparison for the columns and construct the summary to go at the bottom
summary_str = r" \multicolumn{1}{c}{%d/%d/%d} "
# Do this by counting the characters at the end of each of the cells.
def val_counts_to_summary_str(counts):
    """
    Build the summary cell for one column from its comparison-symbol counts.

    Parameters
    ----------
    counts : mapping supporting ``.get``
        Number of occurrences of each of the symbols "+", "-" and "=" in the column. Missing symbols count as zero.

    Returns
    -------
    str
        A single-space cell when no symbol was counted, otherwise a centered ``\\multicolumn`` cell containing the
        counts formatted as ``plus/minus/equal``.
    """
    n_plus = int(counts.get("+", 0))
    n_minus = int(counts.get("-", 0))
    n_equal = int(counts.get("=", 0))

    # Columns without any comparison markers get an empty cell instead of "0/0/0"
    if (n_minus + n_plus + n_equal) == 0:
        return " "

    # Raw string: "\m" is not a valid escape sequence and triggers a SyntaxWarning in a normal string literal
    return r" \multicolumn{1}{c}{%d/%d/%d} " % (n_plus, n_minus, n_equal)

comparisons = df.map(lambda x: (x[-1] if len(x) > 4 else "")).apply(pd.Series.value_counts).fillna(0)
comparisons = comparisons.apply(
lambda x: summary_str % (int(x.get("+", 0)), int(x.get("-", 0)), int(x.get("=", 0)))
)
comparisons = comparisons.apply(val_counts_to_summary_str)

# Construct text for the final row
comparison_cells = [(r" \multicolumn{%d}{c}{+/-/=} " % df.index.nlevels)] + comparisons.values[:-1].tolist() + [" "]
comparison_cells = [(r" \multicolumn{%d}{c}{+/-/=} " % df.index.nlevels)] + comparisons.to_list()
summary_line = "&".join(comparison_cells) + r"\\"

# Bold and center the header
Expand Down Expand Up @@ -543,5 +571,12 @@ def comparison_table_to_latex(df: pd.DataFrame) -> str:
lines.append(ln)
latex_str = "\n".join(lines)

# Return it
return latex_str.replace("=", r"$\approx$")
# Keep the first 3 lines unchanged and replace "=" only in the remaining lines, to avoid problems with
# experiment names that have an equals character in them
lines = latex_str.split("\n")
for i in range(len(lines)):
if i >= 3: # Only process lines after the third line
lines[i] = lines[i].replace("=", r"$\approx$")

# Join the lines back together
return "\n".join(lines)
22 changes: 19 additions & 3 deletions tests/test_analyze_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,15 @@ def test_aggregate_metrics_stats_test():
assert agg.loc[prob_norm, exp_idx].iloc[0][("test", "wilcoxon_best")] == (loc == min(run_locs[prob]))


def test_feval_cutoff():
@pytest.mark.parametrize(
"index_pattern",
["sequential", "shuffled", "duplicated"],
)
def test_feval_cutoff_index_variants(index_pattern):
"""
Test the cutoff function by making a table of values with a test metric equalling 1.0 only at fevals=7. The cutoff is made
for fevals=7 and we check all the metric values.
"""
# Create a test dataframe (big nasty nested loop)
rows = []
for run_idx in range(16):
for exp_idx in range(4):
Expand All @@ -129,9 +132,22 @@ def test_feval_cutoff():
)
df = pd.DataFrame(rows)

# Run the cutoff function
if index_pattern == "sequential":
pass
elif index_pattern == "shuffled":
n_rows = len(df)
shuffled_idx = np.random.permutation(n_rows)
df.index = shuffled_idx
elif index_pattern == "duplicated":
n_rows = len(df)
duplicate_idx = np.repeat(range(n_rows // 2), 2)[:n_rows]
df.index = duplicate_idx

df_cutoff = apply_feval_cutoff(df, max_feval=7)

assert (df_cutoff["test"] == 1.0).all()
assert len(df_cutoff) == len(df[df["fevals"] == 7])
assert all(col in df_cutoff.columns for col in ["run_idx", "exp_idx", "problem", "fevals", "test"])


@pytest.mark.parametrize(
Expand Down
Loading