Changes from all commits
Commits
24 commits
7285e07
gepa minimal
parthkotwal Dec 4, 2025
6a5d322
gepa test with shell pytest arguments
parthkotwal Dec 5, 2025
af1f454
optimize_gepa.py runs successfully between BFCL and dspy's GEPA api
parthkotwal Dec 17, 2025
54fdf40
add filler files to utils/
parthkotwal Dec 17, 2025
77744c8
minimal version of GEPA ran
parthkotwal Dec 18, 2025
435a538
tried summarizing behavior
parthkotwal Dec 23, 2025
bb4dd4b
GEPA successfully works on BFCL Agent runs
parthkotwal Dec 24, 2025
396c173
GEPA experiment outputs more logs
parthkotwal Dec 27, 2025
9a8876c
Making GEPA-BFCL experiment more readable. Started with logging_utils
parthkotwal Dec 27, 2025
bb5fabf
Finished logging and scoring utils
parthkotwal Dec 28, 2025
682b5a4
working on BFCLAgent forward
parthkotwal Dec 28, 2025
dd53027
GEPA on BFCL package runs correctly
parthkotwal Dec 29, 2025
84aa5d1
Metric doesn’t include involved classes, excluded functions, or const…
parthkotwal Jan 4, 2026
ba39212
bfcl test cases can now be shuffled and run on an entire subset
parthkotwal Jan 4, 2026
cbe74cb
removed soft score
parthkotwal Jan 4, 2026
a5cf3ca
agent and reflection LMs are separated
parthkotwal Jan 6, 2026
a6d3922
Enhancing how models are separated
parthkotwal Jan 12, 2026
8fe0dda
Merge branch 'main' into parth-branch
parthkotwal Jan 14, 2026
3f0d69a
Specific test cases/range can be specified in args
parthkotwal Jan 13, 2026
757eaa5
Log how each run of a test case is mapped to which instruction
parthkotwal Jan 13, 2026
7b8ad05
RunContext wasn't being preserved across files
parthkotwal Jan 13, 2026
c2b8201
Evaluation output is sent to the model and logged. Ready for final run
parthkotwal Jan 15, 2026
60a1450
Initial scripts for candidate analysis
parthkotwal Jan 17, 2026
a686189
Updated candidate analysis to take in output_dir argument and easy to…
parthkotwal Jan 19, 2026
3 changes: 2 additions & 1 deletion .gitignore
@@ -53,7 +53,6 @@ fastagent.secrets.yaml
 outputs/
 output*/
 results/
-experiments/
 fastagent.jsonl
 test_script_*.py
 .claude/
@@ -64,3 +63,5 @@ site/
 
 # Appworld data
 data/
+
+/utils/
Empty file added experiments/__init__.py
Empty file.
114 changes: 114 additions & 0 deletions experiments/gepa_analysis/candidate_snapshots.py
@@ -0,0 +1,114 @@
from pathlib import Path
import argparse
import json

import pandas as pd


def load_candidate_snapshots(path: Path) -> pd.DataFrame:
rows = []

with path.open() as f:
for line in f:
record = json.loads(line)

eval_info = record.get("latest_eval", {})

rows.append({
"ts": pd.to_datetime(record["ts"], utc=True),
"instruction_hash": record["instruction_hash"],
"instruction_text": record["instruction_text"],
"test_id": eval_info.get("test_id"),
"split": eval_info.get("split"),
"hard_valid": eval_info.get("hard_valid"),
"score": eval_info.get("final"),
})

return pd.DataFrame(rows)


def build_candidate_prompt_table(df: pd.DataFrame) -> pd.DataFrame:
grouped = df.groupby("instruction_hash")

rows = []

for instruction_hash, g in grouped:
instruction_text = g["instruction_text"].iloc[0]

train_scores = g[g["split"] == "train"]["score"]
dev_scores = g[g["split"] == "dev"]["score"]

rows.append({
"instruction_hash": instruction_hash,
"instruction_text": instruction_text,
"first_seen_ts": g["ts"].min(),
"last_seen_ts": g["ts"].max(),
"n_evals": len(g),
"train_pass_rate": train_scores.mean() if not train_scores.empty else None,
"dev_pass_rate": dev_scores.mean() if not dev_scores.empty else None,
"overall_pass_rate": g["score"].mean(),
"instruction_length_chars": len(instruction_text),
"instruction_length_lines": instruction_text.count("\n") + 1,
})

candidate_df = pd.DataFrame(rows)

return candidate_df.sort_values("first_seen_ts").reset_index(drop=True)


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Build candidate prompt table.")
parser.add_argument(
"--output-dir",
type=str,
default="1-14-prefinal",
help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).",
)
return parser.parse_args()

def resolve_run_dir(output_dir_arg: str) -> Path:
arg_path = Path(output_dir_arg)
parts = arg_path.parts
for idx, part in enumerate(parts[:-1]):
if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl":
return arg_path
if part == "outputs" and parts[idx + 1] == "gepa_analysis":
return Path("./outputs/gepa_on_bfcl") / arg_path.name
return Path("./outputs/gepa_on_bfcl") / output_dir_arg


def main():
args = parse_args()
run_dir = resolve_run_dir(args.output_dir)
run_name = run_dir.name
output_dir = Path("./outputs/gepa_analysis") / run_name
output_dir.mkdir(parents=True, exist_ok=True)

snapshots_path = Path(run_dir / "candidate_snapshots.jsonl")

df_raw = load_candidate_snapshots(snapshots_path)
candidate_df = build_candidate_prompt_table(df_raw)

candidate_df.to_csv(output_dir / "candidate_snaps.csv", index=False)

print("\n=== Candidate Prompt Summary ===")
print(f"Total snapshot rows: {len(df_raw)}")
print(f"Unique prompts: {len(candidate_df)}")

print("\nTop prompts by dev pass rate:")
print(
candidate_df
.sort_values("dev_pass_rate", ascending=False)
.head(5)[
[
"instruction_hash",
"n_evals",
"dev_pass_rate",
"instruction_length_lines",
]
]
)


if __name__ == "__main__":
main()
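For reference, a minimal sketch of the record shape that load_candidate_snapshots expects to find in candidate_snapshots.jsonl. The key names mirror the loader above; the concrete values here are hypothetical.

```python
import json

# Hypothetical snapshot record; only the key names come from load_candidate_snapshots above.
example_record = {
    "ts": "2026-01-14T18:02:11Z",            # parsed with pd.to_datetime(..., utc=True)
    "instruction_hash": "ab12cd34",           # identifies one candidate prompt across evals
    "instruction_text": "You are a function-calling agent ...",
    "latest_eval": {
        "test_id": "multi_turn_base_0",       # assumed BFCL-style test id
        "split": "train",                     # "train" or "dev"
        "hard_valid": True,
        "final": 1.0,                         # surfaced as "score" in the DataFrame
    },
}

# candidate_snapshots.jsonl holds one such JSON object per line.
print(json.dumps(example_record))
```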
105 changes: 105 additions & 0 deletions experiments/gepa_analysis/prompt_diff.py
@@ -0,0 +1,105 @@
import difflib
from pathlib import Path
import argparse

import pandas as pd


def unified_prompt_diff(base_text: str, new_text: str) -> str:
base_lines = base_text.splitlines()
new_lines = new_text.splitlines()

diff = difflib.unified_diff(
base_lines,
new_lines,
fromfile="baseline",
tofile="candidate",
lineterm=""
)

return "\n".join(diff)

def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Generate prompt diffs.")
parser.add_argument(
"--output-dir",
type=str,
default="1-14-prefinal",
help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).",
)
return parser.parse_args()

def resolve_analysis_dir(output_dir_arg: str) -> Path:
arg_path = Path(output_dir_arg)
parts = arg_path.parts
for idx, part in enumerate(parts[:-1]):
if part == "outputs" and parts[idx + 1] == "gepa_analysis":
return arg_path
if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl":
return Path("./outputs/gepa_analysis") / arg_path.name
return Path("./outputs/gepa_analysis") / output_dir_arg


def main():
args = parse_args()
output_dir = resolve_analysis_dir(args.output_dir)
df = pd.read_csv(output_dir / "candidate_snaps.csv")

output_md = Path(output_dir / "prompt_diffs.md")

# Baseline = most evaluated prompt
baseline = df.loc[df["n_evals"].idxmax()]

# Best non-baseline by overall pass rate
best_non_baseline = (
df.drop(index=baseline.name)
.sort_values("overall_pass_rate", ascending=False)
.iloc[0]
)

# Longest prompt (verbosity exploration)
longest_prompt = (
df.drop(index=baseline.name)
.sort_values("instruction_length_lines", ascending=False)
.iloc[0]
)

print("Baseline hash:", baseline["instruction_hash"])
print("Best non-baseline hash:", best_non_baseline["instruction_hash"])
print("Longest prompt hash:", longest_prompt["instruction_hash"])

with output_md.open("w") as f:
f.write("# Prompt Difference Analysis\n\n")

def write_section(title, base, other):
f.write(f"## {title}\n\n")
f.write(f"**Baseline hash:** `{base['instruction_hash']}`\n\n")
f.write(f"**Candidate hash:** `{other['instruction_hash']}`\n\n")
f.write(
f"- Overall pass rate: {other['overall_pass_rate']:.3f}\n"
f"- Instruction length (lines): {other['instruction_length_lines']}\n\n"
)

diff_text = unified_prompt_diff(
base["instruction_text"],
other["instruction_text"],
)

f.write("```diff\n")
f.write(diff_text if diff_text else "(No textual differences)\n")
f.write("\n```\n\n")

write_section(
"Baseline vs Best Non-Baseline Prompt",
baseline,
best_non_baseline,
)

write_section(
"Baseline vs Longest Prompt",
baseline,
longest_prompt,
)

if __name__ == "__main__":
main()
123 changes: 123 additions & 0 deletions experiments/gepa_analysis/prompt_timeline.py
@@ -0,0 +1,123 @@
import numpy as np
from pathlib import Path
import argparse

import matplotlib.pyplot as plt
import pandas as pd

def plot_prompt_search_timeline(candidate_df: pd.DataFrame, output_dir: Path):
# Add discovery order
df = candidate_df.copy()
df["discovery_index"] = range(len(df))

# Baseline = most evaluated prompt (more robust than "first seen")
baseline_idx = df["n_evals"].idxmax()
baseline = df.loc[baseline_idx]
others = df.drop(index=baseline_idx)

# Y values: dev pass rate; if NaN, place slightly below 0 to show "no dev eval"
y = df["dev_pass_rate"].copy()
no_dev_mask = y.isna()
y_plot = y.copy()
y_plot[no_dev_mask] = -0.05 # sentinel row for "no dev eval"

fig, ax = plt.subplots(figsize=(10, 6))

# Get colormap normalization based on all instruction lengths
norm = plt.Normalize(
vmin=df["instruction_length_lines"].min(),
vmax=df["instruction_length_lines"].max()
)
cmap = plt.cm.viridis

# Plot all non-baseline prompts
scatter = ax.scatter(
df.loc[df.index != baseline_idx, "discovery_index"],
y_plot.loc[df.index != baseline_idx],
c=df.loc[df.index != baseline_idx, "instruction_length_lines"],
cmap="viridis",
norm=norm,
s=80,
alpha=0.9,
)

# Plot baseline prompt with viridis color
ax.scatter(
baseline["discovery_index"],
(-0.05 if pd.isna(baseline["dev_pass_rate"]) else baseline["dev_pass_rate"]),
marker="*",
s=250,
c=[baseline["instruction_length_lines"]],
cmap="viridis",
norm=norm,
# edgecolor="black",
linewidth=2,
label=f"Baseline (n={int(baseline['n_evals'])})",
zorder=5, # Ensure it's on top
)

# Add trend line for prompts with dev evals
valid_mask = ~no_dev_mask
if valid_mask.sum() > 1:
z = np.polyfit(df.loc[valid_mask, "discovery_index"],
df.loc[valid_mask, "dev_pass_rate"], 1)
p = np.poly1d(z)
ax.plot(df.loc[valid_mask, "discovery_index"],
p(df.loc[valid_mask, "discovery_index"]),
"r--", alpha=0.3, linewidth=1.5, label="Trend")

ax.set_title("GEPA Prompt Exploration (Dev Pass Rate)",
fontsize=13, fontweight='bold')
ax.set_xlabel("Prompt Discovery Order", fontsize=11)
ax.set_ylabel("Dev Pass Rate", fontsize=11)

# Make the "no dev eval" row interpretable
ax.set_ylim(-0.08, 1.05)
ax.axhline(-0.05, linestyle="--", linewidth=1, color='gray', alpha=0.5)
ax.text(
0, -0.048, "no dev eval",
fontsize=9, va="bottom", style='italic', color='gray'
)

# Add grid for easier reading
ax.grid(True, alpha=0.2, linestyle=':')

cbar = plt.colorbar(scatter, ax=ax)
cbar.set_label("Instruction Length (lines)", fontsize=10)

ax.legend(loc="upper right", framealpha=0.9)

plt.tight_layout()
plt.savefig(output_dir / "prompt_search_timeline.png", dpi=150)
plt.close()

def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Plot prompt search timeline.")
parser.add_argument(
"--output-dir",
type=str,
default="1-14-prefinal",
help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).",
)
return parser.parse_args()

def resolve_analysis_dir(output_dir_arg: str) -> Path:
arg_path = Path(output_dir_arg)
parts = arg_path.parts
for idx, part in enumerate(parts[:-1]):
if part == "outputs" and parts[idx + 1] == "gepa_analysis":
return arg_path
if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl":
return Path("./outputs/gepa_analysis") / arg_path.name
return Path("./outputs/gepa_analysis") / output_dir_arg


def main():
args = parse_args()
output_dir = resolve_analysis_dir(args.output_dir)
candidate_df = pd.read_csv(output_dir / "candidate_snaps.csv")
plot_prompt_search_timeline(candidate_df, output_dir)


if __name__ == "__main__":
main()
35 changes: 35 additions & 0 deletions experiments/gepa_analysis/run_all.py
@@ -0,0 +1,35 @@
import argparse
import subprocess
import sys
from pathlib import Path


def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Run all GEPA analysis steps.")
parser.add_argument(
"--output-dir",
type=str,
default="1-14-prefinal",
help="Run directory name or path under outputs/gepa_on_bfcl.",
)
return parser.parse_args()


def run_step(script: str, output_dir: str) -> None:
result = subprocess.run(
[sys.executable, script, "--output-dir", output_dir],
check=False,
)
if result.returncode != 0:
raise SystemExit(result.returncode)


def main() -> None:
args = parse_args()
run_step("experiments/gepa_analysis/candidate_snapshots.py", args.output_dir)
run_step("experiments/gepa_analysis/prompt_diff.py", args.output_dir)
run_step("experiments/gepa_analysis/prompt_timeline.py", args.output_dir)


if __name__ == "__main__":
main()
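A hedged usage sketch: the three analysis steps above can be launched through run_all.py from the repository root, assuming outputs/gepa_on_bfcl/1-14-prefinal exists (that directory name is just the argparse default).

```python
# Hypothetical programmatic invocation, equivalent to running from the repo root:
#   python experiments/gepa_analysis/run_all.py --output-dir 1-14-prefinal
import subprocess
import sys

subprocess.run(
    [sys.executable, "experiments/gepa_analysis/run_all.py",
     "--output-dir", "1-14-prefinal"],
    check=True,  # propagate a non-zero exit code from any analysis step
)
```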
Empty file.