From 7285e07375bca6c786f40948797a5d31ae7adc06 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Wed, 3 Dec 2025 16:35:00 -0800 Subject: [PATCH 01/23] gepa minimal --- experiments/gepa_minimal.py | 79 +++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 experiments/gepa_minimal.py diff --git a/experiments/gepa_minimal.py b/experiments/gepa_minimal.py new file mode 100644 index 0000000..781dc52 --- /dev/null +++ b/experiments/gepa_minimal.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +"""GEPA experiment using BFCL scoring.""" +from pathlib import Path +ROOT = Path(__file__).resolve().parents[1] +INSTR = ROOT / "tests/benchmarks/bfcl/instruction.txt" + +import sys +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +import asyncio +import dspy +from dspy.teleprompt.gepa.gepa import GEPA +from tests.benchmarks.bfcl.test_bfcl import _run_bfcl_test, _validate_from_complete_json + +TEST_IDS = ["multi_turn_base_121", "multi_turn_base_167"] +MODEL = "gpt-5" +TEMP = 0.0 + +# --------------------------------------------------------------------------- +# Safe async wrapper (prevents GEPA worker event-loop explosions) +# --------------------------------------------------------------------------- +def run_async(coro): + try: + asyncio.get_running_loop() + except RuntimeError: + return asyncio.run(coro) # normal case when called from main thread + + # If already inside a running event loop (GEPA worker): create a private loop + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + +# --------------------------------------------------------------------------- +# BFCL score +# --------------------------------------------------------------------------- +async def _run_single(test_id): + out = ROOT / "experiments/min" / test_id + out.mkdir(parents=True, exist_ok=True) + json_path = await _run_bfcl_test(test_id, MODEL, TEMP, out) + return _validate_from_complete_json(test_id, json_path)["validation"]["valid"] + +def bfcl_score(text: str): + INSTR.write_text(text) + async def run_all(): + results = [await _run_single(t) for t in TEST_IDS] + return sum(results) / len(results) + return run_async(run_all()) + +# --------------------------------------------------------------------------- +# GEPA metric + minimal DSPy module +# --------------------------------------------------------------------------- +def metric(gold, pred, *_): + return bfcl_score(pred.instruction) + +class Program(dspy.Module): + def __init__(self, text): + super().__init__() + self.text = text + def forward(self, x=None): + return dspy.Prediction(instruction=self.text) + +# --------------------------------------------------------------------------- +# Main +# --------------------------------------------------------------------------- +if __name__ == "__main__": + base = INSTR.read_text() + dspy.configure(lm=dspy.LM(MODEL)) + + # GEPA requires at least one input field + train = [dspy.Example(x="dummy").with_inputs("x")] + + gepa = GEPA(metric=metric, auto="light", reflection_lm=dspy.LM(MODEL)) + tuned = gepa.compile(student=Program(base), trainset=train, valset=train) + + print("\n=== Optimized Instruction ===\n") + print(tuned.instruction) \ No newline at end of file From 6a5d3225364918aad07744062ea328102105c518 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Fri, 5 Dec 2025 03:48:00 -0800 Subject: [PATCH 02/23] gepa test with shell pytest arguments --- experiments/gepa_minimal.py | 79 -------- experiments/optimize_gepa.py | 374 
+++++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+), 79 deletions(-) delete mode 100644 experiments/gepa_minimal.py create mode 100644 experiments/optimize_gepa.py diff --git a/experiments/gepa_minimal.py b/experiments/gepa_minimal.py deleted file mode 100644 index 781dc52..0000000 --- a/experiments/gepa_minimal.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -"""GEPA experiment using BFCL scoring.""" -from pathlib import Path -ROOT = Path(__file__).resolve().parents[1] -INSTR = ROOT / "tests/benchmarks/bfcl/instruction.txt" - -import sys -if str(ROOT) not in sys.path: - sys.path.insert(0, str(ROOT)) - -import asyncio -import dspy -from dspy.teleprompt.gepa.gepa import GEPA -from tests.benchmarks.bfcl.test_bfcl import _run_bfcl_test, _validate_from_complete_json - -TEST_IDS = ["multi_turn_base_121", "multi_turn_base_167"] -MODEL = "gpt-5" -TEMP = 0.0 - -# --------------------------------------------------------------------------- -# Safe async wrapper (prevents GEPA worker event-loop explosions) -# --------------------------------------------------------------------------- -def run_async(coro): - try: - asyncio.get_running_loop() - except RuntimeError: - return asyncio.run(coro) # normal case when called from main thread - - # If already inside a running event loop (GEPA worker): create a private loop - loop = asyncio.new_event_loop() - try: - return loop.run_until_complete(coro) - finally: - loop.close() - -# --------------------------------------------------------------------------- -# BFCL score -# --------------------------------------------------------------------------- -async def _run_single(test_id): - out = ROOT / "experiments/min" / test_id - out.mkdir(parents=True, exist_ok=True) - json_path = await _run_bfcl_test(test_id, MODEL, TEMP, out) - return _validate_from_complete_json(test_id, json_path)["validation"]["valid"] - -def bfcl_score(text: str): - INSTR.write_text(text) - async def run_all(): - results = [await _run_single(t) for t in TEST_IDS] - return sum(results) / len(results) - return run_async(run_all()) - -# --------------------------------------------------------------------------- -# GEPA metric + minimal DSPy module -# --------------------------------------------------------------------------- -def metric(gold, pred, *_): - return bfcl_score(pred.instruction) - -class Program(dspy.Module): - def __init__(self, text): - super().__init__() - self.text = text - def forward(self, x=None): - return dspy.Prediction(instruction=self.text) - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- -if __name__ == "__main__": - base = INSTR.read_text() - dspy.configure(lm=dspy.LM(MODEL)) - - # GEPA requires at least one input field - train = [dspy.Example(x="dummy").with_inputs("x")] - - gepa = GEPA(metric=metric, auto="light", reflection_lm=dspy.LM(MODEL)) - tuned = gepa.compile(student=Program(base), trainset=train, valset=train) - - print("\n=== Optimized Instruction ===\n") - print(tuned.instruction) \ No newline at end of file diff --git a/experiments/optimize_gepa.py b/experiments/optimize_gepa.py new file mode 100644 index 0000000..2ae35b0 --- /dev/null +++ b/experiments/optimize_gepa.py @@ -0,0 +1,374 @@ +"""Simple GEPA-based instruction optimization for BFCL tests. 
+ +Usage: + python experiments/optimize_gepa.py --test-subset multi_turn_base --num-tests --gepa-scoring-mode +""" + +import argparse +import json +import subprocess +from pathlib import Path +from typing import Any, Optional + +import dspy +from dspy.evaluate import Evaluate +from dspy.teleprompt import GEPA + +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from tests.benchmarks.bfcl import loader as bfcl_loader +from tests.utils.fastagent_helpers import MessageSerializer + + +def _stringify_question(question: Any) -> str: + """Normalize BFCL question payloads into text.""" + if isinstance(question, list) and question: + first = question[0] + if isinstance(first, str): + return first + if isinstance(first, dict): + return str(first.get("content", "")) + if isinstance(question, dict): + return str(question.get("content", "")) + if isinstance(question, str): + return question + return "" + + +class BFCLExample(dspy.Example): + """BFCL test case as a DSPy example.""" + + def __init__( + self, + test_id: str | None = None, + question: str | None = None, + expected_tools: list[str] | None = None, + *, + base: dspy.Example | None = None, + **kwargs: Any, + ): + if base is not None: + super().__init__(base=base, **kwargs) + else: + super().__init__(test_id=test_id, question=question, expected_tools=expected_tools or [], **kwargs) + + +class MetricFeedback(dspy.Prediction): + """Prediction wrapper carrying both scalar score and textual feedback.""" + + def __init__(self, score: float, feedback: str) -> None: + super().__init__(score=score, feedback=feedback) + + +class BFCLAgent(dspy.Module): + """Run BFCL tests with mutable instructions managed by GEPA.""" + + def __init__( + self, + instruction_text: str, + model: str, + base_dir: Path, + pytest_binary: str, + enable_scoring_mode: bool, + ) -> None: + super().__init__() + self.model = model + self.base_dir = base_dir + self.base_dir.mkdir(parents=True, exist_ok=True) + self.pytest_binary = pytest_binary + self.enable_scoring_mode = enable_scoring_mode + self._instruction_path = self.base_dir / "current_instruction.txt" + + instruction_signature = dspy.Signature("prompt_input -> prompt_output", instructions=instruction_text) + self.prompt_predictor = dspy.Predict(instruction_signature) + + def forward(self, test_id: str, question: str) -> dspy.Prediction: + """Run a single BFCL test and return the score plus tool usage info.""" + self._instruction_path.parent.mkdir(parents=True, exist_ok=True) + instruction_text = self._instruction_text() + self._instruction_path.write_text(instruction_text, encoding="utf-8") + + output_dir = self.base_dir / "runs" / test_id + output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + self.pytest_binary, + f"tests/benchmarks/bfcl/test_bfcl.py::test_bfcl[{test_id}]", + "--model", + self.model, + "--instruction-file", + str(self._instruction_path), + "--output-dir", + str(output_dir), + "-q", + "-x", + ] + + if self.enable_scoring_mode: + cmd.append("--gepa-scoring-mode") + + result = subprocess.run(cmd, capture_output=True, text=True) + passed = result.returncode == 0 + + tools_used = self._collect_tool_names(output_dir, test_id) + + return dspy.Prediction( + test_id=test_id, + passed=passed, + tools_used=tools_used, + output=result.stdout + result.stderr, + ) + + def _instruction_text(self) -> str: + instructions = getattr(self.prompt_predictor.signature, "instructions", "") + if isinstance(instructions, (list, tuple)): + return "\n".join(str(part) for part in instructions if part) + 
return str(instructions or "") + + def get_instruction_text(self) -> str: + return self._instruction_text() + + @staticmethod + def _collect_tool_names(output_dir: Path, test_id: str) -> list[str]: + complete_file = output_dir / "raw" / f"{test_id}_complete.json" + if not complete_file.exists(): + return [] + + try: + with open(complete_file, encoding="utf-8") as handle: + complete_data = json.load(handle) + except json.JSONDecodeError: + return [] + + tool_calls = MessageSerializer.extract_tool_calls_by_turn(complete_data) + names: list[str] = [] + for turn in tool_calls: + for call in turn: + function = call.get("function") + if function: + names.append(function) + return names + + +def bfcl_metric_with_feedback( + gold: dspy.Example, + pred: dspy.Prediction, + trace: Optional[Any] = None, + pred_name: Optional[str] = None, + pred_trace: Optional[Any] = None, +) -> dict[str, Any]: + """Metric that provides feedback to GEPA about test failures.""" + + score = 1.0 if pred.passed else 0.0 + + # Build feedback based on what went wrong + feedback_parts = [] + + if not pred.passed: + feedback_parts.append(f"Test {gold.test_id} FAILED") + + # Check if expected tools were used + expected = set(gold.expected_tools) + used = set(pred.tools_used) + + if expected and used: + missing = expected - used + extra = used - expected + + if missing: + feedback_parts.append(f"Missing expected tools: {', '.join(missing)}") + if extra: + feedback_parts.append(f"Used unexpected tools: {', '.join(extra)}") + elif expected and not used: + feedback_parts.append(f"No tools were called, but expected: {', '.join(expected)}") + + # Add snippet of error output if available + if pred.output: + error_lines = [line for line in pred.output.split('\n') if 'error' in line.lower() or 'failed' in line.lower()] + if error_lines: + feedback_parts.append(f"Error output: {error_lines[0][:200]}") + else: + feedback_parts.append(f"Test {gold.test_id} PASSED") + + feedback = " | ".join(feedback_parts) + + return MetricFeedback(score=score, feedback=feedback) + + +def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: + """Load BFCL entries using the shared loader utilities.""" + + test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) + examples: list[BFCLExample] = [] + + for test_id in test_ids[:limit]: + try: + entry = bfcl_loader.load_test_entry(test_id) + except Exception as exc: # pragma: no cover - diagnostics only + print(f"Warning: unable to load {test_id}: {exc}") + continue + + question = _stringify_question(entry.get("question", "")) + expected_tools = entry.get("involved_classes", []) or [] + example = BFCLExample(test_id=test_id, question=question, expected_tools=expected_tools) + examples.append(example.with_inputs("test_id", "question")) + + return examples[:limit] + + +def run_baseline(agent: BFCLAgent, examples: list[BFCLExample]) -> float: + """Run baseline evaluation.""" + print(f"Running baseline with {len(examples)} tests...") + + passed = 0 + for example in examples: + pred = agent(test_id=example.test_id, question=example.question) + if pred.passed: + passed += 1 + + score = passed / len(examples) if examples else 0.0 + print(f"Baseline pass rate: {score:.2%} ({passed}/{len(examples)})") + return score + + +def main(): + parser = argparse.ArgumentParser(description="Optimize BFCL instructions using GEPA") + parser.add_argument("--test-subset", default="multi_turn_base", + help="Test category to use (e.g., multi_turn_base)") + parser.add_argument("--num-tests", type=int, default=10, + 
help="Number of tests to use for optimization") + parser.add_argument("--model", default="gpt-5", + help="Model to use for test evaluation") + parser.add_argument("--reflection-model", default="gpt-5", + help="Model to use for GEPA reflection") + parser.add_argument("--max-evaluations", type=int, default=20, + help="Maximum number of GEPA metric calls") + parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa"), + help="Output directory") + parser.add_argument("--auto", choices=['light', 'medium', 'heavy'], default='light', + help="GEPA auto-tuning mode") + parser.add_argument("--instruction-file", type=Path, default=Path("tests/benchmarks/bfcl/instruction.txt"), + help="Path to the seed BFCL instruction file") + parser.add_argument("--pytest-binary", default="pytest", + help="Pytest binary to invoke (default: pytest on PATH)") + parser.add_argument("--gepa-scoring-mode", action="store_true", + help="Enable BFCL scoring-only logging during runs") + + args = parser.parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + print("=" * 60) + print("GEPA Instruction Optimization for BFCL") + print("=" * 60) + + # Load test cases + examples = load_test_cases(args.test_subset, args.num_tests) + if not examples: + print(f"Error: No tests found for subset '{args.test_subset}'") + return + + print(f"\nLoaded {len(examples)} test cases from {args.test_subset}") + + # Load original instructions + instruction_file = args.instruction_file + if not instruction_file.exists(): + print(f"Error: Instruction file not found: {instruction_file}") + return + + original_instructions = instruction_file.read_text() + print(f"Original instructions: {len(original_instructions)} chars") + + # Create agent with original instructions + agent = BFCLAgent( + instruction_text=original_instructions, + model=args.model, + base_dir=args.output_dir, + pytest_binary=args.pytest_binary, + enable_scoring_mode=args.gepa_scoring_mode, + ) + + # Run baseline + baseline_score = run_baseline(agent, examples) + + # Setup DSPy with reflection LM + reflection_lm = dspy.LM(args.reflection_model) + dspy.configure(lm=reflection_lm) + + print("\n" + "=" * 60) + print("Starting GEPA optimization...") + print("=" * 60) + print(f"Max evaluations: {args.max_evaluations}") + print(f"Auto-tuning mode: {args.auto}") + print(f"Reflection model: {args.reflection_model}") + + # Create GEPA optimizer + gepa = GEPA( + metric=bfcl_metric_with_feedback, + auto=args.auto, + reflection_lm=reflection_lm, + reflection_minibatch_size=3, + log_dir=str(args.output_dir / "gepa_logs"), + track_stats=True, + seed=42 + ) + + # Split into train/dev + train_size = int(len(examples) * 0.7) + trainset = examples[:train_size] + devset = examples[train_size:] + + print(f"Train set: {len(trainset)} tests") + print(f"Dev set: {len(devset)} tests") + + # Optimize + optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) + + # Evaluate optimized version + print("\n" + "=" * 60) + print("Evaluating optimized instructions...") + print("=" * 60) + + evaluate = Evaluate( + devset=devset, + metric=bfcl_metric_with_feedback, + display_progress=True, + display_table=False + ) + + final_result = evaluate(optimized_agent) + final_score = float(final_result.score) + + optimized_instruction_path = args.output_dir / "optimized_instructions.txt" + optimized_instruction_path.write_text(optimized_agent.get_instruction_text(), encoding="utf-8") + + metadata = { + "baseline_score": baseline_score, + "final_score": final_score, + "test_subset": 
args.test_subset, + "num_tests": len(examples), + "train_size": len(trainset), + "dev_size": len(devset), + "model": args.model, + "reflection_model": args.reflection_model, + "max_evaluations": args.max_evaluations, + "test_ids": [ex.test_id for ex in examples], + "optimized_instruction_path": str(optimized_instruction_path), + } + + metadata_file = args.output_dir / "optimization_metadata.json" + metadata_file.write_text(json.dumps(metadata, indent=2)) + + print("\n" + "=" * 60) + print("Optimization Complete!") + print("=" * 60) + print(f"Baseline score: {baseline_score:.2%}") + print(f"Final score: {final_score:.2%}") + print(f"Improvement: {(final_score - baseline_score):.2%}") + print(f"\nMetadata saved to: {metadata_file}") + print(f"GEPA logs saved to: {args.output_dir / 'gepa_logs'}") + print("\nCheck the GEPA logs for optimized prompts and detailed traces.") + + +if __name__ == "__main__": + main() From af1f45449f95278f95cb25eb68f7cc447600f01f Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 16 Dec 2025 17:25:31 -0800 Subject: [PATCH 03/23] optimize_gepa.py runs successfully between BFCL and dspy's GEPA api --- experiments/optimize_gepa.py | 328 +++++++++++------------------ tests/benchmarks/bfcl/test_bfcl.py | 30 ++- tests/conftest.py | 40 ++++ utils/GEPA_desc.txt | 262 +++++++++++++++++++++++ utils/appworld_new.txt | 68 ++++++ utils/gepa_outputs_desc.txt | 117 ++++++++++ utils/instruction_new.txt | 55 +++++ utils/json2md.py | 167 +++++++++++++++ utils/scripts/__init__.py | 0 utils/scripts/compare_bfcl.py | 179 ++++++++++++++++ utils/tree.txt | 68 ++++++ 11 files changed, 1109 insertions(+), 205 deletions(-) create mode 100644 utils/GEPA_desc.txt create mode 100644 utils/appworld_new.txt create mode 100644 utils/gepa_outputs_desc.txt create mode 100644 utils/instruction_new.txt create mode 100644 utils/json2md.py create mode 100644 utils/scripts/__init__.py create mode 100644 utils/scripts/compare_bfcl.py create mode 100644 utils/tree.txt diff --git a/experiments/optimize_gepa.py b/experiments/optimize_gepa.py index 2ae35b0..3d0383f 100644 --- a/experiments/optimize_gepa.py +++ b/experiments/optimize_gepa.py @@ -1,12 +1,17 @@ +# NOTE: +# This script performs instruction-only optimization using GEPA over BFCL tests. +# The BFCL agent is invoked via pytest. + """Simple GEPA-based instruction optimization for BFCL tests. 
Usage: - python experiments/optimize_gepa.py --test-subset multi_turn_base --num-tests --gepa-scoring-mode + python experiments/optimize_gepa.py --test-subset multi_turn_base --num-tests """ import argparse import json import subprocess +import hashlib from pathlib import Path from typing import Any, Optional @@ -21,8 +26,15 @@ from tests.utils.fastagent_helpers import MessageSerializer +# ------------------------- +# Utilities +# ------------------------- + +def sha256_text(text: str) -> str: + return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest() + + def _stringify_question(question: Any) -> str: - """Normalize BFCL question payloads into text.""" if isinstance(question, list) and question: first = question[0] if isinstance(first, str): @@ -36,9 +48,11 @@ def _stringify_question(question: Any) -> str: return "" -class BFCLExample(dspy.Example): - """BFCL test case as a DSPy example.""" +# ------------------------- +# DSPy wrappers +# ------------------------- +class BFCLExample(dspy.Example): def __init__( self, test_id: str | None = None, @@ -55,15 +69,11 @@ def __init__( class MetricFeedback(dspy.Prediction): - """Prediction wrapper carrying both scalar score and textual feedback.""" - def __init__(self, score: float, feedback: str) -> None: super().__init__(score=score, feedback=feedback) class BFCLAgent(dspy.Module): - """Run BFCL tests with mutable instructions managed by GEPA.""" - def __init__( self, instruction_text: str, @@ -84,9 +94,7 @@ def __init__( self.prompt_predictor = dspy.Predict(instruction_signature) def forward(self, test_id: str, question: str) -> dspy.Prediction: - """Run a single BFCL test and return the score plus tool usage info.""" - self._instruction_path.parent.mkdir(parents=True, exist_ok=True) - instruction_text = self._instruction_text() + instruction_text = self.get_instruction_text() self._instruction_path.write_text(instruction_text, encoding="utf-8") output_dir = self.base_dir / "runs" / test_id @@ -110,7 +118,6 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: result = subprocess.run(cmd, capture_output=True, text=True) passed = result.returncode == 0 - tools_used = self._collect_tool_names(output_dir, test_id) return dspy.Prediction( @@ -120,36 +127,28 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: output=result.stdout + result.stderr, ) - def _instruction_text(self) -> str: + def get_instruction_text(self) -> str: instructions = getattr(self.prompt_predictor.signature, "instructions", "") if isinstance(instructions, (list, tuple)): - return "\n".join(str(part) for part in instructions if part) + return "\n".join(str(p) for p in instructions if p) return str(instructions or "") - def get_instruction_text(self) -> str: - return self._instruction_text() - @staticmethod def _collect_tool_names(output_dir: Path, test_id: str) -> list[str]: complete_file = output_dir / "raw" / f"{test_id}_complete.json" if not complete_file.exists(): return [] - try: - with open(complete_file, encoding="utf-8") as handle: - complete_data = json.load(handle) + data = json.loads(complete_file.read_text()) except json.JSONDecodeError: return [] + calls = MessageSerializer.extract_tool_calls_by_turn(data) + return [call.get("function") for turn in calls for call in turn if call.get("function")] - tool_calls = MessageSerializer.extract_tool_calls_by_turn(complete_data) - names: list[str] = [] - for turn in tool_calls: - for call in turn: - function = call.get("function") - if function: - names.append(function) - return 
names +# ------------------------- +# Metric +# ------------------------- def bfcl_metric_with_feedback( gold: dspy.Example, @@ -157,217 +156,146 @@ def bfcl_metric_with_feedback( trace: Optional[Any] = None, pred_name: Optional[str] = None, pred_trace: Optional[Any] = None, -) -> dict[str, Any]: - """Metric that provides feedback to GEPA about test failures.""" - +) -> MetricFeedback: score = 1.0 if pred.passed else 0.0 - - # Build feedback based on what went wrong - feedback_parts = [] - + feedback = [f"Test {gold.test_id} {'PASSED' if pred.passed else 'FAILED'}"] + if not pred.passed: - feedback_parts.append(f"Test {gold.test_id} FAILED") - - # Check if expected tools were used expected = set(gold.expected_tools) used = set(pred.tools_used) - - if expected and used: + if expected and not used: + feedback.append(f"No tools called; expected: {', '.join(expected)}") + else: missing = expected - used extra = used - expected - if missing: - feedback_parts.append(f"Missing expected tools: {', '.join(missing)}") + feedback.append(f"Missing tools: {', '.join(missing)}") if extra: - feedback_parts.append(f"Used unexpected tools: {', '.join(extra)}") - elif expected and not used: - feedback_parts.append(f"No tools were called, but expected: {', '.join(expected)}") - - # Add snippet of error output if available - if pred.output: - error_lines = [line for line in pred.output.split('\n') if 'error' in line.lower() or 'failed' in line.lower()] - if error_lines: - feedback_parts.append(f"Error output: {error_lines[0][:200]}") - else: - feedback_parts.append(f"Test {gold.test_id} PASSED") - - feedback = " | ".join(feedback_parts) - - return MetricFeedback(score=score, feedback=feedback) + feedback.append(f"Unexpected tools: {', '.join(extra)}") + return MetricFeedback(score=score, feedback=" | ".join(feedback)) -def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: - """Load BFCL entries using the shared loader utilities.""" +# ------------------------- +# Data loading +# ------------------------- + +def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) examples: list[BFCLExample] = [] - for test_id in test_ids[:limit]: - try: - entry = bfcl_loader.load_test_entry(test_id) - except Exception as exc: # pragma: no cover - diagnostics only - print(f"Warning: unable to load {test_id}: {exc}") - continue - + entry = bfcl_loader.load_test_entry(test_id) question = _stringify_question(entry.get("question", "")) expected_tools = entry.get("involved_classes", []) or [] - example = BFCLExample(test_id=test_id, question=question, expected_tools=expected_tools) - examples.append(example.with_inputs("test_id", "question")) - - return examples[:limit] + ex = BFCLExample(test_id=test_id, question=question, expected_tools=expected_tools) + examples.append(ex.with_inputs("test_id", "question")) + return examples -def run_baseline(agent: BFCLAgent, examples: list[BFCLExample]) -> float: - """Run baseline evaluation.""" - print(f"Running baseline with {len(examples)} tests...") - - passed = 0 - for example in examples: - pred = agent(test_id=example.test_id, question=example.question) - if pred.passed: - passed += 1 - - score = passed / len(examples) if examples else 0.0 - print(f"Baseline pass rate: {score:.2%} ({passed}/{len(examples)})") - return score - +# ------------------------- +# Main +# ------------------------- def main(): - parser = argparse.ArgumentParser(description="Optimize BFCL instructions using GEPA") - 
parser.add_argument("--test-subset", default="multi_turn_base", - help="Test category to use (e.g., multi_turn_base)") - parser.add_argument("--num-tests", type=int, default=10, - help="Number of tests to use for optimization") - parser.add_argument("--model", default="gpt-5", - help="Model to use for test evaluation") - parser.add_argument("--reflection-model", default="gpt-5", - help="Model to use for GEPA reflection") - parser.add_argument("--max-evaluations", type=int, default=20, - help="Maximum number of GEPA metric calls") - parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa"), - help="Output directory") - parser.add_argument("--auto", choices=['light', 'medium', 'heavy'], default='light', - help="GEPA auto-tuning mode") - parser.add_argument("--instruction-file", type=Path, default=Path("tests/benchmarks/bfcl/instruction.txt"), - help="Path to the seed BFCL instruction file") - parser.add_argument("--pytest-binary", default="pytest", - help="Pytest binary to invoke (default: pytest on PATH)") - parser.add_argument("--gepa-scoring-mode", action="store_true", - help="Enable BFCL scoring-only logging during runs") - + parser = argparse.ArgumentParser() + parser.add_argument("--test-subset", default="multi_turn_base") + parser.add_argument("--num-tests", type=int, default=10) + parser.add_argument("--model", default="gpt-5") + parser.add_argument("--reflection-model", default="gpt-5") + parser.add_argument("--max-evaluations", type=int, default=20) + parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa")) + parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) + parser.add_argument("--instruction-file", type=Path, required=True) + parser.add_argument("--pytest-binary", default="pytest") + parser.add_argument("--gepa-scoring-mode", action="store_true") args = parser.parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) - - print("=" * 60) - print("GEPA Instruction Optimization for BFCL") - print("=" * 60) - - # Load test cases + examples = load_test_cases(args.test_subset, args.num_tests) - if not examples: - print(f"Error: No tests found for subset '{args.test_subset}'") - return - - print(f"\nLoaded {len(examples)} test cases from {args.test_subset}") - - # Load original instructions - instruction_file = args.instruction_file - if not instruction_file.exists(): - print(f"Error: Instruction file not found: {instruction_file}") - return - - original_instructions = instruction_file.read_text() - print(f"Original instructions: {len(original_instructions)} chars") - - # Create agent with original instructions + train_size = int(0.7 * len(examples)) + trainset, devset = examples[:train_size], examples[train_size:] + + instruction_text = args.instruction_file.read_text() + instruction_hash = sha256_text(instruction_text) + agent = BFCLAgent( - instruction_text=original_instructions, + instruction_text=instruction_text, model=args.model, base_dir=args.output_dir, pytest_binary=args.pytest_binary, enable_scoring_mode=args.gepa_scoring_mode, ) - - # Run baseline - baseline_score = run_baseline(agent, examples) - - # Setup DSPy with reflection LM + + # Baseline + passed = sum(agent(test_id=e.test_id, question=e.question).passed for e in examples) + baseline_score = passed / len(examples) + (args.output_dir / "baseline.json").write_text(json.dumps({ + "instruction_hash": instruction_hash, + "pass_rate": baseline_score, + "passed": passed, + "total": len(examples), + "test_ids": [e.test_id for e in examples], + 
"model": args.model, + }, indent=2)) + + # GEPA reflection_lm = dspy.LM(args.reflection_model) dspy.configure(lm=reflection_lm) - - print("\n" + "=" * 60) - print("Starting GEPA optimization...") - print("=" * 60) - print(f"Max evaluations: {args.max_evaluations}") - print(f"Auto-tuning mode: {args.auto}") - print(f"Reflection model: {args.reflection_model}") - - # Create GEPA optimizer - gepa = GEPA( + + gepa_kwargs = dict( metric=bfcl_metric_with_feedback, - auto=args.auto, reflection_lm=reflection_lm, - reflection_minibatch_size=3, - log_dir=str(args.output_dir / "gepa_logs"), track_stats=True, - seed=42 + log_dir=str(args.output_dir / "gepa_logs"), + seed=42, ) - # Split into train/dev - train_size = int(len(examples) * 0.7) - trainset = examples[:train_size] - devset = examples[train_size:] - - print(f"Train set: {len(trainset)} tests") - print(f"Dev set: {len(devset)} tests") - - # Optimize + if args.auto is not None: + gepa_kwargs["auto"] = args.auto + else: + gepa_kwargs["max_full_evals"] = args.max_evaluations + + gepa = GEPA(**gepa_kwargs) optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) - - # Evaluate optimized version - print("\n" + "=" * 60) - print("Evaluating optimized instructions...") - print("=" * 60) - - evaluate = Evaluate( - devset=devset, - metric=bfcl_metric_with_feedback, - display_progress=True, - display_table=False - ) - - final_result = evaluate(optimized_agent) - final_score = float(final_result.score) - - optimized_instruction_path = args.output_dir / "optimized_instructions.txt" - optimized_instruction_path.write_text(optimized_agent.get_instruction_text(), encoding="utf-8") - - metadata = { + results = optimized_agent.detailed_results + + # Dump candidates + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() + candidates.append({ + "candidate_id": i, + "instruction_hash": sha256_text(instr), + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + "discovered_at_metric_call": results.discovery_eval_counts[i], + "parents": results.parents[i], + }) + (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2)) + + # Pareto (simple: max score per val instance) + best_ids = set().union(*results.per_val_instance_best_candidates) + with open(args.output_dir / "gepa_pareto.txt", "w", encoding="utf-8") as f: + f.write("GEPA Pareto Frontier\n====================\n\n") + for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): + f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") + f.write("-" * 40 + "\n") + f.write(results.candidates[i].get_instruction_text() + "\n\n") + + # Final instruction + final_instr = optimized_agent.get_instruction_text() + (args.output_dir / "optimized_instructions.txt").write_text(final_instr) + + # Metadata + meta = { "baseline_score": baseline_score, - "final_score": final_score, - "test_subset": args.test_subset, - "num_tests": len(examples), - "train_size": len(trainset), - "dev_size": len(devset), - "model": args.model, - "reflection_model": args.reflection_model, - "max_evaluations": args.max_evaluations, - "test_ids": [ex.test_id for ex in examples], - "optimized_instruction_path": str(optimized_instruction_path), + "final_score": max(results.val_aggregate_scores), + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, } - - metadata_file = args.output_dir / "optimization_metadata.json" - 
metadata_file.write_text(json.dumps(metadata, indent=2)) - - print("\n" + "=" * 60) - print("Optimization Complete!") - print("=" * 60) - print(f"Baseline score: {baseline_score:.2%}") - print(f"Final score: {final_score:.2%}") - print(f"Improvement: {(final_score - baseline_score):.2%}") - print(f"\nMetadata saved to: {metadata_file}") - print(f"GEPA logs saved to: {args.output_dir / 'gepa_logs'}") - print("\nCheck the GEPA logs for optimized prompts and detailed traces.") + (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2)) if __name__ == "__main__": diff --git a/tests/benchmarks/bfcl/test_bfcl.py b/tests/benchmarks/bfcl/test_bfcl.py index 0083977..72c7440 100644 --- a/tests/benchmarks/bfcl/test_bfcl.py +++ b/tests/benchmarks/bfcl/test_bfcl.py @@ -10,6 +10,7 @@ from tests.benchmarks.bfcl import evaluator, loader from tests.benchmarks.bfcl.elicitation import create_elicitation_handler +from tests.conftest import instruction_file from tests.utils.fastagent_helpers import MessageSerializer from tests.utils.logger import StructuredEventLogger @@ -25,14 +26,25 @@ def _parse_question(question: Any) -> str: return "" -async def _run_bfcl_test(test_id: str, model: str, temperature: float, output_dir: Path) -> Path: +async def _run_bfcl_test( + test_id: str, + model: str, + temperature: float, + output_dir: Path, + instruction_file: Path | None, +) -> Path: """Run BFCL test and return path to complete.json.""" from fast_agent import FastAgent test_case = loader.load_test_entry(test_id) ground_truth = loader.load_ground_truth(test_id) - instruction_path = Path(__file__).parent / "instruction.txt" + default_instruction = Path(__file__).parent / "instruction.txt" + instruction_path = instruction_file if instruction_file is not None else default_instruction + print(f"Using INSTRUCTION file: {instruction_path}") + if not instruction_path.exists(): + raise FileNotFoundError(f"Instruction file not found: {instruction_path}") + structured_log_path = output_dir / "raw" / f"{test_id}_structured.jsonl" structured_log_path.parent.mkdir(parents=True, exist_ok=True) @@ -134,11 +146,19 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: @pytest.mark.asyncio async def test_bfcl( - test_id: str, model: str, temperature: float, output_dir: Path, request: pytest.FixtureRequest + test_id: str, + model: str, + temperature: float, + output_dir: Path, + instruction_file: Path | None, + request: pytest.FixtureRequest, ) -> None: """Run or validate a BFCL test based on mode.""" - if not request.config.getoption("--validate-only"): - await _run_bfcl_test(test_id, model, temperature, output_dir) + if request.config.getoption("--validate-only"): + log_dir = Path(request.config.getoption("--log-dir")) + else: + await _run_bfcl_test(test_id, model, temperature, output_dir, instruction_file) + log_dir = output_dir / "raw" log_dir = output_dir / "raw" complete_path = log_dir / f"{test_id}_complete.json" diff --git a/tests/conftest.py b/tests/conftest.py index 8341edd..66f613e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,12 +36,52 @@ def output_dir(request: pytest.FixtureRequest) -> Path: return path +@pytest.fixture +def instruction_file(request: pytest.FixtureRequest) -> Path | None: + """Optional path to replacement instruction file.""" + value = request.config.getoption("--instruction-file") + return Path(value) if value else None + + +@pytest.fixture +def instruction_override(request: pytest.FixtureRequest) -> str | None: + """Inline instructions overriding 
file-based prompts.""" + value = request.config.getoption("--instruction-override") + return value if value else None + + +@pytest.fixture +def gepa_dir(request: pytest.FixtureRequest) -> Path | None: + """Directory for GEPA experiment artifacts.""" + value = request.config.getoption("--gepa-dir") + return Path(value) if value else None + + +@pytest.fixture +def gepa_log_dir(request: pytest.FixtureRequest) -> Path | None: + """Directory for GEPA-specific logs.""" + value = request.config.getoption("--gepa-log-dir") + return Path(value) if value else None + + +@pytest.fixture +def gepa_scoring_mode(request: pytest.FixtureRequest) -> bool: + """Flag controlling GEPA scoring-only mode.""" + return bool(request.config.getoption("--gepa-scoring-mode")) + + def pytest_addoption(parser: pytest.Parser) -> None: """Add custom CLI options.""" parser.addoption("--model", default="gpt-4o-mini", help="Model to use") parser.addoption("--temperature", default=0.001, type=float, help="Temperature for LLM (default: 0.001)") parser.addoption("--output-dir", default="outputs", help="Output directory for results") parser.addoption("--validate-only", action="store_true", help="Only validate existing logs") + parser.addoption("--log-dir", default="outputs/raw", help="Directory with logs (for validate mode)") + parser.addoption("--instruction-file", default=None, help="Path to replacement instruction file") + parser.addoption("--instruction-override", default=None, help="Literal replacement instructions") + parser.addoption("--gepa-dir", default=None, help="Directory for GEPA experiment data") + parser.addoption("--gepa-log-dir", default=None, help="Directory for GEPA logs") + parser.addoption("--gepa-scoring-mode", action="store_true", help="Enable GEPA scoring-only mode") def pytest_configure(config: pytest.Config) -> None: diff --git a/utils/GEPA_desc.txt b/utils/GEPA_desc.txt new file mode 100644 index 0000000..6e485ce --- /dev/null +++ b/utils/GEPA_desc.txt @@ -0,0 +1,262 @@ +dspy.GEPA: Reflective Prompt Optimizer¶ + +GEPA (Genetic-Pareto) is a reflective optimizer proposed in "GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning" (Agrawal et al., 2025, arxiv:2507.19457), that adaptively evolves textual components (such as prompts) of arbitrary systems. In addition to scalar scores returned by metrics, users can also provide GEPA with a text feedback to guide the optimization process. Such textual feedback provides GEPA more visibility into why the system got the score that it did, and then GEPA can introspect to identify how to improve the score. This allows GEPA to propose high performing prompts in very few rollouts. 
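For instance, such a feedback metric might look like the following minimal sketch (the gold.answer / pred.answer field names and the string-equality check are illustrative assumptions for a simple QA-style task, not part of the dspy API; the full dspy.GEPA signature and the metric protocol are documented below):

import dspy

def metric_with_feedback(gold, pred, trace=None, pred_name=None, pred_trace=None):
    # Any task-specific check can produce the scalar score; string equality is just a stand-in.
    score = 1.0 if pred.answer == gold.answer else 0.0
    if score == 1.0:
        feedback = f"Correct: the answer matched the expected value '{gold.answer}'."
    else:
        feedback = f"Expected '{gold.answer}' but got '{pred.answer}'; revise the instruction to avoid this failure."
    # GEPA accepts a dspy.Prediction carrying both the scalar score and the textual feedback.
    return dspy.Prediction(score=score, feedback=feedback)
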
+ + dspy.GEPA(metric: GEPAFeedbackMetric, *, auto: Literal['light', 'medium', 'heavy'] | None = None, max_full_evals: int | None = None, max_metric_calls: int | None = None, reflection_minibatch_size: int = 3, candidate_selection_strategy: Literal['pareto', 'current_best'] = 'pareto', reflection_lm: LM | None = None, skip_perfect_score: bool = True, add_format_failure_as_feedback: bool = False, instruction_proposer: ProposalFn | None = None, component_selector: ReflectionComponentSelector | str = 'round_robin', use_merge: bool = True, max_merge_invocations: int | None = 5, num_threads: int | None = None, failure_score: float = 0.0, perfect_score: float = 1.0, log_dir: str | None = None, track_stats: bool = False, use_wandb: bool = False, wandb_api_key: str | None = None, wandb_init_kwargs: dict[str, Any] | None = None, track_best_outputs: bool = False, warn_on_score_mismatch: bool = True, enable_tool_optimization: bool = False, use_mlflow: bool = False, seed: int | None = 0, gepa_kwargs: dict | None = None) ¶ + +Bases: Teleprompter + +GEPA is an evolutionary optimizer, which uses reflection to evolve text components of complex systems. GEPA is proposed in the paper GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning. The GEPA optimization engine is provided by the gepa package, available from https://github.com/gepa-ai/gepa. + +GEPA captures full traces of the DSPy module's execution, identifies the parts of the trace corresponding to a specific predictor, and reflects on the behaviour of the predictor to propose a new instruction for the predictor. GEPA allows users to provide textual feedback to the optimizer, which is used to guide the evolution of the predictor. The textual feedback can be provided at the granularity of individual predictors, or at the level of the entire system's execution. + +To provide feedback to the GEPA optimizer, implement a metric as follows: + + +def metric( + gold: Example, + pred: Prediction, + trace: Optional[DSPyTrace] = None, + pred_name: Optional[str] = None, + pred_trace: Optional[DSPyTrace] = None, +) -> float | ScoreWithFeedback: + """ + This function is called with the following arguments: + - gold: The gold example. + - pred: The predicted output. + - trace: Optional. The trace of the program's execution. + - pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which + the feedback is being requested. + - pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for. + + Note the `pred_name` and `pred_trace` arguments. During optimization, GEPA will call the metric to obtain + feedback for individual predictors being optimized. GEPA provides the name of the predictor in `pred_name` + and the sub-trace (of the trace) corresponding to the predictor in `pred_trace`. + If available at the predictor level, the metric should return {'score': float, 'feedback': str} corresponding + to the predictor. + If not available at the predictor level, the metric can also return a text feedback at the program level + (using just the gold, pred and trace). + If no feedback is returned, GEPA will use a simple text feedback consisting of just the score: + f"This trajectory got a score of {score}." + """ + ... +GEPA can also be used as a batch inference-time search strategy, by passing valset=trainset, track_stats=True, track_best_outputs=True, and using the detailed_results attribute of the optimized program (returned by compile) to get the Pareto frontier of the batch. 
optimized_program.detailed_results.best_outputs_valset will contain the best outputs for each task in the batch. + +Example: + + +gepa = GEPA(metric=metric, track_stats=True) +batch_of_tasks = [dspy.Example(...) for task in tasks] +new_prog = gepa.compile(student, trainset=trainset, valset=batch_of_tasks) +pareto_frontier = new_prog.detailed_results.val_aggregate_scores +# pareto_frontier is a list of scores, one for each task in the batch. +Parameters: + +Name Type Description Default +metric GEPAFeedbackMetric The metric function to use for feedback and evaluation. required +auto Literal['light', 'medium', 'heavy'] | None The auto budget to use for the run. Options: "light", "medium", "heavy". None +max_full_evals int | None The maximum number of full evaluations to perform. None +max_metric_calls int | None The maximum number of metric calls to perform. None +reflection_minibatch_size int The number of examples to use for reflection in a single GEPA step. Default is 3. 3 +candidate_selection_strategy Literal['pareto', 'current_best'] The strategy to use for candidate selection. Default is "pareto", which stochastically selects candidates from the Pareto frontier of all validation scores. Options: "pareto", "current_best". 'pareto' +reflection_lm LM | None The language model to use for reflection. Required parameter. GEPA benefits from a strong reflection model. Consider using dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000) for optimal performance. None +skip_perfect_score bool Whether to skip examples with perfect scores during reflection. Default is True. True +instruction_proposer ProposalFn | None Optional custom instruction proposer implementing GEPA's ProposalFn protocol. Default: None (recommended for most users) - Uses GEPA's proven instruction proposer from the GEPA library, which implements the ProposalFn. This default proposer is highly capable and was validated across diverse experiments reported in the GEPA paper and tutorials. +See documentation on custom instruction proposers here. + +Advanced Feature: Only needed for specialized scenarios: - Multi-modal handling: Processing dspy.Image inputs alongside textual information - Nuanced control over constraints: Fine-grained control over instruction length, format, and structural requirements beyond standard feedback mechanisms - Domain-specific knowledge injection: Specialized terminology or context that cannot be provided through feedback_func alone - Provider-specific prompting: Optimizations for specific LLM providers (OpenAI, Anthropic) with unique formatting preferences - Coupled component updates: Coordinated updates of multiple components together rather than independent optimization - External knowledge integration: Runtime access to databases, APIs, or knowledge bases + +The default proposer handles the vast majority of use cases effectively. Use MultiModalInstructionProposer() from dspy.teleprompt.gepa.instruction_proposal for visual content or implement custom ProposalFn for highly specialized requirements. + +Note: When both instruction_proposer and reflection_lm are set, the instruction_proposer is called in the reflection_lm context. However, reflection_lm is optional when using a custom instruction_proposer. Custom instruction proposers can invoke their own LLMs if needed. + +None +component_selector ReflectionComponentSelector | str Custom component selector implementing the ReflectionComponentSelector protocol, or a string specifying a built-in selector strategy. 
Controls which components (predictors) are selected for optimization at each iteration. Defaults to 'round_robin' strategy which cycles through components one at a time. Available string options: 'round_robin' (cycles through components sequentially), 'all' (selects all components for simultaneous optimization). Custom selectors can implement strategies using LLM-driven selection logic based on optimization state and trajectories. See gepa component selectors for available built-in selectors and the ReflectionComponentSelector protocol for implementing custom selectors. 'round_robin' +add_format_failure_as_feedback bool Whether to add format failures as feedback. Default is False. False +use_merge bool Whether to use merge-based optimization. Default is True. True +max_merge_invocations int | None The maximum number of merge invocations to perform. Default is 5. 5 +num_threads int | None The number of threads to use for evaluation with Evaluate. Optional. None +failure_score float The score to assign to failed examples. Default is 0.0. 0.0 +perfect_score float The maximum score achievable by the metric. Default is 1.0. Used by GEPA to determine if all examples in a minibatch are perfect. 1.0 +log_dir str | None The directory to save the logs. GEPA saves elaborate logs, along with all candidate programs, in this directory. Running GEPA with the same log_dir will resume the run from the last checkpoint. None +track_stats bool Whether to return detailed results and all proposed programs in the detailed_results attribute of the optimized program. Default is False. False +use_wandb bool Whether to use wandb for logging. Default is False. False +wandb_api_key str | None The API key to use for wandb. If not provided, wandb will use the API key from the environment variable WANDB_API_KEY. None +wandb_init_kwargs dict[str, Any] | None Additional keyword arguments to pass to wandb.init. None +track_best_outputs bool Whether to track the best outputs on the validation set. track_stats must be True if track_best_outputs is True. The optimized program's detailed_results.best_outputs_valset will contain the best outputs for each task in the validation set. False +warn_on_score_mismatch bool GEPA (currently) expects the metric to return the same module-level score when called with and without the pred_name. This flag (defaults to True) determines whether a warning is raised if a mismatch in module-level and predictor-level score is detected. True +enable_tool_optimization bool Whether to enable joint optimization of dspy.ReAct modules. When enabled, GEPA jointly optimizes predictor instructions and tool descriptions together for dspy.ReAct modules. See the Tool Optimization guide for details on when to use this feature and how it works. Default is False. False +seed int | None The random seed to use for reproducibility. Default is 0. 0 +gepa_kwargs dict | None (Optional) Additional keyword arguments to pass directly to gepa.optimize. Useful for accessing advanced GEPA features not directly exposed through DSPy's GEPA interface. +Available parameters: - batch_sampler: Strategy for selecting training examples. Can be a BatchSampler instance or a string ('epoch_shuffled'). Defaults to 'epoch_shuffled'. Only valid when reflection_minibatch_size is None. - merge_val_overlap_floor: Minimum number of shared validation ids required between parents before attempting a merge subsample. Only relevant when using val_evaluation_policy other than 'full_eval'. Default is 5. 
- stop_callbacks: Optional stopper(s) that return True when optimization should stop. Can be a single StopperProtocol or a list of StopperProtocol instances. Examples: FileStopper, TimeoutStopCondition, SignalStopper, NoImprovementStopper, or custom stopping logic. Note: This overrides the default max_metric_calls stopping condition. - use_cloudpickle: Use cloudpickle instead of pickle for serialization. Can be helpful when the serialized state contains dynamically generated DSPy signatures. Default is False. - val_evaluation_policy: Strategy controlling which validation ids to score each iteration. Can be 'full_eval' (evaluate every id each time) or an EvaluationPolicy instance. Default is 'full_eval'. - use_mlflow: If True, enables MLflow integration to log optimization progress. MLflow can be used alongside Weights & Biases (WandB). - mlflow_tracking_uri: The tracking URI to use for MLflow (when use_mlflow=True). - mlflow_experiment_name: The experiment name to use for MLflow (when use_mlflow=True). + +Note: Parameters already handled by DSPy's GEPA class will be overridden by the direct parameters and should not be passed through gepa_kwargs. + +None +Note +Budget Configuration: Exactly one of auto, max_full_evals, or max_metric_calls must be provided. The auto parameter provides preset configurations: "light" for quick experimentation, "medium" for balanced optimization, and "heavy" for thorough optimization. + +Reflection Configuration: The reflection_lm parameter is required and should be a strong language model. GEPA performs best with models like dspy.LM(model='gpt-5', temperature=1.0, max_tokens=32000). The reflection process analyzes failed examples to generate feedback for program improvement. + +Merge Configuration: GEPA can merge successful program variants using use_merge=True. The max_merge_invocations parameter controls how many merge attempts are made during optimization. + +Evaluation Configuration: Use num_threads to parallelize evaluation. The failure_score and perfect_score parameters help GEPA understand your metric's range and optimize accordingly. + +Logging Configuration: Set log_dir to save detailed logs and enable checkpoint resuming. Use track_stats=True to access detailed optimization results via the detailed_results attribute. Enable use_wandb=True for experiment tracking and visualization. + +Reproducibility: Set seed to ensure consistent results across runs with the same configuration. + +Source code in dspy/teleprompt/gepa/gepa.py +Functions¶ + + auto_budget(num_preds, num_candidates, valset_size: int, minibatch_size: int = 35, full_eval_steps: int = 5) -> int ¶ + +Source code in dspy/teleprompt/gepa/gepa.py + compile(student: Module, *, trainset: list[Example], teacher: Module | None = None, valset: list[Example] | None = None) -> Module ¶ + +GEPA uses the trainset to perform reflective updates to the prompt, but uses the valset for tracking Pareto scores. If no valset is provided, GEPA will use the trainset for both. + +Parameters: - student: The student module to optimize. - trainset: The training set to use for reflective updates. - valset: The validation set to use for tracking Pareto scores. If not provided, GEPA will use the trainset for both. + +Source code in dspy/teleprompt/gepa/gepa.py + get_params() -> dict[str, Any] ¶ + +Get the parameters of the teleprompter. + +Returns: + +Type Description +dict[str, Any] The parameters of the teleprompter. 
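A short usage sketch of the compile call described above, continuing the metric_with_feedback sketch from earlier (the student module and the train_pairs / val_pairs data are placeholders for whatever your program actually uses, not names from the dspy API):

trainset = [dspy.Example(question=q, answer=a).with_inputs("question") for q, a in train_pairs]
valset = [dspy.Example(question=q, answer=a).with_inputs("question") for q, a in val_pairs]

gepa = dspy.GEPA(
    metric=metric_with_feedback,  # returns dspy.Prediction(score=..., feedback=...)
    max_metric_calls=200,         # exactly one of auto / max_full_evals / max_metric_calls must be set
    reflection_lm=dspy.LM(model="gpt-5", temperature=1.0, max_tokens=32000),
    track_stats=True,
)
optimized = gepa.compile(student, trainset=trainset, valset=valset)
print(optimized.detailed_results.val_aggregate_scores)  # available because track_stats=True
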
+Source code in dspy/teleprompt/teleprompt.py +::: + +One of the key insights behind GEPA is its ability to leverage domain-specific textual feedback. Users should provide a feedback function as the GEPA metric, which has the following call signature: + + dspy.teleprompt.gepa.gepa.GEPAFeedbackMetric ¶ + +Bases: Protocol + +Functions¶ + + __call__(gold: Example, pred: Prediction, trace: Optional[DSPyTrace], pred_name: str | None, pred_trace: Optional[DSPyTrace]) -> Union[float, ScoreWithFeedback] ¶ + +This function is called with the following arguments: - gold: The gold example. - pred: The predicted output. - trace: Optional. The trace of the program's execution. - pred_name: Optional. The name of the target predictor currently being optimized by GEPA, for which the feedback is being requested. - pred_trace: Optional. The trace of the target predictor's execution GEPA is seeking feedback for. + +Note the pred_name and pred_trace arguments. During optimization, GEPA will call the metric to obtain feedback for individual predictors being optimized. GEPA provides the name of the predictor in pred_name and the sub-trace (of the trace) corresponding to the predictor in pred_trace. If available at the predictor level, the metric should return dspy.Prediction(score: float, feedback: str) corresponding to the predictor. If not available at the predictor level, the metric can also return a text feedback at the program level (using just the gold, pred and trace). If no feedback is returned, GEPA will use a simple text feedback consisting of just the score: f"This trajectory got a score of {score}." + +Source code in dspy/teleprompt/gepa/gepa.py +::: + +When track_stats=True, GEPA returns detailed results about all of the proposed candidates, and metadata about the optimization run. The results are available in the detailed_results attribute of the optimized program returned by GEPA, and has the following type: + + dspy.teleprompt.gepa.gepa.DspyGEPAResult(candidates: list[Module], parents: list[list[int | None]], val_aggregate_scores: list[float], val_subscores: list[list[float]], per_val_instance_best_candidates: list[set[int]], discovery_eval_counts: list[int], best_outputs_valset: list[list[tuple[int, list[Prediction]]]] | None = None, total_metric_calls: int | None = None, num_full_val_evals: int | None = None, log_dir: str | None = None, seed: int | None = None) dataclass ¶ + +Additional data related to the GEPA run. 
+ +Fields: - candidates: list of proposed candidates (component_name -> component_text) - parents: lineage info; for each candidate i, parents[i] is a list of parent indices or None - val_aggregate_scores: per-candidate aggregate score on the validation set (higher is better) - val_subscores: per-candidate per-instance scores on the validation set (len == num_val_instances) - per_val_instance_best_candidates: for each val instance t, a set of candidate indices achieving the best score on t - discovery_eval_counts: Budget (number of metric calls / rollouts) consumed up to the discovery of each candidate + +total_metric_calls: total number of metric calls made across the run +num_full_val_evals: number of full validation evaluations performed +log_dir: where artifacts were written (if any) +seed: RNG seed for reproducibility (if known) + +best_idx: candidate index with the highest val_aggregate_scores + +best_candidate: the program text mapping for best_idx +Attributes¶ + + candidates: list[Module] instance-attribute ¶ + + parents: list[list[int | None]] instance-attribute ¶ + + val_aggregate_scores: list[float] instance-attribute ¶ + + val_subscores: list[list[float]] instance-attribute ¶ + + per_val_instance_best_candidates: list[set[int]] instance-attribute ¶ + + discovery_eval_counts: list[int] instance-attribute ¶ + + best_outputs_valset: list[list[tuple[int, list[Prediction]]]] | None = None class-attribute instance-attribute ¶ + + total_metric_calls: int | None = None class-attribute instance-attribute ¶ + + num_full_val_evals: int | None = None class-attribute instance-attribute ¶ + + log_dir: str | None = None class-attribute instance-attribute ¶ + + seed: int | None = None class-attribute instance-attribute ¶ + + best_idx: int property ¶ + + best_candidate: dict[str, str] property ¶ + + highest_score_achieved_per_val_task: list[float] property ¶ + +Functions¶ + + to_dict() -> dict[str, Any] ¶ + +Source code in dspy/teleprompt/gepa/gepa.py + from_gepa_result(gepa_result: GEPAResult, adapter: DspyAdapter) -> DspyGEPAResult staticmethod ¶ + +Source code in dspy/teleprompt/gepa/gepa.py +::: + +Usage Examples¶ + +See GEPA usage tutorials in GEPA Tutorials. + +Inference-Time Search¶ + +GEPA can act as a test-time/inference search mechanism. By setting your valset to your evaluation batch and using track_best_outputs=True, GEPA produces for each batch element the highest-scoring outputs found during the evolutionary search. + + +gepa = dspy.GEPA(metric=metric, track_stats=True, ...) +new_prog = gepa.compile(student, trainset=my_tasks, valset=my_tasks) +highest_score_achieved_per_task = new_prog.detailed_results.highest_score_achieved_per_val_task +best_outputs = new_prog.detailed_results.best_outputs_valset +How Does GEPA Work?¶ + +1. Reflective Prompt Mutation¶ + +GEPA uses LLMs to reflect on structured execution traces (inputs, outputs, failures, feedback), targeting a chosen module and proposing a new instruction/program text tailored to real observed failures and rich textual/environmental feedback. + +2. Rich Textual Feedback as Optimization Signal¶ + +GEPA can leverage any textual feedback available—not just scalar rewards. This includes evaluation logs, code traces, failed parses, constraint violations, error message strings, or even isolated submodule-specific feedback. This allows actionable, domain-aware optimization. + +3. 
3. Pareto-based Candidate Selection¶

Rather than evolving just the best global candidate (which leads to local optima or stagnation), GEPA maintains a Pareto frontier: the set of candidates which achieve the highest score on at least one evaluation instance. In each iteration, the next candidate to mutate is sampled (with probability proportional to coverage) from this frontier, guaranteeing both exploration and robust retention of complementary strategies.

Algorithm Summary¶

1. Initialize the candidate pool with the unoptimized program.
2. Iterate:
   - Sample a candidate (from the Pareto frontier).
   - Sample a minibatch from the train set.
   - Collect execution traces + feedbacks for module rollout on the minibatch.
   - Select a module of the candidate for targeted improvement.
   - LLM Reflection: Propose a new instruction/prompt for the targeted module using reflective meta-prompting and the gathered feedback.
   - Roll out the new candidate on the minibatch; if improved, evaluate on the Pareto validation set.
   - Update the candidate pool/Pareto frontier.
   - [Optionally] System-aware merge/crossover: Combine best-performing modules from distinct lineages.
3. Continue until the rollout or metric budget is exhausted.
4. Return the candidate with the best aggregate performance on validation.

Implementing Feedback Metrics¶

A well-designed metric is central to GEPA's sample efficiency and learning signal richness. GEPA expects the metric to return a dspy.Prediction(score=..., feedback=...). GEPA leverages natural language traces from LLM-based workflows for optimization, preserving intermediate trajectories and errors in plain text rather than reducing them to numerical rewards. This mirrors human diagnostic processes, enabling clearer identification of system behaviors and bottlenecks.

Practical Recipe for GEPA-Friendly Feedback:

- Leverage Existing Artifacts: Use logs, unit tests, evaluation scripts, and profiler outputs; surfacing these often suffices.
- Decompose Outcomes: Break scores into per-objective components (e.g., correctness, latency, cost, safety) and attribute errors to steps.
- Expose Trajectories: Label pipeline stages, reporting pass/fail with salient errors (e.g., in code generation pipelines).
- Ground in Checks: Employ automatic validators (unit tests, schemas, simulators) or LLM-as-a-judge for non-verifiable tasks (as in PUPA).
- Prioritize Clarity: Focus on error coverage and decision points over technical complexity.

Examples¶

- Document Retrieval (e.g., HotpotQA): List correctly retrieved, incorrect, or missed documents, beyond mere Recall/F1 scores.
- Multi-Objective Tasks (e.g., PUPA): Decompose aggregate scores to reveal contributions from each objective, highlighting tradeoffs (e.g., quality vs. privacy).
- Stacked Pipelines (e.g., code generation: parse → compile → run → profile → evaluate): Expose stage-specific failures; natural-language traces often suffice for LLM self-correction. \ No newline at end of file diff --git a/utils/appworld_new.txt b/utils/appworld_new.txt new file mode 100644 index 0000000..c609eea --- /dev/null +++ b/utils/appworld_new.txt @@ -0,0 +1,68 @@ +I am your supervisor, and you are an AI Assistant whose job is to complete my day-to-day tasks fully autonomously. +---------------------------------------------------------------------------- + +My name is: {{ main_user.first_name }} {{ main_user.last_name }}. My personal email is {{ main_user.email }} and phone number is {{ main_user.phone_number }}.
+ +You will be given a task instruction and a list of functions in the standard format. The functions correspond to APIs from various apps you have access to. The function name has three parts: the server name "appworld", the app name, and the API name, all separated by "__" (double underscore). For example, appworld__spotify__login is the login API for the Spotify app. + +You will complete the task completely autonomously through multi-turn interaction with the execution environment. In each turn, you will make one or more function calls, and the environment will return its outputs. This will continue until you call the appworld__supervisor__complete_task API. + +Here are brief app-wise descriptions. + +{app_descriptions} + +# Key Instructions: + +A. General instructions: + +- Act fully on your own. You must make all decisions yourself and never ask me or anyone else to confirm or clarify. Your role is to solve the task, not to bounce questions back, or provide me directions to follow. +- You have full access -- complete permission to operate across my connected accounts and services. +- Never invent or guess values. For example, if I ask you to play a song, do not assume the ID is 123. Instead, look it up properly through the right API. +- Never leave placeholders; don't output things like "your_username". Always fill in the real value by retrieving it via APIs (e.g., Supervisor app for credentials). +- When I omit details, choose any valid value. For example, if I ask you to buy something but don't specify which payment card to use, you may pick any one of my available cards. +- Avoid collateral damage. Only perform what I explicitly ask for. Example: if I ask you to buy something, do not delete emails, return the order, or perform unrelated account operations. +- Avoid unnecessary requests. + +B. App-specific instructions: + +- All my personal information (biographical details, credentials, addresses, cards) is stored in the Supervisor app, accessible via its APIs. +- Any reference to my friends, family or any other person or relation refers to the people in my phone's contacts list. +- To obtain the current date or time, get it from the phone app, never from your internal clock. +- All requests are concerning a single, default (no) time zone. +- For temporal requests, use proper time boundaries, e.g., when asked about periods like "yesterday", use complete ranges: 00:00:00 to 23:59:59. +- References to "file system" mean the file system app, not the machine's OS. Do not use OS modules or functions. +- Paginated APIs: Always process all results, looping through the page_index. Don't stop at the first page. + +# Additional AppWorld guardrails + +Universal rules (apply to every app/API): +- Always fetch real credentials/tokens from Supervisor, log in to each app before protected calls, and reuse the returned access_token instead of guessing IDs or passwords. +- Derive every resource ID from list/search responses (iterate page_index until a page returns fewer results than the limit), and only send documented parameters—never invent arguments or extra fields. +- Preserve user-provided wording exactly (emails, posts, notes, payment memos, etc.), and for file operations always `pwd` then `ls`/`find` before `cd`, `mv`, or `rm`. + +App-specific micro-instructions: +- Supervisor: Use its APIs to obtain usernames, passwords, contact info, and default payment data before acting in any other app. 
+- File System: Navigate one directory at a time with `cd`, confirm location with `pwd`/`ls`, and operate only on files/directories you've discovered (use `find` when unsure). +- Gmail: Login first, then list or search threads/drafts to capture IDs before replying, forwarding, or deleting; when composing/editing mail, include only the requested recipients/attachments and keep subject/body formatting identical to the task. +- Todoist: Retrieve projects/tasks to get IDs before updates/completions, respect required fields like `content`, `due` ISO timestamps, and follow create → update → close ordering. +- Spotify: Obtain an access token and active device via playback/state APIs, search to get track/playlist IDs before queue or playback edits, and pause/clear queue only after confirming the current player state. +- Splitwise: Login, list groups/friends to fetch participant IDs, ensure expense `splits` add up to the total, and only settle/delete expenses whose IDs you just retrieved. +- Amazon: Follow the workflow search → add_to_cart → checkout, pulling ASIN/item IDs and shipping/payment options from list APIs; do not fabricate order notes or modify user-specified quantities/prices. +- Phone: Use the phone app for current time/date, fetch contacts/call logs to obtain IDs before calls or texts, and send message bodies exactly as provided—no extra punctuation or emojis. +- Venmo: Authenticate, look up recipients via contacts/search, send payments with positive amounts and the exact note requested, and confirm transaction IDs from the response before reporting success. +- Simple Note: List notes to capture `note_id` before update/delete, keep note content formatting verbatim unless explicitly told to change it, and avoid duplicate titles by checking existing notes first. + +C. Task-completion instructions: + +You must call the `appworld__supervisor__complete_task` API after completing the task. +- If an answer is needed, e.g., for "How many songs are in the Spotify queue?", call it with the appropriate answer argument value. +- If no answer is required, e.g., for "Start my Spotify music player.", omit the answer argument (or set it to None/null). +- The task is doable, but if you cannot find a way, you can call it with status="fail" to exit with failure. + +When the answer is given: +- Keep answers minimal. Return only the entity, number, or direct value requested - not full sentences. + E.g., for the song title of the current playing track, return just the title. +- Numbers must be numeric and not in words. + E.g., for the number of songs in the queue, return "10", not "ten". + +Next, I will show you some worked-out examples as a tutorial before we proceed with the real task instruction. diff --git a/utils/gepa_outputs_desc.txt b/utils/gepa_outputs_desc.txt new file mode 100644 index 0000000..8534659 --- /dev/null +++ b/utils/gepa_outputs_desc.txt @@ -0,0 +1,117 @@ +📄 File-by-File Specification + +1️⃣ baseline.json +Purpose: Explicit baseline record, separate from optimized results. +{ + "instruction_hash": "sha256:abcd...", + "pass_rate": 0.42, + "passed": 21, + "total": 50, + "test_ids": ["bfcl_001", "bfcl_002", "..."], + "model": "gpt-5" +} +Why: +Makes “baseline vs optimized” trivially inspectable +Prevents ambiguity if instructions don’t change + +2️⃣ gepa_candidates.json +Purpose: Full candidate history — this is the most important artifact. +One entry per candidate index, matching detailed_results. 
+[ + { + "candidate_id": 0, + "instruction_hash": "sha256:aaaa...", + "instruction_text": "...", + "val_score": 0.38, + "discovered_at_metric_call": 0, + "parents": null + }, + { + "candidate_id": 1, + "instruction_hash": "sha256:bbbb...", + "instruction_text": "...", + "val_score": 0.44, + "discovered_at_metric_call": 12, + "parents": [0] + } +] +Mapping: +candidate_id → index in detailed_results.candidates +val_score → val_aggregate_scores[i] +parents → parents[i] +discovered_at_metric_call → discovery_eval_counts[i] +Why: +Shows exploration +Shows convergence +Allows later analysis without rerunning GEPA + +3️⃣ gepa_pareto.txt +Purpose: Human-readable frontier summary (reviewer bait). +Example: +GEPA Pareto Frontier (Validation Set) +==================================== + +Candidate 3 | score=0.52 | discovered_at=31 +-------------------------------------------- + + +Candidate 7 | score=0.51 | discovered_at=44 +-------------------------------------------- + +Construction: +Include all candidates that are Pareto-optimal +Sorted by score descending +Plain text, no JSON +Why: +Lets a human actually read what GEPA found +Zero tooling required + +4️⃣ gepa_iterations.jsonl +Purpose: Iteration-level traceability without over-logging. +One JSON object per GEPA iteration, append-only. +{"iteration": 0, "instruction_hash": "sha256:aaaa...", "val_score": 0.38, "evaluated_test_ids": ["bfcl_001", "bfcl_004"], "metric_calls_so_far": 5} +{"iteration": 1, "instruction_hash": "sha256:bbbb...", "val_score": 0.44, "evaluated_test_ids": ["bfcl_002", "bfcl_003"], "metric_calls_so_far": 11} +Why: +Distinguishes “did nothing” vs “explored” +Enables simple plots later +JSONL avoids schema lock-in + +5️⃣ reflection_traces/iter_XXX.txt +Purpose: Raw reflection text (minimal but defensible). +Each file contains: +ITERATION 3 +Candidate: 7 +Score: 0.51 + +=== REFLECTION PROMPT === +... + +=== REFLECTION OUTPUT === + +Source: +Whatever GEPA emits during reflection +No parsing +No summarization +Why: +Satisfies “uses model traces” +Auditable +No DSPy internals exposed + +------------------------------ + +outputs/gepa/ +└── / # already exists (args.output_dir) + ├── baseline.json + ├── optimized_instructions.txt + ├── optimization_metadata.json + │ + ├── gepa_candidates.json + ├── gepa_pareto.txt + ├── gepa_iterations.jsonl + │ + ├── reflection_traces/ + │ ├── iter_000.txt + │ ├── iter_001.txt + │ └── ... + │ + └── gepa_logs/ # GEPA’s native log_dir (unchanged) \ No newline at end of file diff --git a/utils/instruction_new.txt b/utils/instruction_new.txt new file mode 100644 index 0000000..b7a0ba4 --- /dev/null +++ b/utils/instruction_new.txt @@ -0,0 +1,55 @@ +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out. +If the given question lacks the parameters required by the function, also point it out. + +You should only return the function calls in your response. You SHOULD NOT include any other text in the response. + +At each turn, you should try your best to complete the tasks requested by the user within the current turn. +Continue to output functions to call until you have fulfilled the user's request to the best of your ability. +Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task. 
+ +{{serverInstructions}} + +Universal BFCL Rules: +- Always check the relevant `*_get_login_status` (or authentication status) and log in/authenticate before calling any stateful tool; never reuse or invent tokens, IDs, or usernames—fetch them using the provided lookup tools first. +- Execute workflows in schema order: gather context (list/search/get) → perform the requested action → confirm via the API, and only supply parameters that exist in the JSON schema (no extra fields, no formatting changes to user-provided text or constraints). + +Twitter API: +- If `posting_get_login_status` is false, authenticate with `authenticate_twitter` before any post/follow/comment, and never fabricate tweet IDs—retrieve them via `get_tweet`, `search_tweets`, or `get_user_tweets`. +- `post_tweet` requires `content` plus optional `tags` (each starting with `#`) and `mentions` (each starting with `@`); only send those arrays when the user asks for them and keep the wording exactly as instructed. +- For retweets/comments/mentions, fetch the target tweet first to copy the real `tweet_id`, and do not add unrequested fields or reorder the user’s constraints. + +Ticket API: +- Use `ticket_get_login_status`/`ticket_login` before any ticket operation, and call `get_ticket` (or `get_user_tickets`) to obtain real IDs before editing, resolving, or closing. +- When using `edit_ticket`, include only the fields the user wants changed inside the `updates` dict; maintain the priority range (1–5) and never change status/resolution unless explicitly asked. +- Resolving or closing requires an existing ticket—gather details, apply updates, and confirm via `resolve_ticket`/`close_ticket` instead of skipping prerequisite steps. + +Travel Booking API: +- Always call `authenticate_travel` first to obtain a fresh `access_token`, then reuse that token (not a hallucinated one) for every protected call; if you need a `card_id`, fetch or register it before booking. +- Ensure airport codes and traveler data are real: use `list_all_airports`/`get_nearest_airport_by_city` and `verify_traveler_information` as needed, and keep `travel_from`/`travel_to` as 3-letter IATA codes. +- Follow the payment chain: check balances (`get_credit_card_balance`/`set_budget_limit`), book (`book_flight`), then reference the returned `booking_id` for insurance, invoices, or cancellations without inventing IDs. + +Message API: +- Check `message_get_login_status` and call `message_login` with the provided `user_id` before sending/deleting messages. +- Convert usernames to IDs via `get_user_id` (or `list_users`) before `send_message`/`delete_message`; never assume IDs or create contacts unless the user requests it. +- Remember `delete_message` only removes the latest message for a receiver—confirm the target receiver first and avoid altering unrelated threads. + +Math API: +- Use the exact math tool that matches the user’s request instead of manual computation, and supply every required parameter (`numbers`, `precision`, units, etc.) with the correct type. +- Keep units explicit for conversion tools (e.g., `imperial_si_conversion`, `si_unit_conversion`) and avoid mixing optional arguments or adding unsupported keys. + +Gorilla File System: +- Begin every file operation flow with `pwd` and `ls`, and only reference files/directories that appear in those listings; commands like `cat`, `rm`, `mv`, `cp`, `grep`, etc., must use names relative to the current directory with no paths. 
+- Change directories strictly one level at a time using `cd`, documenting each move, and undo navigation explicitly—never assume the working directory without verifying. +- When creating/modifying files, avoid extra flags or side effects: use `touch`/`echo`/`mkdir` exactly as required and confirm results via the appropriate read/list commands. + +Vehicle Control API: +- Inspect the current state via `displayCarStatus` (or other read tools) before issuing control commands so you don’t contradict the existing mode (e.g., check door locks, brake status, headlights). +- Respect every parameter constraint: cruise control speeds must be multiples of 5 between 0–120, `lockDoors` `door` entries must be from the allowed enum, and temperature units should match the schema. +- Sequence safety actions explicitly—engage/release brakes, lock/unlock doors, and start/stop the engine using the provided functions in the logical order rather than combining steps. + +Trading Bot API: +- Authenticate (`trading_get_login_status` + `trading_login`) before any trading action and fetch `get_account_info` to confirm balance/card bindings before placing, funding, or withdrawing. +- Derive stock identifiers from the API (`get_symbol_by_name`, `get_stock_info`, `get_available_stocks`) before trading, and only submit orders/watchlist updates for symbols you fetched—do not invent symbols or order IDs. +- For every order workflow: gather order IDs via `get_order_history`/`place_order`, reference those IDs for `get_order_details` or `cancel_order`, and ensure funds/amount constraints are satisfied before placing the trade. diff --git a/utils/json2md.py b/utils/json2md.py new file mode 100644 index 0000000..ad06232 --- /dev/null +++ b/utils/json2md.py @@ -0,0 +1,167 @@ +import json +import sys +from typing import Dict, List, Any + + +def format_code_block(content: str, language: str = "") -> str: + """Format content as a markdown code block.""" + return f"```{language}\n{content}\n```" + + +def format_tool_call(tool_name: str, arguments: Dict[str, Any]) -> str: + """Format a tool call as Python code.""" + args_str = ", ".join(f"{k}={repr(v)}" for k, v in arguments.items()) + return f"{tool_name}({args_str})" + + +def format_tool_result(result_content: List[Dict]) -> str: + """Format tool result content.""" + if not result_content: + return "" + + # Extract text from result + text_parts = [] + for item in result_content: + if item.get("type") == "text": + text_parts.append(item.get("text", "")) + + combined_text = "\n".join(text_parts) + + # Try to parse as JSON for pretty formatting + try: + parsed = json.loads(combined_text) + return format_code_block(json.dumps(parsed, indent=2), "json") + except (json.JSONDecodeError, ValueError): + return combined_text + + +def format_assistant_message(message: Dict) -> str: + """Format an assistant message with tool calls and content.""" + output = [] + + # Add tool calls if present + if message.get("tool_calls"): + output.append("**Model Output:**") + for call_id, call_data in message["tool_calls"].items(): + tool_name = call_data.get("name", "") + arguments = call_data.get("arguments", {}) + output.append(format_code_block(format_tool_call(tool_name, arguments), "python")) + + # Add text content if present + if message.get("content"): + for item in message["content"]: + if item.get("type") == "text": + text = item.get("text", "") + if text.strip(): + if not message.get("tool_calls"): + output.append("**Model Output:**") + output.append("") + output.append(f"_{text}_" if "No tool 
calls" in text else text) + else: + # Format as blockquote for responses after tool calls + lines = text.strip().split("\n") + output.append("") + for line in lines: + output.append(f"> {line}" if line else ">") + + return "\n".join(output) + + +def convert_json_to_markdown(data: Dict) -> str: + """Convert JSON conversation data to Markdown format.""" + lines = [] + messages = data.get("messages", []) + + # Group messages into turns (user -> assistant -> tool_results -> assistant) + turn_number = 0 + i = 0 + + while i < len(messages): + msg = messages[i] + + if msg["role"] == "user" and msg.get("content"): + # Start of a new turn with user content + lines.append(f"## Turn {turn_number}") + lines.append("") + + # User message + user_text = "" + for item in msg["content"]: + if item.get("type") == "text": + user_text = item.get("text", "") + break + + lines.append(f"**User:** {user_text}") + lines.append("") + + # Look ahead for expected tool calls (if this is a validation document) + # This would need to be added from external validation data + + # Get assistant response + if i + 1 < len(messages) and messages[i + 1]["role"] == "assistant": + assistant_msg = messages[i + 1] + + # Add tool calls + if assistant_msg.get("tool_calls"): + lines.append(format_assistant_message(assistant_msg)) + + # Get tool results + if i + 2 < len(messages) and messages[i + 2].get("tool_results"): + tool_results_msg = messages[i + 2] + for call_id, result in tool_results_msg["tool_results"].items(): + if result.get("content"): + lines.append(format_tool_result(result["content"])) + + # Get final assistant response with text + if i + 3 < len(messages) and messages[i + 3]["role"] == "assistant": + final_msg = messages[i + 3] + if final_msg.get("content"): + for item in final_msg["content"]: + if item.get("type") == "text": + text = item.get("text", "").strip() + if text: + lines.append("") + for line in text.split("\n"): + lines.append(f"> {line}" if line else ">") + i += 3 + else: + i += 2 + else: + i += 1 + else: + # No tool calls, just text response + lines.append(format_assistant_message(assistant_msg)) + i += 1 + + lines.append("") + turn_number += 1 + + i += 1 + + return "\n".join(lines) + + +def main(): + if len(sys.argv) < 2: + print("Usage: python script.py [output_md_file]") + sys.exit(1) + + input_file = sys.argv[1] + output_file = sys.argv[2] if len(sys.argv) > 2 else input_file.replace(".json", ".md") + + # Read JSON file + with open(input_file, "r", encoding="utf-8") as f: + data = json.load(f) + + # Convert to Markdown + markdown = convert_json_to_markdown(data) + + # Write to output file + with open(output_file, "w", encoding="utf-8") as f: + f.write(markdown) + + print(f"Conversion complete! 
Output written to: {output_file}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/utils/scripts/__init__.py b/utils/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/scripts/compare_bfcl.py b/utils/scripts/compare_bfcl.py new file mode 100644 index 0000000..ee5f3cc --- /dev/null +++ b/utils/scripts/compare_bfcl.py @@ -0,0 +1,179 @@ +"""Compare BFCL run outputs by re-running the evaluator on complete logs.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Literal, NamedTuple +from tests.benchmarks.bfcl import evaluator +from tests.utils.fastagent_helpers import MessageSerializer +import traceback + +Status = Literal["PASS", "FAIL"] + + +class RunResult(NamedTuple): + test_id: str + status: Status + details: dict[str, object] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Compare BFCL run logs.") + parser.add_argument( + "--baseline", + type=Path, + default=Path("outputs/baseline_multi_turn_base/raw"), + help="Directory containing baseline *_complete.json files.", + ) + parser.add_argument( + "--new", + type=Path, + default=Path("outputs/new_multi_turn_base/raw"), + help="Directory containing new *_complete.json files.", + ) + return parser.parse_args() + + +def evaluate_complete(test_id: str, complete_path: Path) -> RunResult | None: + """Run BFCL evaluation on a complete.json file.""" + if not complete_path.exists(): + return None + + try: + with complete_path.open("r", encoding="utf-8") as f: + complete_data = json.load(f) + + tool_calls = MessageSerializer.extract_tool_calls_by_turn(complete_data) + executable = MessageSerializer.format_to_executable(tool_calls) + + # Run evaluation the same way the pytest harness does. If evaluator raises, + # capture the exception and treat the test as a FAIL so totals match pytest. 
+ try: + evaluation = evaluator._run_evaluation(test_id, tool_calls, executable) + status: Status = "PASS" if evaluation.get("validation", {}).get("valid") else "FAIL" + return RunResult(test_id, status, evaluation) + except Exception as eval_exc: + # Return a failing RunResult with diagnostic details instead of None + tb = traceback.format_exc() + details = {"error": str(eval_exc), "traceback": tb} + return RunResult(test_id, "FAIL", details) + except Exception as exc: # pragma: no cover - defensive logging + print(f"[WARN] Failed to evaluate {complete_path}: {exc}") + # Provide more context for debugging + try: + print("--- Debug info ---") + print(f"test_id={test_id}") + if 'complete_data' in locals(): + msgs = complete_data.get('messages') if isinstance(complete_data, dict) else None + print(f"message_count={len(msgs) if msgs is not None else 'N/A'}") + # show first assistant message tool_calls sample + if msgs: + for m in msgs[:10]: + if m.get('tool_calls'): + print('sample_tool_calls=', list(m.get('tool_calls').items())[:1]) + break + except Exception: + pass + traceback.print_exc() + # If we couldn't even parse the file, mark as FAIL with diagnostics + tb = traceback.format_exc() + details = {"error": str(exc), "traceback": tb} + return RunResult(test_id, "FAIL", details) + + +def collect_results(root: Path) -> dict[str, RunResult]: + if not root.exists(): + raise FileNotFoundError(f"Directory not found: {root}") + if not root.is_dir(): + raise NotADirectoryError(f"Path is not a directory: {root}") + + results: dict[str, RunResult] = {} + for complete_path in sorted(root.glob("*_complete.json")): + test_id = complete_path.stem.replace("_complete", "") + evaluated = evaluate_complete(test_id, complete_path) + if evaluated: + results[test_id] = evaluated + return results + + +def main() -> None: + args = parse_args() + + baseline = collect_results(args.baseline) + new = collect_results(args.new) + + all_test_ids = sorted(set(baseline) | set(new)) + + improvements: list[str] = [] + regressions: list[str] = [] + unchanged: list[str] = [] + missing_in_new: list[str] = [] + missing_in_baseline: list[str] = [] + + for test_id in all_test_ids: + baseline_result = baseline.get(test_id) + new_result = new.get(test_id) + + if baseline_result is None and new_result is None: + continue + if baseline_result is None: + missing_in_baseline.append(test_id) + continue + if new_result is None: + missing_in_new.append(test_id) + continue + + if baseline_result.status == "FAIL" and new_result.status == "PASS": + improvements.append(test_id) + elif baseline_result.status == "PASS" and new_result.status == "FAIL": + regressions.append(test_id) + elif baseline_result.status == new_result.status: + unchanged.append(test_id) + + print("\n===== BFCL Log Comparison =====\n") + print(f"Baseline dir: {args.baseline}") + print(f"New dir: {args.new}\n") + + print(f"Total baseline logs: {len(baseline)}") + print(f"Total new logs: {len(new)}") + # Print PASS/FAIL totals for each run to aid comparison with pytest output + baseline_pass = sum(1 for r in baseline.values() if r.status == "PASS") + baseline_fail = sum(1 for r in baseline.values() if r.status == "FAIL") + new_pass = sum(1 for r in new.values() if r.status == "PASS") + new_fail = sum(1 for r in new.values() if r.status == "FAIL") + print(f"Baseline PASS/FAIL: {baseline_pass} passed, {baseline_fail} failed") + print(f"New PASS/FAIL: {new_pass} passed, {new_fail} failed") + print(f"Shared evaluations: {len(all_test_ids) - len(missing_in_baseline) - 
len(missing_in_new)}") + print(f"Improvements (FAIL → PASS): {len(improvements)}") + print(f"Regressions (PASS → FAIL): {len(regressions)}") + print(f"Unchanged (same result): {len(unchanged)}") + print(f"Missing in new run: {len(missing_in_new)}") + print(f"Missing in baseline run: {len(missing_in_baseline)}\n") + + if improvements: + print("=== Improvements ===") + for test_id in improvements: + print(f" - {test_id}") + + if regressions: + print("\n=== Regressions ===") + for test_id in regressions: + print(f" - {test_id}") + + if missing_in_new: + print("\n=== Missing in New Run ===") + for test_id in missing_in_new: + print(f" - {test_id}") + + if missing_in_baseline: + print("\n=== Missing in Baseline Run ===") + for test_id in missing_in_baseline: + print(f" - {test_id}") + + print("\nDone.\n") + + +if __name__ == "__main__": + main() diff --git a/utils/tree.txt b/utils/tree.txt new file mode 100644 index 0000000..6745e6c --- /dev/null +++ b/utils/tree.txt @@ -0,0 +1,68 @@ +outputs +├── baseline_multi_turn_base +│ ├── multi_turn_base_0_test.json +│ ├── multi_turn_base_100_test.json + .. +│ └── raw +│ ├── multi_turn_base_0_complete.json +│ ├── multi_turn_base_0_structured.jsonl +│ ├── multi_turn_base_100_complete.json +│ ├── multi_turn_base_100_structured.jsonl +│ .. +├── bfcl_new_results.txt +├── gepa +│ ├── current_instruction.txt +│ ├── gepa_logs +│ │ ├── generated_best_outputs_valset +│ │ │ └── task_0 +│ │ │ └── iter_0_prog_0.json +│ │ └── gepa_state.bin +│ ├── gepa_output.txt +│ ├── optimization_metadata.json +│ ├── optimized_instructions.txt +│ └── runs +│ ├── multi_turn_base_0 +│ │ ├── multi_turn_base_0_test.json +│ │ └── raw +│ │ ├── multi_turn_base_0_complete.json +│ │ └── multi_turn_base_0_structured.jsonl +│ ├── multi_turn_base_1 +│ │ ├── multi_turn_base_1_test.json +│ │ └── raw +│ │ ├── multi_turn_base_1_complete.json +│ │ └── multi_turn_base_1_structured.jsonl +│ ├── multi_turn_base_2 +│ │ ├── multi_turn_base_2_test.json +│ │ └── raw +│ │ └── multi_turn_base_2_structured.jsonl +│ ├── multi_turn_base_3 +│ │ ├── multi_turn_base_3_test.json +│ │ └── raw +│ │ └── multi_turn_base_3_structured.jsonl +│ ├── multi_turn_base_4 +│ │ ├── multi_turn_base_4_test.json +│ │ └── raw +│ │ └── multi_turn_base_4_structured.jsonl +│ ├── multi_turn_base_5 +│ │ ├── multi_turn_base_5_test.json +│ │ └── raw +│ │ └── multi_turn_base_5_structured.jsonl +│ ├── multi_turn_base_6 +│ │ ├── multi_turn_base_6_test.json +│ │ └── raw +│ │ └── multi_turn_base_6_structured.jsonl +│ ├── multi_turn_base_7 +│ │ ├── multi_turn_base_7_test.json +│ │ └── raw +│ │ └── multi_turn_base_7_structured.jsonl +│ ├── multi_turn_base_8 +│ │ ├── multi_turn_base_8_test.json +│ │ └── raw +│ │ └── multi_turn_base_8_structured.jsonl +│ └── multi_turn_base_9 +│ ├── multi_turn_base_9_test.json +│ └── raw +│ └── multi_turn_base_9_structured.jsonl +└── tree.txt + +30 directories, 745 files \ No newline at end of file From 54fdf40b4009eea89c3afede5fc1c8025660567e Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 16 Dec 2025 17:33:45 -0800 Subject: [PATCH 04/23] add filler files to utils/ --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index e04949b..4f0ac22 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,5 @@ site/ # Appworld data data/ + +utils/ \ No newline at end of file From 77744c86e1eaae1d0f885f377ef84e3cff1a3d2d Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Wed, 17 Dec 2025 16:43:17 -0800 Subject: [PATCH 05/23] minimal version of GEPA ran --- .../{optimize_gepa.py => 
gepa_bfcl.py} | 4 +- experiments/gepa_minimal.py | 211 ++++++++++++++++++ 2 files changed, 213 insertions(+), 2 deletions(-) rename experiments/{optimize_gepa.py => gepa_bfcl.py} (98%) create mode 100644 experiments/gepa_minimal.py diff --git a/experiments/optimize_gepa.py b/experiments/gepa_bfcl.py similarity index 98% rename from experiments/optimize_gepa.py rename to experiments/gepa_bfcl.py index 3d0383f..02d7147 100644 --- a/experiments/optimize_gepa.py +++ b/experiments/gepa_bfcl.py @@ -5,7 +5,7 @@ """Simple GEPA-based instruction optimization for BFCL tests. Usage: - python experiments/optimize_gepa.py --test-subset multi_turn_base --num-tests + python experiments/gepa_bfcl.py --test-subset multi_turn_base --num-tests """ import argparse @@ -203,7 +203,7 @@ def main(): parser.add_argument("--model", default="gpt-5") parser.add_argument("--reflection-model", default="gpt-5") parser.add_argument("--max-evaluations", type=int, default=20) - parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa")) + parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa_on_bfcl")) parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) parser.add_argument("--instruction-file", type=Path, required=True) parser.add_argument("--pytest-binary", default="pytest") diff --git a/experiments/gepa_minimal.py b/experiments/gepa_minimal.py new file mode 100644 index 0000000..5908b01 --- /dev/null +++ b/experiments/gepa_minimal.py @@ -0,0 +1,211 @@ +""" +Minimal GEPA use case +""" + +import json +from pathlib import Path +import sys +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import dspy +from dspy.teleprompt import GEPA +from dspy.evaluate import Evaluate + + + +# 1. Define a tiny task + +class QAExample(dspy.Example): + """Simple question–answer example.""" + def __init__(self, question: str | None = None, answer: str | None = None, *, base: dspy.Example | None = None,**kwargs,): + if base is not None: + super().__init__(base=base, **kwargs) + else: + super().__init__(question=question, answer=answer, **kwargs) + + def __repr__(self): + return f"Q: {self.question} | A: {self.answer}" + + +examples = [ + QAExample( + "What is 2 + 2? If the result is greater than 3, subtract 2.", + "2" + ).with_inputs("question"), + QAExample( + "What is the capital of France? Return the number of letters in the answer.", + "5" + ).with_inputs("question"), + QAExample( + "What color is the sky? Assume no atmosphere.", + "black" + ).with_inputs("question"), + QAExample( + "What is 10 minus 3? If the result is odd, subtract 1.", + "6" + ).with_inputs("question"), + QAExample( + "What is the largest planet in our solar system? Answer in one word only. Explain your reasoning.", + "jupiter" + ).with_inputs("question"), + QAExample( + "Who wrote 'To Kill a Mockingbird'? Return only the last name.", + "lee" + ).with_inputs("question"), + QAExample( + "What is the boiling point of water in Celsius? If conditions differ from standard, return 'unknown'.", + "unknown" + ).with_inputs("question"), + QAExample( + "What is the square root of 16? Return the result minus 1.", + "3" + ).with_inputs("question"), + QAExample( + "What is the chemical symbol for gold? Return the symbol reversed.", + "ua" + ).with_inputs("question"), + QAExample( + "What is the dot product of [1,2] and [3,4]? If the result is greater than 10, subtract 1.", + "10" + ).with_inputs("question"), + QAExample( + "Where is the Taj Mahal located? 
Return only the country name.", + "india" + ).with_inputs("question"), + QAExample( + "What is the powerhouse of a cell? Answer the organelle name in reverse order", + "airdnohcotim" + ).with_inputs("question"), + QAExample( + "What is the RGB value of the color red? Return only the blue component.", + "0" + ).with_inputs("question"), +] + + + +# 2. Define a DSPy module + +class SimpleQAModel(dspy.Module): + def __init__(self, instructions: str): + super().__init__() + self.predict = dspy.Predict( + dspy.Signature("question -> answer", instructions=instructions) + ) + + def forward(self, question: str): + return self.predict(question=question) + + # Required for GEPA instruction optimization + def get_instruction_text(self) -> str: + return self.predict.signature.instructions or "" + + + +# 3. Metric + +def exact_match_metric( + gold, + pred, + trace=None, + pred_name=None, + pred_trace=None, +): + score = ( + 1.0 + if gold.answer.strip().lower() == pred.answer.strip().lower() + else 0.0 + ) + return score + + + + +# 4. Main + +def main(): + output_dir = Path("outputs/gepa_minimal") + output_dir.mkdir(parents=True, exist_ok=True) + + lm = dspy.LM("openai/gpt-5") + dspy.configure(lm=lm) + + # Initial weaker instruction + seed_instruction = "Answer given question." + + model = SimpleQAModel(seed_instruction) + + # Baseline evaluation + evaluator = Evaluate( + devset=examples, + metric=exact_match_metric, + display_progress=True, + num_threads=1, + ) + + print("\n=== BASELINE ===") + baseline = evaluator(model) + (output_dir / "baseline.txt").write_text(f"Baseline score: {baseline.score}") + + # 5. Run GEPA + gepa = GEPA( + metric=exact_match_metric, + max_full_evals=20, + reflection_lm=lm, + track_stats=True, + seed=42, + ) + + train_size = int(0.7 * len(examples)) + trainset, devset = examples[:train_size], examples[train_size:] + + print("\n=== RUNNING GEPA ===") + optimized_model = gepa.compile( + model, + trainset=trainset, + valset=devset, + ) + + print("\n=== OPTIMIZED ===") + final_score = evaluator(optimized_model) + (output_dir / "optimized.txt").write_text(f"Optimized accuracy: {final_score.score}") + + # Correct way to access results (from real DSPy usage) + results = optimized_model.detailed_results + + # Save candidates with proper instruction extraction + print("\n=== CANDIDATES SAVED ===") + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() # This works! 
+ candidates.append({ + "candidate_id": i, + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + }) + (output_dir / "candidates.json").write_text(json.dumps(candidates, indent=2)) + + # Save instruction evolution + print("\n=== INSTRUCTIONS SAVED ===") + instructions_text = ( + f"Original:\n{seed_instruction}\n\n" + f"Optimized:\n{optimized_model.get_instruction_text()}" + ) + (output_dir / "instructions.txt").write_text(instructions_text) + + # Metadata + print("\n=== METADATA SAVED ===") + meta = { + "baseline_score": float(baseline.score), + "final_score": float(final_score), + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, + } + (output_dir / "metadata.json").write_text(json.dumps(meta, indent=2)) + + print(f"\nAll outputs saved to {output_dir}/") + + +if __name__ == "__main__": + main() \ No newline at end of file From 435a53810f54b7bc68b4266cfee372e0f3814cd7 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 23 Dec 2025 14:02:43 -0800 Subject: [PATCH 06/23] tried summarizing behavior --- experiments/gepa_bfcl.py | 73 ++++++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/experiments/gepa_bfcl.py b/experiments/gepa_bfcl.py index 02d7147..1ab946d 100644 --- a/experiments/gepa_bfcl.py +++ b/experiments/gepa_bfcl.py @@ -120,13 +120,17 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: passed = result.returncode == 0 tools_used = self._collect_tool_names(output_dir, test_id) + behavior_summary = self._summarize_behavior(output_dir, test_id) + return dspy.Prediction( test_id=test_id, passed=passed, tools_used=tools_used, - output=result.stdout + result.stderr, + behavior=behavior_summary, ) + + def get_instruction_text(self) -> str: instructions = getattr(self.prompt_predictor.signature, "instructions", "") if isinstance(instructions, (list, tuple)): @@ -144,6 +148,27 @@ def _collect_tool_names(output_dir: Path, test_id: str) -> list[str]: return [] calls = MessageSerializer.extract_tool_calls_by_turn(data) return [call.get("function") for turn in calls for call in turn if call.get("function")] + + @staticmethod + def _summarize_behavior(output_dir: Path, test_id: str) -> str: + complete_file = output_dir / "raw" / f"{test_id}_complete.json" + if not complete_file.exists(): + return "NO_TRACE" + + data = json.load(open(complete_file)) + + tool_calls = MessageSerializer.extract_tool_calls_by_turn(data) + tool_seq = [] + for turn in tool_calls: + for call in turn: + if call.get("function"): + tool_seq.append(call["function"]) + + return ( + f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\n" + f"NUM_TOOLS: {len(tool_seq)}" + ) + # ------------------------- @@ -157,23 +182,49 @@ def bfcl_metric_with_feedback( pred_name: Optional[str] = None, pred_trace: Optional[Any] = None, ) -> MetricFeedback: + # Score stays EXACTLY the same score = 1.0 if pred.passed else 0.0 - feedback = [f"Test {gold.test_id} {'PASSED' if pred.passed else 'FAILED'}"] - if not pred.passed: - expected = set(gold.expected_tools) - used = set(pred.tools_used) - if expected and not used: - feedback.append(f"No tools called; expected: {', '.join(expected)}") - else: + feedback_parts = [] + + # High-level outcome + feedback_parts.append( + f"RESULT: {'PASS' if pred.passed else 'FAIL'}" + ) + + # Expected vs used tools (what you already had) + expected = set(gold.expected_tools) + used = set(pred.tools_used) + + if expected: + feedback_parts.append( + 
f"EXPECTED_TOOLS: {', '.join(sorted(expected))}" + ) + feedback_parts.append( + f"USED_TOOLS: {', '.join(sorted(used)) if used else 'NONE'}" + ) + + if not pred.passed: missing = expected - used extra = used - expected if missing: - feedback.append(f"Missing tools: {', '.join(missing)}") + feedback_parts.append( + f"MISSING_TOOLS: {', '.join(sorted(missing))}" + ) if extra: - feedback.append(f"Unexpected tools: {', '.join(extra)}") + feedback_parts.append( + f"EXTRA_TOOLS: {', '.join(sorted(extra))}" + ) + + if hasattr(pred, "behavior"): + feedback_parts.append("BEHAVIOR_SUMMARY:") + feedback_parts.append(pred.behavior) + + return MetricFeedback( + score=score, + feedback="\n".join(feedback_parts), + ) - return MetricFeedback(score=score, feedback=" | ".join(feedback)) # ------------------------- From bb4dd4be2cb74aaccfe859a99a0e0cf894adf425 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Wed, 24 Dec 2025 14:30:20 -0800 Subject: [PATCH 07/23] GEPA successfully works on BFCL Agent runs --- experiments/gepa_bfcl.py | 388 +++++++++++++++++++++++++++++---------- 1 file changed, 286 insertions(+), 102 deletions(-) diff --git a/experiments/gepa_bfcl.py b/experiments/gepa_bfcl.py index 1ab946d..e75897e 100644 --- a/experiments/gepa_bfcl.py +++ b/experiments/gepa_bfcl.py @@ -12,17 +12,20 @@ import json import subprocess import hashlib +import uuid from pathlib import Path -from typing import Any, Optional +from typing import Any, Optional, Tuple import dspy from dspy.evaluate import Evaluate from dspy.teleprompt import GEPA import sys + sys.path.insert(0, str(Path(__file__).parent.parent)) from tests.benchmarks.bfcl import loader as bfcl_loader +from tests.benchmarks.bfcl import evaluator as bfcl_evaluator from tests.utils.fastagent_helpers import MessageSerializer @@ -30,11 +33,13 @@ # Utilities # ------------------------- + def sha256_text(text: str) -> str: return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest() def _stringify_question(question: Any) -> str: + """Best-effort stringify for logging/trace only. 
BFCL is multi-turn; this just picks the first user content.""" if isinstance(question, list) and question: first = question[0] if isinstance(first, str): @@ -48,16 +53,96 @@ def _stringify_question(question: Any) -> str: return "" +def _fn_name(executable_call: str) -> str: + """Extract function name from BFCL executable string like `grep(file='x')`.""" + if not executable_call: + return "" + idx = executable_call.find("(") + return executable_call[:idx] if idx != -1 else executable_call + + +def _soft_turn_score(gt_turn: list[str], pred_turn: list[str]) -> float: + """ + Soft, cheap signal to help GEPA search: + - 1.0 if exact match (order+args string exactness) + - else, score based on overlap of function names (ignores args) with order-insensitive F1-ish heuristic + """ + if gt_turn == pred_turn: + return 1.0 + gt_fns = [_fn_name(x) for x in gt_turn] + pr_fns = [_fn_name(x) for x in pred_turn] + if not gt_fns and not pr_fns: + return 1.0 + if not gt_fns or not pr_fns: + return 0.0 + + gt_set = set(gt_fns) + pr_set = set(pr_fns) + inter = len(gt_set & pr_set) + prec = inter / max(len(pr_set), 1) + rec = inter / max(len(gt_set), 1) + if prec + rec == 0: + return 0.0 + return (2 * prec * rec) / (prec + rec) + + +def _soft_sequence_score(gt: list[list[str]], pred: list[list[str]]) -> float: + """Aggregate soft score across turns.""" + if not gt and not pred: + return 1.0 + n = max(len(gt), len(pred), 1) + total = 0.0 + for i in range(n): + gt_turn = gt[i] if i < len(gt) else [] + pr_turn = pred[i] if i < len(pred) else [] + total += _soft_turn_score(gt_turn, pr_turn) + return total / n + + +def _diff_summary(gt: list[list[str]], pred: list[list[str]], max_turns: int = 8, max_calls_per_turn: int = 8) -> str: + """Readable per-turn diff summary for GEPA feedback.""" + lines: list[str] = [] + n = min(max(len(gt), len(pred)), max_turns) + for i in range(n): + gt_turn = gt[i] if i < len(gt) else [] + pr_turn = pred[i] if i < len(pred) else [] + if gt_turn == pr_turn: + lines.append(f"TURN {i+1}: OK (exact match)") + continue + + lines.append(f"TURN {i+1}: MISMATCH") + lines.append(" EXPECTED:") + if gt_turn: + for s in gt_turn[:max_calls_per_turn]: + lines.append(f" - {s}") + if len(gt_turn) > max_calls_per_turn: + lines.append(f" ... (+{len(gt_turn) - max_calls_per_turn} more)") + else: + lines.append(" - (no calls expected)") + + lines.append(" GOT:") + if pr_turn: + for s in pr_turn[:max_calls_per_turn]: + lines.append(f" - {s}") + if len(pr_turn) > max_calls_per_turn: + lines.append(f" ... (+{len(pr_turn) - max_calls_per_turn} more)") + else: + lines.append(" - (no calls produced)") + if len(gt) != len(pred): + lines.append(f"TURN COUNT: expected {len(gt)} turns, got {len(pred)} turns") + return "\n".join(lines) + + # ------------------------- # DSPy wrappers # ------------------------- + class BFCLExample(dspy.Example): def __init__( self, test_id: str | None = None, question: str | None = None, - expected_tools: list[str] | None = None, *, base: dspy.Example | None = None, **kwargs: Any, @@ -65,7 +150,7 @@ def __init__( if base is not None: super().__init__(base=base, **kwargs) else: - super().__init__(test_id=test_id, question=question, expected_tools=expected_tools or [], **kwargs) + super().__init__(test_id=test_id, question=question, **kwargs) class MetricFeedback(dspy.Prediction): @@ -74,6 +159,11 @@ def __init__(self, score: float, feedback: str) -> None: class BFCLAgent(dspy.Module): + """ + DSPy module wrapper around pytest-driven BFCL evaluation. 
+ The only optimized artifact is the instruction string stored in a DSPy Signature. + """ + def __init__( self, instruction_text: str, @@ -90,14 +180,29 @@ def __init__( self.enable_scoring_mode = enable_scoring_mode self._instruction_path = self.base_dir / "current_instruction.txt" + # This predictor exists so GEPA can optimize the `instructions` field. instruction_signature = dspy.Signature("prompt_input -> prompt_output", instructions=instruction_text) self.prompt_predictor = dspy.Predict(instruction_signature) def forward(self, test_id: str, question: str) -> dspy.Prediction: + """ + Runs one BFCL test via pytest using the current instruction file. + Returns enough artifacts for metrics to generate BFCL-aligned feedback. + """ + # ---- Create a real DSPy trace anchor ---- + # We don't *use* the output; we just ensure the predictor is invoked so GEPA has a traced component. + try: + _ = self.prompt_predictor(prompt_input=question) + except Exception: + # If tracing fails due to LM issues, continue; pytest run is the true evaluator. + pass + instruction_text = self.get_instruction_text() self._instruction_path.write_text(instruction_text, encoding="utf-8") - output_dir = self.base_dir / "runs" / test_id + # Unique run dir avoids stale artifacts being reused across GEPA candidates. + run_id = uuid.uuid4().hex[:12] + output_dir = self.base_dir / "runs" / f"{test_id}__{run_id}" output_dir.mkdir(parents=True, exist_ok=True) cmd = [ @@ -118,19 +223,40 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: result = subprocess.run(cmd, capture_output=True, text=True) passed = result.returncode == 0 - tools_used = self._collect_tool_names(output_dir, test_id) - behavior_summary = self._summarize_behavior(output_dir, test_id) + complete_path = output_dir / "raw" / f"{test_id}_complete.json" + tool_calls_by_turn: list[list[dict[str, Any]]] = [] + executable_responses: list[list[str]] = [] + evaluation: dict[str, Any] | None = None + eval_error: str | None = None + + if complete_path.exists(): + try: + complete_data = json.loads(complete_path.read_text()) + tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) + executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) + evaluation = bfcl_evaluator._run_evaluation(test_id, tool_calls_by_turn, executable_responses) + except Exception as e: + eval_error = f"{type(e).__name__}: {e}" + else: + eval_error = "Complete JSON not found (agent may have crashed before serialization)." 
+ + tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] + behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) return dspy.Prediction( test_id=test_id, passed=passed, tools_used=tools_used, behavior=behavior_summary, + executable_responses=executable_responses, + evaluation=evaluation, + eval_error=eval_error, + pytest_stdout=result.stdout, + pytest_stderr=result.stderr, + run_dir=str(output_dir), ) - - def get_instruction_text(self) -> str: instructions = getattr(self.prompt_predictor.signature, "instructions", "") if isinstance(instructions, (list, tuple)): @@ -138,43 +264,21 @@ def get_instruction_text(self) -> str: return str(instructions or "") @staticmethod - def _collect_tool_names(output_dir: Path, test_id: str) -> list[str]: - complete_file = output_dir / "raw" / f"{test_id}_complete.json" - if not complete_file.exists(): - return [] - try: - data = json.loads(complete_file.read_text()) - except json.JSONDecodeError: - return [] - calls = MessageSerializer.extract_tool_calls_by_turn(data) - return [call.get("function") for turn in calls for call in turn if call.get("function")] - - @staticmethod - def _summarize_behavior(output_dir: Path, test_id: str) -> str: - complete_file = output_dir / "raw" / f"{test_id}_complete.json" - if not complete_file.exists(): - return "NO_TRACE" - - data = json.load(open(complete_file)) - - tool_calls = MessageSerializer.extract_tool_calls_by_turn(data) - tool_seq = [] - for turn in tool_calls: + def _summarize_behavior_from_calls(tool_calls_by_turn: list[list[dict[str, Any]]]) -> str: + tool_seq: list[str] = [] + for turn in tool_calls_by_turn: for call in turn: - if call.get("function"): - tool_seq.append(call["function"]) - - return ( - f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\n" - f"NUM_TOOLS: {len(tool_seq)}" - ) - + fn = call.get("function") + if fn: + tool_seq.append(fn) + return f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\nNUM_TOOLS: {len(tool_seq)}" # ------------------------- # Metric # ------------------------- + def bfcl_metric_with_feedback( gold: dspy.Example, pred: dspy.Prediction, @@ -182,63 +286,114 @@ def bfcl_metric_with_feedback( pred_name: Optional[str] = None, pred_trace: Optional[Any] = None, ) -> MetricFeedback: - # Score stays EXACTLY the same - score = 1.0 if pred.passed else 0.0 - - feedback_parts = [] - - # High-level outcome - feedback_parts.append( - f"RESULT: {'PASS' if pred.passed else 'FAIL'}" - ) - - # Expected vs used tools (what you already had) - expected = set(gold.expected_tools) - used = set(pred.tools_used) - - if expected: - feedback_parts.append( - f"EXPECTED_TOOLS: {', '.join(sorted(expected))}" - ) - feedback_parts.append( - f"USED_TOOLS: {', '.join(sorted(used)) if used else 'NONE'}" - ) - - if not pred.passed: - missing = expected - used - extra = used - expected - if missing: - feedback_parts.append( - f"MISSING_TOOLS: {', '.join(sorted(missing))}" - ) - if extra: - feedback_parts.append( - f"EXTRA_TOOLS: {', '.join(sorted(extra))}" - ) - + """ + GEPA metric aligned to BFCL: + - score is primarily BFCL validity (pass/fail), but we add a soft score component to provide gradient. + - feedback includes BFCL evaluator diagnostics + per-turn executable diffs + constraint hints. 
+ """ + test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None) + feedback_parts: list[str] = [] + + # Load BFCL truth + constraints for feedback + gt: list[list[str]] = [] + excluded: list[str] = [] + involved_classes: list[str] = [] + try: + if test_id: + gt = bfcl_loader.load_ground_truth(test_id) + entry = bfcl_loader.load_test_entry(test_id) + excluded = entry.get("excluded_function", []) or [] + involved_classes = entry.get("involved_classes", []) or [] + except Exception as e: + feedback_parts.append(f"WARNING: could not load BFCL ground truth/entry: {type(e).__name__}: {e}") + + pred_exec: list[list[str]] = getattr(pred, "executable_responses", []) or [] + evaluation: dict[str, Any] | None = getattr(pred, "evaluation", None) + eval_error: str | None = getattr(pred, "eval_error", None) + + # Primary validity + valid = False + if evaluation and isinstance(evaluation, dict): + try: + valid = bool(evaluation.get("validation", {}).get("valid", False)) + except Exception: + valid = False + + # Soft score for gradient (helps GEPA search) + soft = _soft_sequence_score(gt, pred_exec) if gt else (1.0 if valid else 0.0) + + # Final score: keep pass/fail dominant, but allow soft improvements to be visible + # This prevents GEPA from being totally flat when nothing flips to PASS yet. + score = (1.0 if valid else 0.0) * 0.9 + soft * 0.1 + + feedback_parts.append(f"RESULT: {'PASS' if valid else 'FAIL'}") + feedback_parts.append(f"SCORE_BREAKDOWN: hard={'1.0' if valid else '0.0'} soft={soft:.3f} final={score:.3f}") + + if involved_classes: + feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") + if excluded: + feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") + + # If we have evaluator info, surface the most relevant parts + if evaluation and isinstance(evaluation, dict): + validation = evaluation.get("validation", {}) + irrelevance = evaluation.get("irrelevance_check", {}) + feedback_parts.append("EVALUATOR_VALIDATION:") + # Keep it compact; GEPA reflection needs signal, not a huge JSON blob. 
+ if isinstance(validation, dict): + # Include key flags + common fields if present + for k in ["valid", "reason", "error_type", "error_message"]: + if k in validation: + feedback_parts.append(f" {k}: {validation.get(k)}") + else: + feedback_parts.append(f" validation: {validation}") + + if isinstance(irrelevance, dict) and irrelevance: + feedback_parts.append("EVALUATOR_IRRELEVANCE_CHECK:") + for k in ["is_irrelevant", "reason"]: + if k in irrelevance: + feedback_parts.append(f" {k}: {irrelevance.get(k)}") + + if eval_error: + feedback_parts.append(f"EVAL_ERROR: {eval_error}") + + # Per-turn executable diff is the strongest actionable feedback + if gt: + feedback_parts.append("EXECUTABLE_DIFF:") + feedback_parts.append(_diff_summary(gt, pred_exec)) + + # Constraint violation hint: excluded function used + if excluded and pred_exec: + used_fns = {_fn_name(s) for turn in pred_exec for s in turn} + bad = sorted(set(excluded) & used_fns) + if bad: + feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") + + # Light behavior summary if hasattr(pred, "behavior"): feedback_parts.append("BEHAVIOR_SUMMARY:") - feedback_parts.append(pred.behavior) + feedback_parts.append(str(pred.behavior)) - return MetricFeedback( - score=score, - feedback="\n".join(feedback_parts), - ) + # Where artifacts live (useful for debugging candidate runs) + run_dir = getattr(pred, "run_dir", None) + if run_dir: + feedback_parts.append(f"RUN_DIR: {run_dir}") + return MetricFeedback(score=score, feedback="\n".join(feedback_parts)) # ------------------------- # Data loading # ------------------------- + def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) examples: list[BFCLExample] = [] for test_id in test_ids[:limit]: entry = bfcl_loader.load_test_entry(test_id) question = _stringify_question(entry.get("question", "")) - expected_tools = entry.get("involved_classes", []) or [] - ex = BFCLExample(test_id=test_id, question=question, expected_tools=expected_tools) + ex = BFCLExample(test_id=test_id, question=question) examples.append(ex.with_inputs("test_id", "question")) return examples @@ -247,6 +402,7 @@ def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: # Main # ------------------------- + def main(): parser = argparse.ArgumentParser() parser.add_argument("--test-subset", default="multi_turn_base") @@ -278,17 +434,43 @@ def main(): enable_scoring_mode=args.gepa_scoring_mode, ) - # Baseline - passed = sum(agent(test_id=e.test_id, question=e.question).passed for e in examples) - baseline_score = passed / len(examples) - (args.output_dir / "baseline.json").write_text(json.dumps({ - "instruction_hash": instruction_hash, - "pass_rate": baseline_score, - "passed": passed, - "total": len(examples), - "test_ids": [e.test_id for e in examples], - "model": args.model, - }, indent=2)) + # Baseline (use BFCL evaluator validity when available, not pytest returncode alone) + baseline_valid = 0 + baseline_total = len(examples) + baseline_details: list[dict[str, Any]] = [] + + for e in examples: + pred = agent(test_id=e.test_id, question=e.question) + valid = False + if getattr(pred, "evaluation", None): + valid = bool(pred.evaluation.get("validation", {}).get("valid", False)) + else: + valid = bool(getattr(pred, "passed", False)) + baseline_valid += 1 if valid else 0 + baseline_details.append( + { + "test_id": e.test_id, + "valid": valid, + "run_dir": getattr(pred, "run_dir", None), + "eval_error": 
getattr(pred, "eval_error", None), + } + ) + + baseline_score = baseline_valid / max(baseline_total, 1) + (args.output_dir / "baseline.json").write_text( + json.dumps( + { + "instruction_hash": instruction_hash, + "bfcl_valid_rate": baseline_score, + "valid": baseline_valid, + "total": baseline_total, + "test_ids": [e.test_id for e in examples], + "model": args.model, + "runs": baseline_details, + }, + indent=2, + ) + ) # GEPA reflection_lm = dspy.LM(args.reflection_model) @@ -301,12 +483,12 @@ def main(): log_dir=str(args.output_dir / "gepa_logs"), seed=42, ) - + if args.auto is not None: gepa_kwargs["auto"] = args.auto else: gepa_kwargs["max_full_evals"] = args.max_evaluations - + gepa = GEPA(**gepa_kwargs) optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) results = optimized_agent.detailed_results @@ -315,14 +497,16 @@ def main(): candidates = [] for i, cand in enumerate(results.candidates): instr = cand.get_instruction_text() - candidates.append({ - "candidate_id": i, - "instruction_hash": sha256_text(instr), - "instruction_text": instr, - "val_score": results.val_aggregate_scores[i], - "discovered_at_metric_call": results.discovery_eval_counts[i], - "parents": results.parents[i], - }) + candidates.append( + { + "candidate_id": i, + "instruction_hash": sha256_text(instr), + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + "discovered_at_metric_call": results.discovery_eval_counts[i], + "parents": results.parents[i], + } + ) (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2)) # Pareto (simple: max score per val instance) @@ -340,8 +524,8 @@ def main(): # Metadata meta = { - "baseline_score": baseline_score, - "final_score": max(results.val_aggregate_scores), + "baseline_bfcl_valid_rate": baseline_score, + "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, "total_metric_calls": results.total_metric_calls, "num_full_val_evals": results.num_full_val_evals, "seed": results.seed, From 396c173264479e71ed71466197ae878adf95f03e Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 27 Dec 2025 14:06:17 -0800 Subject: [PATCH 08/23] GEPA experiment outputs more logs --- experiments/gepa_bfcl.py | 633 +++++++++++++++++++++++++++------------ 1 file changed, 438 insertions(+), 195 deletions(-) diff --git a/experiments/gepa_bfcl.py b/experiments/gepa_bfcl.py index e75897e..3c1a356 100644 --- a/experiments/gepa_bfcl.py +++ b/experiments/gepa_bfcl.py @@ -2,10 +2,9 @@ # This script performs instruction-only optimization using GEPA over BFCL tests. # The BFCL agent is invoked via pytest. -"""Simple GEPA-based instruction optimization for BFCL tests. - -Usage: - python experiments/gepa_bfcl.py --test-subset multi_turn_base --num-tests +""" +GEPA-based instruction optimization for BFCL tests with first-class logging/artifacts. 
+Run via: `python experiments/gepa_bfcl.py --instruction-file path/to/instruction.txt [other options]` """ import argparse @@ -13,15 +12,19 @@ import subprocess import hashlib import uuid +import os +import platform +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone from pathlib import Path -from typing import Any, Optional, Tuple +from typing import Any, Optional import dspy -from dspy.evaluate import Evaluate from dspy.teleprompt import GEPA -import sys - +# Ensure repo root importable sys.path.insert(0, str(Path(__file__).parent.parent)) from tests.benchmarks.bfcl import loader as bfcl_loader @@ -30,22 +33,87 @@ # ------------------------- -# Utilities +# JSON / logging utilities # ------------------------- +def utc_now_iso() -> str: + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') + def sha256_text(text: str) -> str: return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest() +def safe_json(obj: Any) -> Any: + """Best-effort JSON-serializable conversion.""" + try: + json.dumps(obj) + return obj + except Exception: + if isinstance(obj, dict): + return {str(k): safe_json(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [safe_json(x) for x in obj] + if hasattr(obj, "__dict__"): + return safe_json(obj.__dict__) + return repr(obj) + + +def append_jsonl(path: Path, record: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("a", encoding="utf-8") as f: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + + +class TeeIO: + """Mirror writes to both the real stream and a file.""" + def __init__(self, real_stream, log_file): + self.real_stream = real_stream + self.log_file = log_file + + def write(self, s): + self.real_stream.write(s) + self.log_file.write(s) + + def flush(self): + self.real_stream.flush() + self.log_file.flush() + + def isatty(self): + return False + + +@dataclass +class RunContext: + run_id: str + output_dir: Path + metric_calls_path: Path + candidate_snapshots_path: Path + train_ids: set[str] + dev_ids: set[str] + score_definition: dict[str, Any] + + +RUN_CTX: RunContext | None = None + + +# ------------------------- +# BFCL formatting helpers +# ------------------------- + def _stringify_question(question: Any) -> str: - """Best-effort stringify for logging/trace only. BFCL is multi-turn; this just picks the first user content.""" + """Best-effort stringify for trace anchoring. BFCL is multi-turn; this picks the first user content.""" if isinstance(question, list) and question: first = question[0] if isinstance(first, str): return first if isinstance(first, dict): return str(first.get("content", "")) + if isinstance(first, list) and first: + # BFCL questions often look like [[{role, content}], [{...}], ...] 
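+            # e.g. (illustrative shape only, not a real BFCL record):
+            #   [[{"role": "user", "content": "turn 1"}],
+            #    [{"role": "user", "content": "turn 2"}]]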
+ msg0 = first[0] + if isinstance(msg0, dict): + return str(msg0.get("content", "")) if isinstance(question, dict): return str(question.get("content", "")) if isinstance(question, str): @@ -54,7 +122,6 @@ def _stringify_question(question: Any) -> str: def _fn_name(executable_call: str) -> str: - """Extract function name from BFCL executable string like `grep(file='x')`.""" if not executable_call: return "" idx = executable_call.find("(") @@ -62,11 +129,6 @@ def _fn_name(executable_call: str) -> str: def _soft_turn_score(gt_turn: list[str], pred_turn: list[str]) -> float: - """ - Soft, cheap signal to help GEPA search: - - 1.0 if exact match (order+args string exactness) - - else, score based on overlap of function names (ignores args) with order-insensitive F1-ish heuristic - """ if gt_turn == pred_turn: return 1.0 gt_fns = [_fn_name(x) for x in gt_turn] @@ -87,7 +149,6 @@ def _soft_turn_score(gt_turn: list[str], pred_turn: list[str]) -> float: def _soft_sequence_score(gt: list[list[str]], pred: list[list[str]]) -> float: - """Aggregate soft score across turns.""" if not gt and not pred: return 1.0 n = max(len(gt), len(pred), 1) @@ -100,7 +161,6 @@ def _soft_sequence_score(gt: list[list[str]], pred: list[list[str]]) -> float: def _diff_summary(gt: list[list[str]], pred: list[list[str]], max_turns: int = 8, max_calls_per_turn: int = 8) -> str: - """Readable per-turn diff summary for GEPA feedback.""" lines: list[str] = [] n = min(max(len(gt), len(pred)), max_turns) for i in range(n): @@ -137,16 +197,8 @@ def _diff_summary(gt: list[list[str]], pred: list[list[str]], max_turns: int = 8 # DSPy wrappers # ------------------------- - class BFCLExample(dspy.Example): - def __init__( - self, - test_id: str | None = None, - question: str | None = None, - *, - base: dspy.Example | None = None, - **kwargs: Any, - ): + def __init__(self, test_id: str | None = None, question: str | None = None, *, base: dspy.Example | None = None, **kwargs: Any): if base is not None: super().__init__(base=base, **kwargs) else: @@ -159,10 +211,7 @@ def __init__(self, score: float, feedback: str) -> None: class BFCLAgent(dspy.Module): - """ - DSPy module wrapper around pytest-driven BFCL evaluation. - The only optimized artifact is the instruction string stored in a DSPy Signature. - """ + """DSPy module wrapper around pytest-driven BFCL evaluation.""" def __init__( self, @@ -180,27 +229,46 @@ def __init__( self.enable_scoring_mode = enable_scoring_mode self._instruction_path = self.base_dir / "current_instruction.txt" - # This predictor exists so GEPA can optimize the `instructions` field. instruction_signature = dspy.Signature("prompt_input -> prompt_output", instructions=instruction_text) self.prompt_predictor = dspy.Predict(instruction_signature) + def get_instruction_text(self) -> str: + instructions = getattr(self.prompt_predictor.signature, "instructions", "") + if isinstance(instructions, (list, tuple)): + return "\n".join(str(p) for p in instructions if p) + return str(instructions or "") + + @staticmethod + def _summarize_behavior_from_calls(tool_calls_by_turn: list[list[dict[str, Any]]]) -> str: + tool_seq: list[str] = [] + for turn in tool_calls_by_turn: + for call in turn: + fn = call.get("function") + if fn: + tool_seq.append(fn) + return f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\nNUM_TOOLS: {len(tool_seq)}" + def forward(self, test_id: str, question: str) -> dspy.Prediction: - """ - Runs one BFCL test via pytest using the current instruction file. 
- Returns enough artifacts for metrics to generate BFCL-aligned feedback. - """ - # ---- Create a real DSPy trace anchor ---- - # We don't *use* the output; we just ensure the predictor is invoked so GEPA has a traced component. + # ----- timing breakdown ----- + t0 = time.perf_counter() + timing: dict[str, float] = {} + + # ---- Trace anchor: invoke predictor so GEPA has a component trace ---- try: + t_a = time.perf_counter() _ = self.prompt_predictor(prompt_input=question) + timing["dspy_trace_anchor_s"] = time.perf_counter() - t_a except Exception: - # If tracing fails due to LM issues, continue; pytest run is the true evaluator. - pass + timing["dspy_trace_anchor_s"] = 0.0 + # Write current instruction + t_w = time.perf_counter() instruction_text = self.get_instruction_text() + instruction_hash = sha256_text(instruction_text) self._instruction_path.write_text(instruction_text, encoding="utf-8") + timing["write_instruction_s"] = time.perf_counter() - t_w - # Unique run dir avoids stale artifacts being reused across GEPA candidates. + # Unique run dir prevents stale artifacts reuse run_id = uuid.uuid4().hex[:12] output_dir = self.base_dir / "runs" / f"{test_id}__{run_id}" output_dir.mkdir(parents=True, exist_ok=True) @@ -217,36 +285,50 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: "-q", "-x", ] - if self.enable_scoring_mode: cmd.append("--gepa-scoring-mode") + # Run pytest + t_p = time.perf_counter() result = subprocess.run(cmd, capture_output=True, text=True) - passed = result.returncode == 0 + timing["pytest_run_s"] = time.perf_counter() - t_p complete_path = output_dir / "raw" / f"{test_id}_complete.json" + tool_calls_by_turn: list[list[dict[str, Any]]] = [] executable_responses: list[list[str]] = [] evaluation: dict[str, Any] | None = None eval_error: str | None = None + # Parse + evaluate + t_e = time.perf_counter() if complete_path.exists(): try: complete_data = json.loads(complete_path.read_text()) tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) + + t_fmt = time.perf_counter() executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) + timing["format_to_executable_s"] = time.perf_counter() - t_fmt + + t_chk = time.perf_counter() evaluation = bfcl_evaluator._run_evaluation(test_id, tool_calls_by_turn, executable_responses) + timing["bfcl_checker_s"] = time.perf_counter() - t_chk except Exception as e: eval_error = f"{type(e).__name__}: {e}" else: eval_error = "Complete JSON not found (agent may have crashed before serialization)." 
+ timing["parse_and_eval_s"] = time.perf_counter() - t_e tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) + timing["total_forward_s"] = time.perf_counter() - t0 + return dspy.Prediction( test_id=test_id, - passed=passed, + instruction_hash=instruction_hash, + instruction_text=instruction_text, tools_used=tools_used, behavior=behavior_summary, executable_responses=executable_responses, @@ -255,30 +337,14 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: pytest_stdout=result.stdout, pytest_stderr=result.stderr, run_dir=str(output_dir), + timing=timing, ) - def get_instruction_text(self) -> str: - instructions = getattr(self.prompt_predictor.signature, "instructions", "") - if isinstance(instructions, (list, tuple)): - return "\n".join(str(p) for p in instructions if p) - return str(instructions or "") - - @staticmethod - def _summarize_behavior_from_calls(tool_calls_by_turn: list[list[dict[str, Any]]]) -> str: - tool_seq: list[str] = [] - for turn in tool_calls_by_turn: - for call in turn: - fn = call.get("function") - if fn: - tool_seq.append(fn) - return f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\nNUM_TOOLS: {len(tool_seq)}" - # ------------------------- -# Metric +# Metric (logs every call incrementally) # ------------------------- - def bfcl_metric_with_feedback( gold: dspy.Example, pred: dspy.Prediction, @@ -287,9 +353,10 @@ def bfcl_metric_with_feedback( pred_trace: Optional[Any] = None, ) -> MetricFeedback: """ - GEPA metric aligned to BFCL: - - score is primarily BFCL validity (pass/fail), but we add a soft score component to provide gradient. - - feedback includes BFCL evaluator diagnostics + per-turn executable diffs + constraint hints. + Score definition (explicitly persisted in run_manifest.json): + hard_valid ∈ {0,1} = BFCL checker validation.valid + soft ∈ [0,1] = turn-wise overlap score based on function-name overlap (F1-like) + final = 0.9*hard_valid + 0.1*soft """ test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None) feedback_parts: list[str] = [] @@ -311,37 +378,37 @@ def bfcl_metric_with_feedback( evaluation: dict[str, Any] | None = getattr(pred, "evaluation", None) eval_error: str | None = getattr(pred, "eval_error", None) - # Primary validity - valid = False + hard_valid = False if evaluation and isinstance(evaluation, dict): - try: - valid = bool(evaluation.get("validation", {}).get("valid", False)) - except Exception: - valid = False + hard_valid = bool(evaluation.get("validation", {}).get("valid", False)) - # Soft score for gradient (helps GEPA search) - soft = _soft_sequence_score(gt, pred_exec) if gt else (1.0 if valid else 0.0) + soft = _soft_sequence_score(gt, pred_exec) if gt else (1.0 if hard_valid else 0.0) + final_score = (1.0 if hard_valid else 0.0) * 0.9 + soft * 0.1 - # Final score: keep pass/fail dominant, but allow soft improvements to be visible - # This prevents GEPA from being totally flat when nothing flips to PASS yet. 
- score = (1.0 if valid else 0.0) * 0.9 + soft * 0.1 + split = None + if RUN_CTX and test_id: + if test_id in RUN_CTX.train_ids: + split = "train" + elif test_id in RUN_CTX.dev_ids: + split = "dev" + else: + split = "unknown" - feedback_parts.append(f"RESULT: {'PASS' if valid else 'FAIL'}") - feedback_parts.append(f"SCORE_BREAKDOWN: hard={'1.0' if valid else '0.0'} soft={soft:.3f} final={score:.3f}") + feedback_parts.append(f"RESULT: {'PASS' if hard_valid else 'FAIL'}") + feedback_parts.append(f"SCORE_BREAKDOWN: hard={'1.0' if hard_valid else '0.0'} soft={soft:.3f} final={final_score:.3f}") + if split: + feedback_parts.append(f"SPLIT: {split}") if involved_classes: feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") if excluded: feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") - # If we have evaluator info, surface the most relevant parts if evaluation and isinstance(evaluation, dict): validation = evaluation.get("validation", {}) irrelevance = evaluation.get("irrelevance_check", {}) feedback_parts.append("EVALUATOR_VALIDATION:") - # Keep it compact; GEPA reflection needs signal, not a huge JSON blob. if isinstance(validation, dict): - # Include key flags + common fields if present for k in ["valid", "reason", "error_type", "error_message"]: if k in validation: feedback_parts.append(f" {k}: {validation.get(k)}") @@ -357,36 +424,66 @@ def bfcl_metric_with_feedback( if eval_error: feedback_parts.append(f"EVAL_ERROR: {eval_error}") - # Per-turn executable diff is the strongest actionable feedback if gt: feedback_parts.append("EXECUTABLE_DIFF:") feedback_parts.append(_diff_summary(gt, pred_exec)) - # Constraint violation hint: excluded function used if excluded and pred_exec: used_fns = {_fn_name(s) for turn in pred_exec for s in turn} bad = sorted(set(excluded) & used_fns) if bad: feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") - # Light behavior summary if hasattr(pred, "behavior"): feedback_parts.append("BEHAVIOR_SUMMARY:") feedback_parts.append(str(pred.behavior)) - # Where artifacts live (useful for debugging candidate runs) run_dir = getattr(pred, "run_dir", None) if run_dir: feedback_parts.append(f"RUN_DIR: {run_dir}") - return MetricFeedback(score=score, feedback="\n".join(feedback_parts)) + # ---- First-class machine-readable metric call record ---- + if RUN_CTX and test_id: + record = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + "test_id": test_id, + "split": split, + "instruction_hash": getattr(pred, "instruction_hash", None), + "hard_valid": hard_valid, + "soft": soft, + "final": final_score, + "timing": getattr(pred, "timing", None), + "run_dir": run_dir, + "eval_error": eval_error, + "evaluator_validation": safe_json(evaluation.get("validation")) if isinstance(evaluation, dict) else None, + "evaluator_irrelevance": safe_json(evaluation.get("irrelevance_check")) if isinstance(evaluation, dict) else None, + } + append_jsonl(RUN_CTX.metric_calls_path, record) + + # Opportunistic candidate snapshot (what GEPA is “trying”) + snap = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + "instruction_hash": getattr(pred, "instruction_hash", None), + "instruction_text": getattr(pred, "instruction_text", None), + "latest_eval": { + "test_id": test_id, + "split": split, + "hard_valid": hard_valid, + "soft": soft, + "final": final_score, + }, + } + append_jsonl(RUN_CTX.candidate_snapshots_path, snap) + + return MetricFeedback(score=final_score, feedback="\n".join(feedback_parts)) 
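+
+# Worked example of the blended score above (minimal sketch; the call strings are
+# hypothetical, not drawn from a real BFCL case):
+#   gt   = [["ls()", "cat(file='a.txt')"]]
+#   pred = [["ls()", "rm(file='a.txt')"]]
+#   per-turn function-name overlap: precision = 1/2, recall = 1/2 -> soft = 0.5
+#   hard_valid = False            -> final = 0.9*0.0 + 0.1*0.5 = 0.05
+#   hard_valid = True, soft = 1.0 -> final = 0.9*1.0 + 0.1*1.0 = 1.00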
# ------------------------- # Data loading # ------------------------- - def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) examples: list[BFCLExample] = [] @@ -399,9 +496,34 @@ def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: # ------------------------- -# Main +# Run manifest + environment capture # ------------------------- +def try_git_info() -> dict[str, Any]: + info: dict[str, Any] = {} + try: + head = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=False) + info["git_commit"] = head.stdout.strip() if head.returncode == 0 else None + st = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True, check=False) + info["git_dirty"] = bool(st.stdout.strip()) + except Exception: + info["git_commit"] = None + info["git_dirty"] = None + return info + + +def build_score_definition() -> dict[str, Any]: + return { + "hard_valid": "BFCL evaluator validation.valid (boolean) from multi_turn_checker", + "soft": "turn-wise function-name overlap F1-like score (ignores args), averaged across turns", + "final": "0.9*hard_valid + 0.1*soft", + "note": "Optimization and candidate scores use `final`. Hard-valid-rate is also reported separately for clarity.", + } + + +# ------------------------- +# Main +# ------------------------- def main(): parser = argparse.ArgumentParser() @@ -419,118 +541,239 @@ def main(): args.output_dir.mkdir(parents=True, exist_ok=True) - examples = load_test_cases(args.test_subset, args.num_tests) - train_size = int(0.7 * len(examples)) - trainset, devset = examples[:train_size], examples[train_size:] - - instruction_text = args.instruction_file.read_text() - instruction_hash = sha256_text(instruction_text) - - agent = BFCLAgent( - instruction_text=instruction_text, - model=args.model, - base_dir=args.output_dir, - pytest_binary=args.pytest_binary, - enable_scoring_mode=args.gepa_scoring_mode, - ) - - # Baseline (use BFCL evaluator validity when available, not pytest returncode alone) - baseline_valid = 0 - baseline_total = len(examples) - baseline_details: list[dict[str, Any]] = [] - - for e in examples: - pred = agent(test_id=e.test_id, question=e.question) - valid = False - if getattr(pred, "evaluation", None): - valid = bool(pred.evaluation.get("validation", {}).get("valid", False)) - else: - valid = bool(getattr(pred, "passed", False)) - baseline_valid += 1 if valid else 0 - baseline_details.append( - { - "test_id": e.test_id, - "valid": valid, - "run_dir": getattr(pred, "run_dir", None), - "eval_error": getattr(pred, "eval_error", None), - } + # ---- Mirror stdout/stderr to console.log automatically ---- + console_log_path = args.output_dir / "console.log" + console_log_f = console_log_path.open("w", encoding="utf-8") + real_out, real_err = sys.stdout, sys.stderr + sys.stdout = TeeIO(real_out, console_log_f) + sys.stderr = TeeIO(real_err, console_log_f) + + overall_t0 = time.perf_counter() + timings: dict[str, float] = {} + + run_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" + metric_calls_path = args.output_dir / "metric_calls.jsonl" + candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" + + score_def = build_score_definition() + + try: + print(f"[{utc_now_iso()}] RUN_ID={run_id}") + print(f"[{utc_now_iso()}] output_dir={args.output_dir}") + + # Load dataset and split + t_load = time.perf_counter() + examples = load_test_cases(args.test_subset, args.num_tests) + 
train_size = int(0.7 * len(examples)) + trainset, devset = examples[:train_size], examples[train_size:] + timings["load_dataset_s"] = time.perf_counter() - t_load + + train_ids = {e.test_id for e in trainset} + dev_ids = {e.test_id for e in devset} + + (args.output_dir / "dataset_split.json").write_text( + json.dumps( + { + "run_id": run_id, + "test_subset": args.test_subset, + "num_tests": args.num_tests, + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + "train_size": len(train_ids), + "dev_size": len(dev_ids), + }, + indent=2, + ), + encoding="utf-8", + ) + + # Initialize global run context for metric logging + global RUN_CTX + RUN_CTX = RunContext( + run_id=run_id, + output_dir=args.output_dir, + metric_calls_path=metric_calls_path, + candidate_snapshots_path=candidate_snapshots_path, + train_ids=train_ids, + dev_ids=dev_ids, + score_definition=score_def, ) - baseline_score = baseline_valid / max(baseline_total, 1) - (args.output_dir / "baseline.json").write_text( - json.dumps( - { - "instruction_hash": instruction_hash, - "bfcl_valid_rate": baseline_score, - "valid": baseline_valid, - "total": baseline_total, - "test_ids": [e.test_id for e in examples], - "model": args.model, - "runs": baseline_details, + instruction_text = args.instruction_file.read_text(encoding="utf-8") + instruction_hash = sha256_text(instruction_text) + + # Manifest: config, hyperparams, environment, git, score definition, dataset split + manifest = { + "run_id": run_id, + "created_at": utc_now_iso(), + "argv": sys.argv, + "args": safe_json(vars(args)), + "instruction_file": str(args.instruction_file), + "instruction_hash": instruction_hash, + "score_definition": score_def, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), }, - indent=2, + "environment": { + "python": sys.version, + "platform": platform.platform(), + "cwd": os.getcwd(), + }, + **try_git_info(), + } + (args.output_dir / "run_manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8") + + agent = BFCLAgent( + instruction_text=instruction_text, + model=args.model, + base_dir=args.output_dir, + pytest_binary=args.pytest_binary, + enable_scoring_mode=args.gepa_scoring_mode, + ) + + # Baseline + t_base = time.perf_counter() + baseline_valid = 0 + baseline_total = len(examples) + baseline_details: list[dict[str, Any]] = [] + for e in examples: + pred = agent(test_id=e.test_id, question=e.question) + valid = False + if getattr(pred, "evaluation", None): + valid = bool(pred.evaluation.get("validation", {}).get("valid", False)) + baseline_valid += 1 if valid else 0 + baseline_details.append( + { + "test_id": e.test_id, + "valid": valid, + "instruction_hash": getattr(pred, "instruction_hash", None), + "run_dir": getattr(pred, "run_dir", None), + "timing": getattr(pred, "timing", None), + "eval_error": getattr(pred, "eval_error", None), + } + ) + timings["baseline_s"] = time.perf_counter() - t_base + + baseline_valid_rate = baseline_valid / max(baseline_total, 1) + (args.output_dir / "baseline.json").write_text( + json.dumps( + { + "run_id": run_id, + "instruction_hash": instruction_hash, + "bfcl_valid_rate": baseline_valid_rate, + "valid": baseline_valid, + "total": baseline_total, + "test_ids": [e.test_id for e in examples], + "model": args.model, + "score_definition": score_def, + "runs": baseline_details, + }, + indent=2, + ), + encoding="utf-8", ) - ) - - # GEPA - reflection_lm = dspy.LM(args.reflection_model) - dspy.configure(lm=reflection_lm) - - gepa_kwargs = dict( - 
metric=bfcl_metric_with_feedback, - reflection_lm=reflection_lm, - track_stats=True, - log_dir=str(args.output_dir / "gepa_logs"), - seed=42, - ) - - if args.auto is not None: - gepa_kwargs["auto"] = args.auto - else: - gepa_kwargs["max_full_evals"] = args.max_evaluations - - gepa = GEPA(**gepa_kwargs) - optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) - results = optimized_agent.detailed_results - - # Dump candidates - candidates = [] - for i, cand in enumerate(results.candidates): - instr = cand.get_instruction_text() - candidates.append( - { - "candidate_id": i, - "instruction_hash": sha256_text(instr), - "instruction_text": instr, - "val_score": results.val_aggregate_scores[i], - "discovered_at_metric_call": results.discovery_eval_counts[i], - "parents": results.parents[i], - } + print(f"[{utc_now_iso()}] Baseline BFCL valid rate: {baseline_valid_rate:.3f} ({baseline_valid}/{baseline_total})") + + # GEPA + t_gepa = time.perf_counter() + reflection_lm = dspy.LM(args.reflection_model) + dspy.configure(lm=reflection_lm) + + gepa_kwargs: dict[str, Any] = dict( + metric=bfcl_metric_with_feedback, + reflection_lm=reflection_lm, + track_stats=True, + log_dir=str(args.output_dir / "gepa_logs"), + seed=42, ) - (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2)) - - # Pareto (simple: max score per val instance) - best_ids = set().union(*results.per_val_instance_best_candidates) - with open(args.output_dir / "gepa_pareto.txt", "w", encoding="utf-8") as f: - f.write("GEPA Pareto Frontier\n====================\n\n") - for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): - f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") - f.write("-" * 40 + "\n") - f.write(results.candidates[i].get_instruction_text() + "\n\n") - - # Final instruction - final_instr = optimized_agent.get_instruction_text() - (args.output_dir / "optimized_instructions.txt").write_text(final_instr) - - # Metadata - meta = { - "baseline_bfcl_valid_rate": baseline_score, - "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, - "total_metric_calls": results.total_metric_calls, - "num_full_val_evals": results.num_full_val_evals, - "seed": results.seed, - } - (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2)) + if args.auto is not None: + gepa_kwargs["auto"] = args.auto + else: + gepa_kwargs["max_full_evals"] = args.max_evaluations + + # Persist GEPA config/hparams exactly + (args.output_dir / "gepa_config.json").write_text(json.dumps(safe_json(gepa_kwargs), indent=2), encoding="utf-8") + + gepa = GEPA(**gepa_kwargs) + optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) + results = optimized_agent.detailed_results + timings["gepa_compile_s"] = time.perf_counter() - t_gepa + + # Final candidates summary (still useful) + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() + candidates.append( + { + "candidate_id": i, + "instruction_hash": sha256_text(instr), + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + "discovered_at_metric_call": results.discovery_eval_counts[i], + "parents": results.parents[i], + } + ) + (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2), encoding="utf-8") + + # Pareto + best_ids = set().union(*results.per_val_instance_best_candidates) + with open(args.output_dir / "gepa_pareto.txt", "w", 
encoding="utf-8") as f: + f.write("GEPA Pareto Frontier\n====================\n\n") + for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): + f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") + f.write("-" * 40 + "\n") + f.write(results.candidates[i].get_instruction_text() + "\n\n") + + final_instr = optimized_agent.get_instruction_text() + (args.output_dir / "optimized_instructions.txt").write_text(final_instr, encoding="utf-8") + + # Scores file (explicit: which examples and how computed) + scores_payload = { + "run_id": run_id, + "score_definition": score_def, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + "baseline": { + "bfcl_valid_rate_over_all_examples": baseline_valid_rate, + "examples_used": [e.test_id for e in examples], + "valid_count": baseline_valid, + "total_count": baseline_total, + }, + "gepa": { + "objective": "final (0.9*hard_valid + 0.1*soft) aggregated over dev set by GEPA internals", + "val_aggregate_scores": safe_json(results.val_aggregate_scores), + "candidate_count": len(results.candidates), + }, + "note": "For per-evaluation, per-test, per-step details see metric_calls.jsonl (append-only).", + } + (args.output_dir / "scores.json").write_text(json.dumps(scores_payload, indent=2), encoding="utf-8") + + # Metadata + timings + timings["total_wall_s"] = time.perf_counter() - overall_t0 + (args.output_dir / "timings.json").write_text(json.dumps({"run_id": run_id, **timings}, indent=2), encoding="utf-8") + + meta = { + "run_id": run_id, + "baseline_bfcl_valid_rate": baseline_valid_rate, + "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, + } + (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") + + print(f"[{utc_now_iso()}] Done. See {args.output_dir}/run_manifest.json, scores.json, metric_calls.jsonl") + + finally: + # Restore streams and close file + sys.stdout.flush() + sys.stderr.flush() + sys.stdout = real_out + sys.stderr = real_err + console_log_f.close() if __name__ == "__main__": From 9a8876cd5261f513385dff9e8ba0dbf535194f1d Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 27 Dec 2025 15:21:22 -0800 Subject: [PATCH 09/23] Making GEPA-BFCL experiment more readable. 
Started with logging_utils --- experiments/gepa_bfcl/__init__.py | 0 experiments/gepa_bfcl/agent.py | 0 experiments/gepa_bfcl/data.py | 0 experiments/gepa_bfcl/logging_utils.py | 89 ++++++++++++++++++++++++++ experiments/gepa_bfcl/metrics.py | 0 experiments/gepa_bfcl/run.py | 0 experiments/gepa_bfcl/scoring_utils.py | 0 7 files changed, 89 insertions(+) create mode 100644 experiments/gepa_bfcl/__init__.py create mode 100644 experiments/gepa_bfcl/agent.py create mode 100644 experiments/gepa_bfcl/data.py create mode 100644 experiments/gepa_bfcl/logging_utils.py create mode 100644 experiments/gepa_bfcl/metrics.py create mode 100644 experiments/gepa_bfcl/run.py create mode 100644 experiments/gepa_bfcl/scoring_utils.py diff --git a/experiments/gepa_bfcl/__init__.py b/experiments/gepa_bfcl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/gepa_bfcl/data.py b/experiments/gepa_bfcl/data.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/gepa_bfcl/logging_utils.py b/experiments/gepa_bfcl/logging_utils.py new file mode 100644 index 0000000..3ee0ac5 --- /dev/null +++ b/experiments/gepa_bfcl/logging_utils.py @@ -0,0 +1,89 @@ +"""" +logging_utils.py + +Utility functions for logging and saving outputs +""" + +from __future__ import annotations +import json +import hashlib +import subprocess +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +def utc_now_iso() -> str: + """ + Returns current UTC time + """ + return ( + datetime.now(timezone.utc) + .replace(microsecond=0) + .isoformat() + .replace("+00:00", "Z") + ) + + +def sha256_text(text: str) -> str: + """ + Computes a SHA 256 hash of string + + Used to identify instruction prompts across runs instead of storing large strings everywhere + """ + hexdigest = hashlib.sha256(text.encode("utf-8")).hexdigest() + return f"sha256:{hexdigest}" + + +def safe_json(obj: Any) -> Any: + """ + Convert a given object into a JSON-serializable structure + """ + try: + json.dumps(obj) + return obj + + except Exception: + if isinstance(obj, dict): + return {str(k): safe_json(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [safe_json(x) for x in obj] + if hasattr(obj, "__dict__"): + return safe_json(obj.__dict__) + return repr(obj) + + +def append_jsonl(path: Path, record: dict[str, Any]) -> None: + """ + Append a record to a .jsonl file + + If the file at path doesn't exist, it will be created + """ + path.parent.mkdir(parents=True, exist_ok=True) + # Open the file + with path.open("a", encoding="utf-8") as f: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + + +class TeeIO: + """ + Similar to a file, this object processes writes to both a + stream (stdout, stderr) and a log file + """ + + def __init__(self, real_stream, log_file): + self.real_stream = real_stream + self.log_file = log_file + + def write(self, s: str) -> None: + self.real_stream.write(s) + self.log_file.write(s) + + def flush(self) -> None: + self.real_stream.flush() + self.log_file.flush() + + def isatty(self) -> bool: + return False + diff --git a/experiments/gepa_bfcl/metrics.py b/experiments/gepa_bfcl/metrics.py new file mode 100644 index 0000000..e69de29 diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py new file mode 100644 index 0000000..e69de29 diff --git 
a/experiments/gepa_bfcl/scoring_utils.py b/experiments/gepa_bfcl/scoring_utils.py new file mode 100644 index 0000000..e69de29 From bb5fabfac1ca6c44b4998876058bb5014b8bbae3 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 27 Dec 2025 19:22:16 -0800 Subject: [PATCH 10/23] Finished logging and scoring utils --- experiments/gepa_bfcl/logging_utils.py | 49 ++++++++- experiments/gepa_bfcl/scoring_utils.py | 132 +++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 2 deletions(-) diff --git a/experiments/gepa_bfcl/logging_utils.py b/experiments/gepa_bfcl/logging_utils.py index 3ee0ac5..ff785ac 100644 --- a/experiments/gepa_bfcl/logging_utils.py +++ b/experiments/gepa_bfcl/logging_utils.py @@ -1,7 +1,7 @@ """" logging_utils.py -Utility functions for logging and saving outputs +Utility functions and objects for logging and saving outputs """ from __future__ import annotations @@ -71,7 +71,6 @@ class TeeIO: Similar to a file, this object processes writes to both a stream (stdout, stderr) and a log file """ - def __init__(self, real_stream, log_file): self.real_stream = real_stream self.log_file = log_file @@ -86,4 +85,50 @@ def flush(self) -> None: def isatty(self) -> bool: return False + + +@dataclass +class RunContext: + """ + Stores metadata used by metric functions and loggers + + Meant to be read only after initialization + """ + run_id: str + output_dir: Path + metric_calls_path: Path + candidate_snapshots_path: Path + train_ids: set[str] + dev_ids: set[str] + score_definition: dict[str, Any] + +RUN_CTX: RunContext | None = None + + +def try_git_info() -> dict[str, Any]: + """ + Tries to retrieve git info, does not crash if not found + """ + info:dict[str, Any] = dict() + try: + head = subprocess.run( + args=["git", "rev-parse", "HEAD"], + capture_output=True, + text=True, + check=False + ) + info["git_commit"] = head.stdout.strip() if head.returncode == 0 else None + + status = subprocess.run( + args=["git", "status", "--porcelain"], + capture_output=True, + text=True, + check=False, + ) + info["git_dirty"] = bool(status.stdout.strip()) + + except Exception: + info["git_commit"] = None + info["git_dirty"] = None + return info \ No newline at end of file diff --git a/experiments/gepa_bfcl/scoring_utils.py b/experiments/gepa_bfcl/scoring_utils.py index e69de29..5f19917 100644 --- a/experiments/gepa_bfcl/scoring_utils.py +++ b/experiments/gepa_bfcl/scoring_utils.py @@ -0,0 +1,132 @@ +"""" +scoring_utils.py + +Utility functions used for evaluating a BFCL agent's tool use +""" + +from __future__ import annotations +from typing import List + + +def fn_name(executable_call: str) -> str: + """ + Extract the function name from a tool call string + + Ex: read(file='log.txt') -> 'read' + """ + if not executable_call: + return "" + + i = executable_call.index("(") + return executable_call[:i] if i != -1 else executable_call + + +def soft_turn_score(gt_turn: List[str], pred_turn: List[str]) -> float: + """ + Returns a score in [0, 1] for a single turn by comparing function + overlap between ground truth and agent prediction + """ + # Perfectly aligned + if gt_turn == pred_turn: + return 1.0 + + gt_fns = [fn_name(x) for x in gt_turn] + pr_fns = [fn_name(x) for x in pred_turn] + + # No functions expected AND no functions called + if not gt_fns and not pr_fns: + return 1.0 + + # Either: + # No functions were expected but agent still called some + # OR agent didn't call any functions when it was expected to + if not gt_fns or not pr_fns: + return 0.0 + + gt_set = set(gt_fns) + pr_set = 
set(pr_fns) + intersection = len(gt_set.intersection(pr_set)) + + # No tool intersection -> 0.0 + if intersection == 0: + return 0.0 + + # Of all the tools the agent called, how many were in G.T + precision = intersection / max(len(pr_set), 1) + # Of all the tools in GT, how many did the agent call + recall = intersection / max(len(gt_set), 1) + + # F1 Score = harmonic mean of precision and recall + # Higher F1 = high prec AND high rec + # Lower F1 = low prec and rec OR extreme difference btwn them + return (2 * precision * recall) / (precision + recall) + + +def soft_sequence_score(gt: List[List[str]], pred: List[List[str]]) -> float: + """ + Returns a score in [0, 1] for a given multi-turn sequence, which is the + arithmetic average of soft turn scores + """ + # No functions expected AND no functions called + if not gt and not pred: + return 1.0 + + n = max(len(gt), len(pred), 1) + total = 0.0 + + for i in range(n): + gt_turn = gt[i] if i < len(gt) else [] + pred_turn = pred[i] if i < len(pred) else [] + + # Add up each turn's F1 Score + total += soft_turn_score(gt_turn, pred_turn) + + # Return average + return total / n + + +def diff_summary(gt: List[List[str]], pred: List[List[str]], + *, max_turns: int = 8, max_calls_per_turn: int = 8 + ) -> str: + """ + Produce a readable string representation of the diff between + GT and predicted tool call sequences + + Intended for logging + """ + lines: List[str] = [] + n = min(max(len(gt), len(pred)), max_turns) + + for i in range(n): + gt_turn = gt[i] if i < len(gt) else [] + pr_turn = pred[i] if i < len(pred) else [] + + if gt_turn == pr_turn: + lines.append(f"TURN {i + 1}: OK (exact match)") + continue + + lines.append(f"TURN {i + 1}: MISMATCH") + lines.append(" EXPECTED:") + if gt_turn: + for s in gt_turn[:max_calls_per_turn]: + lines.append(f" - {s}") + if len(gt_turn) > max_calls_per_turn: + lines.append(f" ... (+{len(gt_turn) - max_calls_per_turn} more)") + else: + lines.append(" - (no calls expected)") + + lines.append(" GOT:") + if pr_turn: + for s in pr_turn[:max_calls_per_turn]: + lines.append(f" - {s}") + if len(pr_turn) > max_calls_per_turn: + lines.append(f" ... 
(+{len(pr_turn) - max_calls_per_turn} more)") + else: + lines.append(" - (no calls produced)") + + if len(gt) != len(pred): + lines.append( + f"TURN COUNT: expected {len(gt)} turns, got {len(pred)} turns" + ) + + return "\n".join(lines) \ No newline at end of file From 682b5a4a9b456b050dc87b7e4483f931a8a90abf Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 27 Dec 2025 20:20:52 -0800 Subject: [PATCH 11/23] working on BFCLAgent forward --- experiments/gepa_bfcl/agent.py | 167 +++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index e69de29..7c7c18b 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -0,0 +1,167 @@ +""" +DSPy module wrapper for running BFCL tests with pytest +""" + +from __future__ import annotations +import json +import subprocess +import time +import uuid +from pathlib import Path +from typing import Any, List +import dspy +from tests.benchmarks.bfcl import evaluator as bfcl_evaluator +from tests.utils.fastagent_helpers import MessageSerializer +from logging_utils import sha256_text + + +class BFCLExample(dspy.Example): + """ + DSPy Example wrapper for BFCL cases/examples + """ + + def __init__( + self, + test_id: str | None = None, + question: str | None = None, + *, + base: dspy.Example | None = None, + **kwargs: Any + ): + if base is None: + super().__init__(test_id=test_id, question=question, **kwargs) + else: + super().__init__(base=base, **kwargs) + + +class MetricFeedback(dspy.Prediction): + """ + Container for metric score + text feedback returned to GEPA + """ + + def __init__(self, score: float, feedback: str): + super().__init__(score=score, feedback=feedback) + + +class BFCLAgent(dspy.Module): + """ + DSPy module that evaluates a given instruction prompt by running + BFCL tests (with pytest) and parsing resulting outputs + """ + + def __init__( + self, + instruction_text: str, + model: str, + base_dir: Path, + pytest_binary: str, + enable_scoring_mode: bool + ): + super().__init__() + self.model = model + self.base_dir = base_dir + self.base_dir.mkdir(parents=True, exist_ok=True) + self.pytest_binary = pytest_binary + self.enable_scoring_mode = enable_scoring_mode + + # The file at this path is changed before each run + self._instruction_path = self.base_dir / "current_instruction.txt" + + # Define the model's task + signature = dspy.Signature( + "prompt_input -> prompt_output", + instructions=instruction_text + ) + + # dspy.Predict handles logic of constructing prompt + # and sending it to the LM + self.prompt_predictor = dspy.Predict(signature) + + + def forward(self, test_id: str, question: str) -> dspy.Prediction: + """ + Run a single BFCL test case using the current instruction prompt + """ + # Initialize timing + t0 = time.perf_counter() + timings: dict[str, float] = {} + + # EXPLAIN + try: + t_trace = time.perf_counter() + _ = self.prompt_predictor(prompt_input=question) + timings["dspy_trace_anchor_s"] = time.perf_counter() - t_trace + except Exception: + timings["dspy_trace_anchor_s"] = 0.0 + + # Write current instruction + instruction_text = self.get_instruction_text() + instruction_hash = sha256_text(instruction_text) + + t_write = time.perf_counter() + self._instruction_path.write_text(instruction_text, encoding="utf-8") + timings["write_instruction_s"] = time.perf_counter() - t_write + + # Create a unique directory for each individual run + run_uid = uuid.uuid4().hex[:12] + run_dir = self.base_dir / "runs" / 
f"{test_id}__{run_uid}" + run_dir.mkdir(parents=True, exist_ok=True) + + # Construct the pytest command + cmd = [ + self.pytest_binary, + f"tests/benchmarks/bfcl/test_bfcl.py::test_bfcl[{test_id}]", + "--model", + self.model, + "--instruction-file", + str(self._instruction_path), + "--output-dir", + str(run_dir), + "-q", + "-x" + ] + if self.enable_scoring_mode: + cmd.append("--gepa-scoring-mode") + + # Run the pytest command + t_pytest = time.perf_counter() + result = subprocess.run( + cmd, + capture_output=True, + text=True + ) + timings["pytest_run_s"] = time.perf_counter() - t_pytest + complete_path = run_dir / "raw" / f"{test_id}_complete.json" + + tool_calls_by_turn: List[List[dict[str, Any]]] = [] + executable_responses: List[List[str]] = [] + evaluation: dict[str, Any] | None = None + eval_error: str | None = None + + + def get_instruction_text(self) -> str: + """ + Return the current instruction text used by dspy + """ + instructions = getattr(self.prompt_predictor.signature, "instructions", "") + if isinstance(instructions, (list, tuple)): + return "\n".join(str(p) for p in instructions if p) + return str(instructions or "") + + @staticmethod + def _summarize_behavior_from_calls(tool_calls: List[List[dict[str, Any]]]) -> str: + """ + Summarize tool-use behavior for logging and feedback + """ + tool_seq: List[str] = [] + for turn in tool_calls: + for call in turn: + fn = call.get("function") + if fn: + tool_seq.append(fn) + + return ( + f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\n" + f"NUM_TOOLS: {len(tool_seq)}" + ) + \ No newline at end of file From dd5302701ffcb4b2025cfd806cfff60a01851712 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Mon, 29 Dec 2025 12:20:18 -0800 Subject: [PATCH 12/23] GEPA on BFCL package runs correctly --- .../{gepa_bfcl/data.py => __init__.py} | 0 experiments/gepa_bfcl/agent.py | 76 +++- experiments/gepa_bfcl/data_utils.py | 50 +++ experiments/gepa_bfcl/metrics.py | 193 ++++++++++ experiments/gepa_bfcl/run.py | 349 ++++++++++++++++++ 5 files changed, 650 insertions(+), 18 deletions(-) rename experiments/{gepa_bfcl/data.py => __init__.py} (100%) create mode 100644 experiments/gepa_bfcl/data_utils.py diff --git a/experiments/gepa_bfcl/data.py b/experiments/__init__.py similarity index 100% rename from experiments/gepa_bfcl/data.py rename to experiments/__init__.py diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index 7c7c18b..14c2f8e 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -1,4 +1,6 @@ """ +agent.py + DSPy module wrapper for running BFCL tests with pytest """ @@ -12,7 +14,7 @@ import dspy from tests.benchmarks.bfcl import evaluator as bfcl_evaluator from tests.utils.fastagent_helpers import MessageSerializer -from logging_utils import sha256_text +from .logging_utils import sha256_text class BFCLExample(dspy.Example): @@ -32,15 +34,6 @@ def __init__( super().__init__(test_id=test_id, question=question, **kwargs) else: super().__init__(base=base, **kwargs) - - -class MetricFeedback(dspy.Prediction): - """ - Container for metric score + text feedback returned to GEPA - """ - - def __init__(self, score: float, feedback: str): - super().__init__(score=score, feedback=feedback) class BFCLAgent(dspy.Module): @@ -77,22 +70,22 @@ def __init__( # and sending it to the LM self.prompt_predictor = dspy.Predict(signature) - + def forward(self, test_id: str, question: str) -> dspy.Prediction: """ Run a single BFCL test case using the current instruction prompt """ # Initialize timing t0 = 
time.perf_counter() - timings: dict[str, float] = {} + timing: dict[str, float] = {} - # EXPLAIN + # dspy trace anchor try: t_trace = time.perf_counter() _ = self.prompt_predictor(prompt_input=question) - timings["dspy_trace_anchor_s"] = time.perf_counter() - t_trace + timing["dspy_trace_anchor_s"] = time.perf_counter() - t_trace except Exception: - timings["dspy_trace_anchor_s"] = 0.0 + timing["dspy_trace_anchor_s"] = 0.0 # Write current instruction instruction_text = self.get_instruction_text() @@ -100,7 +93,7 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: t_write = time.perf_counter() self._instruction_path.write_text(instruction_text, encoding="utf-8") - timings["write_instruction_s"] = time.perf_counter() - t_write + timing["write_instruction_s"] = time.perf_counter() - t_write # Create a unique directory for each individual run run_uid = uuid.uuid4().hex[:12] @@ -130,14 +123,61 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: capture_output=True, text=True ) - timings["pytest_run_s"] = time.perf_counter() - t_pytest + timing["pytest_run_s"] = time.perf_counter() - t_pytest + + # Parse outputs and evaluate complete_path = run_dir / "raw" / f"{test_id}_complete.json" tool_calls_by_turn: List[List[dict[str, Any]]] = [] executable_responses: List[List[str]] = [] evaluation: dict[str, Any] | None = None eval_error: str | None = None - + + t_eval = time.perf_counter() + if complete_path.exists(): + try: + complete_data = json.loads(complete_path.read_text()) + tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) + + t_fmt = time.perf_counter() + executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) + timing["format_to_executable_s"] = time.perf_counter() - t_fmt + + t_chk = time.perf_counter() + evaluation = bfcl_evaluator._run_evaluation( + test_id, + tool_calls_by_turn, + executable_responses, + ) + timing["bfcl_checker_s"] = time.perf_counter() - t_chk + except Exception as e: + eval_error = f"{type(e).__name__}: {e}" + + else: + eval_error = "Complete JSON not found (agent may have crashed)" + + timing["parse_and_eval_s"] = time.perf_counter() - t_eval + + tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] + behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) + + timing["total_forward_s"] = time.perf_counter() - t0 + + # Final prediction for the current case + return dspy.Prediction( + test_id=test_id, + instruction_hash=instruction_hash, + instruction_text=instruction_text, + tools_used=tools_used, + behavior=behavior_summary, + executable_responses=executable_responses, + evaluation=evaluation, + eval_error=eval_error, + pytest_stdout=result.stdout, + pytest_stderr=result.stderr, + run_dir=str(run_dir), + timing=timing + ) def get_instruction_text(self) -> str: """ diff --git a/experiments/gepa_bfcl/data_utils.py b/experiments/gepa_bfcl/data_utils.py new file mode 100644 index 0000000..22f8242 --- /dev/null +++ b/experiments/gepa_bfcl/data_utils.py @@ -0,0 +1,50 @@ +""" +data.py + +Dataset loading utilities for GEPA on BFCL tests +""" + +from __future__ import annotations +from typing import List, Any +from tests.benchmarks.bfcl import loader as bfcl_loader +from .agent import BFCLExample + + +def stringify_question(question: Any) -> str: + + if isinstance(question, list) and question: + first = question[0] + + if isinstance(first, str): + return first + + if isinstance(first, dict): + return 
str(first.get("content", "")) + + if isinstance(first, list) and first: + msg0 = first[0] + if isinstance(msg0, dict): + return str(msg0.get("content", "")) + + if isinstance(question, dict): + return str(question.get("content", "")) + + if isinstance(question, str): + return question + + return "" + + +def load_test_cases(subset: str, limit: int,) -> List[BFCLExample]: + """ + Load BFCL test cases from a given subset and return as BFCLExample objects + """ + test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) + examples: List[BFCLExample] = [] + for test_id in test_ids[:limit]: + entry = bfcl_loader.load_test_entry(test_id) + question = stringify_question(entry.get("question", "")) + ex = BFCLExample(test_id=test_id, question=question) + examples.append(ex.with_inputs("test_id", "question")) + + return examples diff --git a/experiments/gepa_bfcl/metrics.py b/experiments/gepa_bfcl/metrics.py index e69de29..d4ea2fd 100644 --- a/experiments/gepa_bfcl/metrics.py +++ b/experiments/gepa_bfcl/metrics.py @@ -0,0 +1,193 @@ +""" +metrics.py + +Metric and feedback for GEPA optimization on BFCL +""" + +from __future__ import annotations +from typing import Any, Optional, List +import dspy +from tests.benchmarks.bfcl import loader as bfcl_loader +from .logging_utils import RUN_CTX, append_jsonl, safe_json, utc_now_iso +from .scoring_utils import fn_name, soft_sequence_score, diff_summary + + +class MetricFeedback(dspy.Prediction): + """ + Prediction returned to GEPA containing a scalar score and + human-readable feedback + """ + + def __init__(self, score: float, feedback: str): + super().__init__(score=score, feedback=feedback) + + +def build_score_definition() -> dict[str, Any]: + """ + Returns a description of how scores are computed + """ + return { + "hard_valid": "BFCL evaluator validation.valid (boolean) from multi_turn_checker", + "soft": "turn-wise function-name overlap F1-like score (ignores args), averaged across turns", + "final": "0.9*hard_valid + 0.1*soft", + "note": ( + "Optimization and candidate scores use `final`." + "Hard validity is the primary objective; " + "soft score provides shaping for optimization." + ) + } + +def bfcl_metric_with_feedback( + gold: dspy.Example, + pred: dspy.Prediction, + trace: Optional[Any] = None, + pred_name: Optional[str] = None, + pred_trace: Optional[Any] = None +) -> MetricFeedback: + """ + Computes the GEPA metric for a single BFCL evaluation. 
+ Returns MetricFeedback(score, feedback) + """ + # Extract test id and initialize feedback + test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None) + feedback_parts: List[str] = [] + + # Load BFCL truth + constraints for feedback + gt: list[list[str]] = [] + excluded: list[str] = [] + involved_classes: list[str] = [] + try: + if test_id: + gt = bfcl_loader.load_ground_truth(test_id) + entry = bfcl_loader.load_test_entry(test_id) + excluded = entry.get("excluded_function", []) or [] + involved_classes = entry.get("involved_classes", []) or [] + except Exception as e: + feedback_parts.append( + f"WARNING: could not load BFCL ground truth/entry: {type(e).__name__}: {e}" + ) + + # Pull prediction info + pred_exec: list[list[str]] = getattr(pred, "executable_responses", []) or [] + evaluation: dict[str, Any] | None = getattr(pred, "evaluation", None) + eval_error: str | None = getattr(pred, "eval_error", None) + + # Compute hard-valid (pass/fail) + hard_valid = False + if evaluation and isinstance(evaluation, dict): + hard_valid = bool(evaluation.get("validation", {}).get("valid", False)) + + # Compute soft score + if gt: + soft = soft_sequence_score(gt, pred_exec) + else: + soft = 1.0 if hard_valid else 0.0 + + # Final score + final_score = 0.9*(1.0 if hard_valid else 0.0) + 0.1*soft + + # Train/dev split + split = None + if RUN_CTX and test_id: + if test_id in RUN_CTX.train_ids: + split = "train" + elif test_id in RUN_CTX.dev_ids: + split = "dev" + else: + split = "unknown" + + feedback_parts.append(f"RESULT: {'PASS' if hard_valid else 'FAIL'}") + feedback_parts.append( + f"SCORE_BREAKDOWN: hard={'1.0' if hard_valid else '0.0'} " + f"soft={soft:.3f} final={final_score:.3f}" + ) + if split: + feedback_parts.append(f"SPLIT: {split}") + + if involved_classes: + feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") + if excluded: + feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") + + if evaluation and isinstance(evaluation, dict): + validation = evaluation.get("validation", {}) + irrelevance = evaluation.get("irrelevance_check", {}) + feedback_parts.append("EVALUATOR_VALIDATION:") + if isinstance(validation, dict): + for k in ["valid", "reason", "error_type", "error_message"]: + if k in validation: + feedback_parts.append(f" {k}: {validation.get(k)}") + else: + feedback_parts.append(f" validation: {validation}") + + if isinstance(irrelevance, dict) and irrelevance: + feedback_parts.append("EVALUATOR_IRRELEVANCE_CHECK:") + for k in ["is_irrelevant", "reason"]: + if k in irrelevance: + feedback_parts.append(f" {k}: {irrelevance.get(k)}") + + if eval_error: + feedback_parts.append(f"EVAL_ERROR: {eval_error}") + + if gt: + feedback_parts.append("EXECUTABLE_DIFF:") + feedback_parts.append(diff_summary(gt, pred_exec)) + + if excluded and pred_exec: + used_fns = {fn_name(s) for turn in pred_exec for s in turn} + bad = sorted(set(excluded) & used_fns) + if bad: + feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") + + if hasattr(pred, "behavior"): + feedback_parts.append("BEHAVIOR_SUMMARY:") + feedback_parts.append(str(pred.behavior)) + + run_dir = getattr(pred, "run_dir", None) + if run_dir: + feedback_parts.append(f"RUN_DIR: {run_dir}") + + # Log the record + if RUN_CTX and test_id: + record = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + "test_id": test_id, + "split": split, + "instruction_hash": getattr(pred, "instruction_hash", None), + "hard_valid": hard_valid, + "soft": 
soft, + "final": final_score, + "timing": getattr(pred, "timing", None), + "run_dir": run_dir, + "eval_error": eval_error, + "evaluator_validation": ( + safe_json(evaluation.get("validation")) + if isinstance(evaluation, dict) + else None + ), + "evaluator_irrelevance": ( + safe_json(evaluation.get("irrelevance_check")) + if isinstance(evaluation, dict) + else None + ), + } + append_jsonl(RUN_CTX.metric_calls_path, record) + + # Candidate snapshot + snap = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + "instruction_hash": getattr(pred, "instruction_hash", None), + "instruction_text": getattr(pred, "instruction_text", None), + "latest_eval": { + "test_id": test_id, + "split": split, + "hard_valid": hard_valid, + "soft": soft, + "final": final_score, + }, + } + append_jsonl(RUN_CTX.candidate_snapshots_path, snap) + + return MetricFeedback(score=final_score, feedback="\n".join(feedback_parts)) \ No newline at end of file diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index e69de29..4473388 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -0,0 +1,349 @@ +""" +run.py + +Orchestrator for running GEPA-based instruction optimization +experiments on BFCL tests with logging/artifacts + +Run once per experiment with +`python -m experiments.gepa_bfcl.run --instruction-file path/to/instruction.txt [other options]` +""" + +from __future__ import annotations +import argparse +import json +import os +import platform +import sys +import time +import uuid +from pathlib import Path +from typing import Any +import shlex + +import dspy +from dspy.teleprompt import GEPA + +from .agent import BFCLAgent +from .data_utils import load_test_cases +from .metrics import bfcl_metric_with_feedback, build_score_definition +from .logging_utils import ( + RUN_CTX, + RunContext, + TeeIO, + append_jsonl, + safe_json, + sha256_text, + try_git_info, + utc_now_iso, +) + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run GEPA instruction optimization on BFCL" + ) + + parser.add_argument("--test-subset", default="multi_turn_base") + parser.add_argument("--num-tests", type=int, default=10) #TODO: FIND A WAY TO MAKE THIS RUN ON ALL TEST CASES SIMPLY + + parser.add_argument("--model", default="gpt-5") + parser.add_argument("--reflection-model", default="gpt-5") + + parser.add_argument("--max-evaluations", type=int, default=20) + parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) + + parser.add_argument( + "--instruction-file", + type=Path, + required=True, + help="Path to initial instruction prompt.", + ) + + parser.add_argument( + "--output-dir", + type=Path, + default=Path("outputs/gepa_on_bfcl"), + ) + + parser.add_argument("--pytest-binary", default="pytest") + parser.add_argument("--gepa-scoring-mode", action="store_true") + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + args.output_dir.mkdir(parents=True, exist_ok=True) + + # Console mirroring + console_log_path = args.output_dir / "console.log" + console_log_f = console_log_path.open("w", encoding="utf-8") + real_out, real_err = sys.stdout, sys.stderr + sys.stdout = TeeIO(real_out, console_log_f) + sys.stderr = TeeIO(real_err, console_log_f) + + # Metadata initialization + overall_t0 = time.perf_counter() + timings: dict[str, float] = {} + run_id = f"{time.strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" + + # ---- Persist exact rerun command ---- + python_executable = sys.executable + script_path = 
Path(__file__).resolve() + + argv = [python_executable, str(script_path)] + sys.argv[1:] + command_str = shlex.join(argv) + + command_path = args.output_dir / "command.sh" + command_path.write_text( + "#!/usr/bin/env bash\n\n" + command_str + "\n", + encoding="utf-8", + ) + + # Make it executable for convenience + command_path.chmod(0o755) + + + metric_calls_path = args.output_dir / "metric_calls.jsonl" + candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" + score_definition = build_score_definition() + + try: + print(f"[{utc_now_iso()}] RUN_ID={run_id}") + print(f"[{utc_now_iso()}] output_dir={args.output_dir}") + + # Load dataset + t_load = time.perf_counter() + examples = load_test_cases(args.test_subset, args.num_tests) + + train_size = int(0.7 * len(examples)) + trainset, devset = examples[:train_size], examples[train_size+1:] + timings["load_dataset_s"] = time.perf_counter() - t_load + + # Split dataset + train_ids = {e.test_id for e in trainset} + dev_ids = {e.test_id for e in devset} + + (args.output_dir / "dataset_split.json").write_text( + json.dumps( + { + "run_id": run_id, + "test_subset": args.test_subset, + "num_tests": args.num_tests, + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + indent=2, + ), + encoding="utf-8", + ) + + # Initialize global run context + global RUN_CTX + RUN_CTX = RunContext( + run_id=run_id, + output_dir=args.output_dir, + metric_calls_path=metric_calls_path, + candidate_snapshots_path=candidate_snapshots_path, + train_ids=train_ids, + dev_ids=dev_ids, + score_definition=score_definition + ) + + # Load initial instructions + instruction_text = args.instruction_file.read_text(encoding="utf-8") + instruction_hash = sha256_text(instruction_text) + + # Write the run manifest + manifest = { + "run_id": run_id, + "created_at": utc_now_iso(), + "argv": sys.argv, + "args": safe_json(vars(args)), + "instruction_file": str(args.instruction_file), + "instruction_hash": instruction_hash, + "score_definition": score_definition, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + "environment": { + "python": sys.version, + "platform": platform.platform(), + "cwd": os.getcwd(), + }, + **try_git_info(), + } + (args.output_dir / "run_manifest.json").write_text( + json.dumps(manifest, indent=2), + encoding="utf-8", + ) + + # Create agent + agent = BFCLAgent( + instruction_text=instruction_text, + model=args.model, + base_dir=args.output_dir, + pytest_binary=args.pytest_binary, + enable_scoring_mode=args.gepa_scoring_mode, + ) + + # Run and evaluate baseline - no GEPA! 
+ t_base = time.perf_counter() + baseline_valid = 0 + baseline_details: list[dict[str, Any]] = [] + + for ex in examples: + pred = agent(test_id=ex.test_id, question=ex.question) + + valid = False + if pred.evaluation: + valid = bool( + pred.evaluation.get("validation", {}).get("valid", False) + ) + + baseline_valid += int(valid) + baseline_details.append( + { + "test_id": ex.test_id, + "valid": valid, + "run_dir": pred.run_dir, + "eval_error": pred.eval_error, + } + ) + + timings["baseline_s"] = time.perf_counter() - t_base + + # Persist baseline + baseline_valid_rate = baseline_valid / max(len(examples), 1) + + (args.output_dir / "baseline.json").write_text( + json.dumps( + { + "run_id": run_id, + "instruction_hash": instruction_hash, + "bfcl_valid_rate": baseline_valid_rate, + "valid": baseline_valid, + "total": len(examples), + "runs": baseline_details, + }, + indent=2, + ), + encoding="utf-8", + ) + + print( + f"[{utc_now_iso()}] Baseline BFCL valid rate: " + f"{baseline_valid_rate:.3f} ({baseline_valid}/{len(examples)})" + ) + + # Finalize GEPA parameters + t_gepa = time.perf_counter() + + reflection_lm = dspy.LM(args.reflection_model) + dspy.configure(lm=reflection_lm) + gepa_kwargs: dict[str, Any] = { + "metric": bfcl_metric_with_feedback, + "reflection_lm": reflection_lm, + "track_stats": True, + "log_dir": str(args.output_dir / "gepa_logs"), + "seed": 42, + } + + if args.auto is not None: + gepa_kwargs["auto"] = args.auto + else: + gepa_kwargs["max_full_evals"] = args.max_evaluations + + (args.output_dir / "gepa_config.json").write_text( + json.dumps(safe_json(gepa_kwargs), indent=2), + encoding="utf-8", + ) + + # Create and run GEPA optimizer + gepa = GEPA(**gepa_kwargs) + optimized_agent = gepa.compile( + agent, + trainset=trainset, + valset=devset, + ) + + results = optimized_agent.detailed_results + timings["gepa_compile_s"] = time.perf_counter() - t_gepa + + # Final candidates summary (still useful) + candidates = [] + for i, cand in enumerate(results.candidates): + instr = cand.get_instruction_text() + candidates.append( + { + "candidate_id": i, + "instruction_hash": sha256_text(instr), + "instruction_text": instr, + "val_score": results.val_aggregate_scores[i], + "discovered_at_metric_call": results.discovery_eval_counts[i], + "parents": results.parents[i], + } + ) + (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2), encoding="utf-8") + + # Pareto + best_ids = set().union(*results.per_val_instance_best_candidates) + with open(args.output_dir / "gepa_pareto.txt", "w", encoding="utf-8") as f: + f.write("GEPA Pareto Frontier\n====================\n\n") + for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): + f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") + f.write("-" * 40 + "\n") + f.write(results.candidates[i].get_instruction_text() + "\n\n") + + final_instr = optimized_agent.get_instruction_text() + (args.output_dir / "optimized_instructions.txt").write_text(final_instr, encoding="utf-8") + + # Scores file (explicit: which examples and how computed) + scores_payload = { + "run_id": run_id, + "score_definition": score_definition, + "dataset_split": { + "train_ids": sorted(train_ids), + "dev_ids": sorted(dev_ids), + }, + "baseline": { + "bfcl_valid_rate_over_all_examples": baseline_valid_rate, + "examples_used": [e.test_id for e in examples], + "valid_count": baseline_valid, + "total_count": len(examples), + }, + "gepa": { + "objective": "final (0.9*hard_valid + 0.1*soft) 
aggregated over dev set by GEPA internals", + "val_aggregate_scores": safe_json(results.val_aggregate_scores), + "candidate_count": len(results.candidates), + }, + "note": "For per-evaluation, per-test, per-step details see metric_calls.jsonl (append-only).", + } + (args.output_dir / "scores.json").write_text(json.dumps(scores_payload, indent=2), encoding="utf-8") + + # Metadata + timings + timings["total_wall_s"] = time.perf_counter() - overall_t0 + (args.output_dir / "timings.json").write_text(json.dumps({"run_id": run_id, **timings}, indent=2), encoding="utf-8") + + meta = { + "run_id": run_id, + "baseline_bfcl_valid_rate": baseline_valid_rate, + "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, + "total_metric_calls": results.total_metric_calls, + "num_full_val_evals": results.num_full_val_evals, + "seed": results.seed, + } + (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") + + print(f"[{utc_now_iso()}] Done. See {args.output_dir}/run_manifest.json, scores.json, metric_calls.jsonl") + + + finally: + sys.stdout.flush() + sys.stderr.flush() + sys.stdout = real_out + sys.stderr = real_err + console_log_f.close() + +if __name__ == "__main__": + main() \ No newline at end of file From 84aa5d1445c87b86ac03e83ea6dc2cd5fa2a8a6e Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 3 Jan 2026 19:55:08 -0800 Subject: [PATCH 13/23] =?UTF-8?q?Metric=20doesn=E2=80=99t=20include=20invo?= =?UTF-8?q?lved=20classes,=20excluded=20functions,=20or=20constraint=20vio?= =?UTF-8?q?lation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 1 - experiments/gepa_bfcl/metrics.py | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 4f0ac22..8241a94 100644 --- a/.gitignore +++ b/.gitignore @@ -53,7 +53,6 @@ fastagent.secrets.yaml outputs/ output*/ results/ -experiments/ fastagent.jsonl test_script_*.py .claude/ diff --git a/experiments/gepa_bfcl/metrics.py b/experiments/gepa_bfcl/metrics.py index d4ea2fd..867af60 100644 --- a/experiments/gepa_bfcl/metrics.py +++ b/experiments/gepa_bfcl/metrics.py @@ -104,10 +104,10 @@ def bfcl_metric_with_feedback( if split: feedback_parts.append(f"SPLIT: {split}") - if involved_classes: - feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") - if excluded: - feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") + # if involved_classes: + # feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") + # if excluded: + # feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") if evaluation and isinstance(evaluation, dict): validation = evaluation.get("validation", {}) @@ -136,8 +136,8 @@ def bfcl_metric_with_feedback( if excluded and pred_exec: used_fns = {fn_name(s) for turn in pred_exec for s in turn} bad = sorted(set(excluded) & used_fns) - if bad: - feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") + # if bad: + # feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") if hasattr(pred, "behavior"): feedback_parts.append("BEHAVIOR_SUMMARY:") From ba3921241585976422c6f30adaab2cb50d1c8135 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 3 Jan 2026 20:10:06 -0800 Subject: [PATCH 14/23] bfcl test cases can now be shuffled and run on an entire subset --- experiments/gepa_bfcl/data_utils.py | 2 +- 
experiments/gepa_bfcl/run.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/experiments/gepa_bfcl/data_utils.py b/experiments/gepa_bfcl/data_utils.py index 22f8242..ed56bdf 100644 --- a/experiments/gepa_bfcl/data_utils.py +++ b/experiments/gepa_bfcl/data_utils.py @@ -35,7 +35,7 @@ def stringify_question(question: Any) -> str: return "" -def load_test_cases(subset: str, limit: int,) -> List[BFCLExample]: +def load_test_cases(subset: str, limit: int | None = None) -> List[BFCLExample]: """ Load BFCL test cases from a given subset and return as BFCLExample objects """ diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index 4473388..11287e8 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -19,6 +19,7 @@ from pathlib import Path from typing import Any import shlex +import random import dspy from dspy.teleprompt import GEPA @@ -41,9 +42,11 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Run GEPA instruction optimization on BFCL" ) - + parser.add_argument("--test-subset", default="multi_turn_base") - parser.add_argument("--num-tests", type=int, default=10) #TODO: FIND A WAY TO MAKE THIS RUN ON ALL TEST CASES SIMPLY + parser.add_argument("--shuffle", action="store_true") + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--num-tests", type=int, default=None) parser.add_argument("--model", default="gpt-5") parser.add_argument("--reflection-model", default="gpt-5") @@ -113,10 +116,20 @@ def main() -> None: # Load dataset t_load = time.perf_counter() - examples = load_test_cases(args.test_subset, args.num_tests) + all_examples = load_test_cases(args.test_subset, limit=None) + rng = random.Random(args.seed) + examples = list(all_examples) + if args.shuffle: + rng.shuffle(examples) + + if args.num_tests is not None: + examples = examples[: args.num_tests] + + train_size = int(0.7 * len(examples)) - trainset, devset = examples[:train_size], examples[train_size+1:] + trainset = examples[:train_size] + devset = examples[train_size:] timings["load_dataset_s"] = time.perf_counter() - t_load # Split dataset @@ -128,7 +141,10 @@ def main() -> None: { "run_id": run_id, "test_subset": args.test_subset, + "shuffle": args.shuffle, + "seed": args.seed, "num_tests": args.num_tests, + "examples_used_ordered": [e.test_id for e in examples], "train_ids": sorted(train_ids), "dev_ids": sorted(dev_ids), }, @@ -136,6 +152,7 @@ def main() -> None: ), encoding="utf-8", ) + # Initialize global run context global RUN_CTX From cbe74cb4bbaeb55f7d12d3245d1e5583ddadee52 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 3 Jan 2026 20:10:18 -0800 Subject: [PATCH 15/23] removed soft score --- experiments/gepa_bfcl/metrics.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/experiments/gepa_bfcl/metrics.py b/experiments/gepa_bfcl/metrics.py index 867af60..4271a41 100644 --- a/experiments/gepa_bfcl/metrics.py +++ b/experiments/gepa_bfcl/metrics.py @@ -23,19 +23,15 @@ def __init__(self, score: float, feedback: str): def build_score_definition() -> dict[str, Any]: - """ - Returns a description of how scores are computed - """ return { "hard_valid": "BFCL evaluator validation.valid (boolean) from multi_turn_checker", - "soft": "turn-wise function-name overlap F1-like score (ignores args), averaged across turns", - "final": "0.9*hard_valid + 0.1*soft", + "final": "1.0 if hard_valid else 0.0", "note": ( - "Optimization and 
candidate scores use `final`." - "Hard validity is the primary objective; " - "soft score provides shaping for optimization." + "Optimization and candidate scores use only hard validity. " + "No soft or shaping score is applied." ) - } + } + def bfcl_metric_with_feedback( gold: dspy.Example, @@ -77,14 +73,9 @@ def bfcl_metric_with_feedback( if evaluation and isinstance(evaluation, dict): hard_valid = bool(evaluation.get("validation", {}).get("valid", False)) - # Compute soft score - if gt: - soft = soft_sequence_score(gt, pred_exec) - else: - soft = 1.0 if hard_valid else 0.0 - # Final score - final_score = 0.9*(1.0 if hard_valid else 0.0) + 0.1*soft + final_score = 1.0 if hard_valid else 0.0 + # Train/dev split split = None @@ -98,8 +89,7 @@ def bfcl_metric_with_feedback( feedback_parts.append(f"RESULT: {'PASS' if hard_valid else 'FAIL'}") feedback_parts.append( - f"SCORE_BREAKDOWN: hard={'1.0' if hard_valid else '0.0'} " - f"soft={soft:.3f} final={final_score:.3f}" + f"SCORE: {'1.0' if hard_valid else '0.0'} (hard_valid)" ) if split: feedback_parts.append(f"SPLIT: {split}") @@ -156,7 +146,6 @@ def bfcl_metric_with_feedback( "split": split, "instruction_hash": getattr(pred, "instruction_hash", None), "hard_valid": hard_valid, - "soft": soft, "final": final_score, "timing": getattr(pred, "timing", None), "run_dir": run_dir, @@ -184,7 +173,6 @@ def bfcl_metric_with_feedback( "test_id": test_id, "split": split, "hard_valid": hard_valid, - "soft": soft, "final": final_score, }, } From a5cf3cafa188c9286aa509efe070892f00f602f9 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 6 Jan 2026 00:32:20 -0800 Subject: [PATCH 16/23] agent and reflection LMs are separated --- experiments/gepa_bfcl.py | 11 ++++++++++- experiments/gepa_bfcl/agent.py | 11 ++++++++++- experiments/gepa_overview.txt | 12 ++++++++++++ tests/benchmarks/bfcl/instruction.txt | 2 -- 4 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 experiments/gepa_overview.txt diff --git a/experiments/gepa_bfcl.py b/experiments/gepa_bfcl.py index 3c1a356..30b9eb1 100644 --- a/experiments/gepa_bfcl.py +++ b/experiments/gepa_bfcl.py @@ -530,7 +530,7 @@ def main(): parser.add_argument("--test-subset", default="multi_turn_base") parser.add_argument("--num-tests", type=int, default=10) parser.add_argument("--model", default="gpt-5") - parser.add_argument("--reflection-model", default="gpt-5") + parser.add_argument("--reflection-model", default="gpt-5-mini") parser.add_argument("--max-evaluations", type=int, default=20) parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa_on_bfcl")) parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) @@ -611,6 +611,10 @@ def main(): "instruction_file": str(args.instruction_file), "instruction_hash": instruction_hash, "score_definition": score_def, + "models": { + "agent_model": args.model, + "reflection_model": args.reflection_model, + }, "dataset_split": { "train_ids": sorted(train_ids), "dev_ids": sorted(dev_ids), @@ -627,6 +631,7 @@ def main(): agent = BFCLAgent( instruction_text=instruction_text, model=args.model, + execution_lm=execution_lm, base_dir=args.output_dir, pytest_binary=args.pytest_binary, enable_scoring_mode=args.gepa_scoring_mode, @@ -678,6 +683,8 @@ def main(): # GEPA t_gepa = time.perf_counter() reflection_lm = dspy.LM(args.reflection_model) + execution_lm = dspy.LM(args.model) + dspy.configure(lm=reflection_lm) gepa_kwargs: dict[str, Any] = dict( @@ -691,6 +698,8 @@ def main(): gepa_kwargs["auto"] = args.auto else: 
gepa_kwargs["max_full_evals"] = args.max_evaluations + + gepa_kwargs["reflection_lm"] = args.reflection_model # Persist GEPA config/hparams exactly (args.output_dir / "gepa_config.json").write_text(json.dumps(safe_json(gepa_kwargs), indent=2), encoding="utf-8") diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index 14c2f8e..7fad00b 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -4,6 +4,10 @@ DSPy module wrapper for running BFCL tests with pytest """ +# IMPORTANT: +# All DSPy modules in BFCLAgent must explicitly use execution_lm. +# Never rely on dspy.settings.lm here. + from __future__ import annotations import json import subprocess @@ -46,12 +50,14 @@ def __init__( self, instruction_text: str, model: str, + execution_lm: dspy.LM, base_dir: Path, pytest_binary: str, enable_scoring_mode: bool ): super().__init__() self.model = model + self.execution_lm = execution_lm self.base_dir = base_dir self.base_dir.mkdir(parents=True, exist_ok=True) self.pytest_binary = pytest_binary @@ -68,7 +74,10 @@ def __init__( # dspy.Predict handles logic of constructing prompt # and sending it to the LM - self.prompt_predictor = dspy.Predict(signature) + self.prompt_predictor = dspy.Predict( + signature, + lm=self.execution_lm, + ) def forward(self, test_id: str, question: str) -> dspy.Prediction: diff --git a/experiments/gepa_overview.txt b/experiments/gepa_overview.txt new file mode 100644 index 0000000..255d57d --- /dev/null +++ b/experiments/gepa_overview.txt @@ -0,0 +1,12 @@ +for step in optimization: + select candidate(s) + run agent on train examples + compute metric → (score, feedback) + build reflection prompt containing: + - current instruction + - feedback summaries + - scores + - possibly history + ask reflection LM: + "Propose an improved instruction" + parse LM output into a new instruction candidate \ No newline at end of file diff --git a/tests/benchmarks/bfcl/instruction.txt b/tests/benchmarks/bfcl/instruction.txt index 8bf4645..b2d9568 100644 --- a/tests/benchmarks/bfcl/instruction.txt +++ b/tests/benchmarks/bfcl/instruction.txt @@ -8,5 +8,3 @@ You should only return the function calls in your response. You SHOULD NOT inclu At each turn, you should try your best to complete the tasks requested by the user within the current turn. Continue to output functions to call until you have fulfilled the user's request to the best of your ability. Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task. - -{{serverInstructions}} From a6d392283e65e4b0cf97aad8a4466bdcef4398c9 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Mon, 12 Jan 2026 13:55:53 -0800 Subject: [PATCH 17/23] Enhancing how models are separated --- experiments/gepa_bfcl/agent.py | 15 ++---- experiments/gepa_bfcl/env_utils.py | 53 +++++++++++++++++++++ experiments/gepa_bfcl/run.py | 56 +++++++++++++++++++++-- tests/benchmarks/bfcl/instruction_old.txt | 12 +++++ 4 files changed, 121 insertions(+), 15 deletions(-) create mode 100644 experiments/gepa_bfcl/env_utils.py create mode 100644 tests/benchmarks/bfcl/instruction_old.txt diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index 7fad00b..d2f88e2 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -4,10 +4,6 @@ DSPy module wrapper for running BFCL tests with pytest """ -# IMPORTANT: -# All DSPy modules in BFCLAgent must explicitly use execution_lm. -# Never rely on dspy.settings.lm here. 
- from __future__ import annotations import json import subprocess @@ -74,10 +70,7 @@ def __init__( # dspy.Predict handles logic of constructing prompt # and sending it to the LM - self.prompt_predictor = dspy.Predict( - signature, - lm=self.execution_lm, - ) + self.prompt_predictor = dspy.Predict(signature) def forward(self, test_id: str, question: str) -> dspy.Prediction: @@ -91,10 +84,12 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: # dspy trace anchor try: t_trace = time.perf_counter() - _ = self.prompt_predictor(prompt_input=question) + with dspy.context(lm=self.execution_lm): + _ = self.prompt_predictor(prompt_input=question) timing["dspy_trace_anchor_s"] = time.perf_counter() - t_trace - except Exception: + except Exception as e: timing["dspy_trace_anchor_s"] = 0.0 + print(f"[TRACE_ANCHOR_ERROR] {type(e).__name__}: {e}") # Write current instruction instruction_text = self.get_instruction_text() diff --git a/experiments/gepa_bfcl/env_utils.py b/experiments/gepa_bfcl/env_utils.py new file mode 100644 index 0000000..e1566e5 --- /dev/null +++ b/experiments/gepa_bfcl/env_utils.py @@ -0,0 +1,53 @@ +""" +env_utils.py + +Environment validation util functions +""" + +import sys +from typing import Any, List +import os + +MODEL_PROVIDER_ENV_VARS = { + # OpenAI + "gpt-": ["OPENAI_API_KEY"], + + # Anthropic + "claude-": ["ANTHROPIC_API_KEY"], + + # Qwen + "qwen-": ["QWEN_API_KEY"], + + # Kimi + "kimi-": ["KIMI_API_KEY"], +} + +def validate_model_environment(models: List[str]) -> None: + """ + Validate that required environment variables are set + for the requested models. Exit early if misconfigured. + """ + missing: dict[str, List[str]] = {} + + for model in models: + for prefix, env_vars in MODEL_PROVIDER_ENV_VARS.items(): + if model.startswith(prefix): + for env in env_vars: + val = os.getenv(env) + if not val or is_invalid_key(val): + missing.setdefault(model, []).append(env) + + if missing: + print("\n[CONFIG ERROR] Missing required environment variables:\n") + for model, envs in missing.items(): + print(f" Model '{model}' requires:") + for env in envs: + print(f" - {env}") + print( + "\nSet the missing variables and re-run. 
" + "No artifacts were produced for this run.\n" + ) + sys.exit(2) + +def is_invalid_key(value: str) -> bool: + return value.strip() == "" or value.lower().startswith("your_") \ No newline at end of file diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index 11287e8..82e57bb 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -27,6 +27,7 @@ from .agent import BFCLAgent from .data_utils import load_test_cases from .metrics import bfcl_metric_with_feedback, build_score_definition +from .env_utils import validate_model_environment from .logging_utils import ( RUN_CTX, RunContext, @@ -49,7 +50,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--num-tests", type=int, default=None) parser.add_argument("--model", default="gpt-5") - parser.add_argument("--reflection-model", default="gpt-5") + parser.add_argument("--reflection-model", default="gpt-5-mini") parser.add_argument("--max-evaluations", type=int, default=20) parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) @@ -75,6 +76,9 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + + validate_model_environment([args.model, args.reflection_model]) + args.output_dir.mkdir(parents=True, exist_ok=True) # Console mirroring @@ -108,6 +112,7 @@ def main() -> None: metric_calls_path = args.output_dir / "metric_calls.jsonl" candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" + reflection_calls_path = args.output_dir / "reflection_calls.jsonl" score_definition = build_score_definition() try: @@ -179,6 +184,10 @@ def main() -> None: "instruction_file": str(args.instruction_file), "instruction_hash": instruction_hash, "score_definition": score_definition, + "models": { + "agent_model": args.model, + "reflection_model": args.reflection_model + }, "dataset_split": { "train_ids": sorted(train_ids), "dev_ids": sorted(dev_ids), @@ -195,10 +204,19 @@ def main() -> None: encoding="utf-8", ) + # Create LMs + reflection_lm = dspy.LM(args.reflection_model) + execution_lm = dspy.LM(args.model) + + # Always configure a global LM (reflection-only by policy) + dspy.configure(lm=reflection_lm) + + # Create agent agent = BFCLAgent( instruction_text=instruction_text, model=args.model, + execution_lm=execution_lm, base_dir=args.output_dir, pytest_binary=args.pytest_binary, enable_scoring_mode=args.gepa_scoring_mode, @@ -255,9 +273,6 @@ def main() -> None: # Finalize GEPA parameters t_gepa = time.perf_counter() - - reflection_lm = dspy.LM(args.reflection_model) - dspy.configure(lm=reflection_lm) gepa_kwargs: dict[str, Any] = { "metric": bfcl_metric_with_feedback, "reflection_lm": reflection_lm, @@ -278,12 +293,43 @@ def main() -> None: # Create and run GEPA optimizer gepa = GEPA(**gepa_kwargs) + + reflection_lm.history.clear() optimized_agent = gepa.compile( agent, trainset=trainset, valset=devset, ) + for i, entry in enumerate(reflection_lm.history): + record = { + "ts": entry.get("timestamp"), + "run_id": run_id, + "call_index": i, + "model": entry.get("model") or args.reflection_model, + "model_type": entry.get("model_type"), + + # Prompting + "prompt": entry.get("prompt"), + "messages": entry.get("messages"), + + # Outputs + "raw_response": entry.get("response"), + "outputs": entry.get("outputs"), + + # Generation config + "kwargs": entry.get("kwargs"), + + # Usage & cost + "usage": entry.get("usage"), + "cost": entry.get("cost"), + + # Traceability + "uuid": entry.get("uuid"), + } + + append_jsonl(reflection_calls_path, 
safe_json(record)) + results = optimized_agent.detailed_results timings["gepa_compile_s"] = time.perf_counter() - t_gepa @@ -330,7 +376,7 @@ def main() -> None: "total_count": len(examples), }, "gepa": { - "objective": "final (0.9*hard_valid + 0.1*soft) aggregated over dev set by GEPA internals", + "objective": "binary hard_valid (1.0 pass / 0.0 fail) aggregated over dev set by GEPA", "val_aggregate_scores": safe_json(results.val_aggregate_scores), "candidate_count": len(results.candidates), }, diff --git a/tests/benchmarks/bfcl/instruction_old.txt b/tests/benchmarks/bfcl/instruction_old.txt new file mode 100644 index 0000000..0b61c05 --- /dev/null +++ b/tests/benchmarks/bfcl/instruction_old.txt @@ -0,0 +1,12 @@ +You are an expert in composing functions. You are given a question and a set of possible functions. +Based on the question, you will need to make one or more function/tool calls to achieve the purpose. +If none of the functions can be used, point it out. +If the given question lacks the parameters required by the function, also point it out. + +You should only return the function calls in your response. You SHOULD NOT include any other text in the response. + +At each turn, you should try your best to complete the tasks requested by the user within the current turn. +Continue to output functions to call until you have fulfilled the user's request to the best of your ability. +Once you have no more functions to call, the system will consider the current turn complete and proceed to the next turn or task. + +{{serverInstructions}} \ No newline at end of file From 3f0d69afa874496bc3622bf162dd3190bac7cb3a Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 13 Jan 2026 14:21:08 -0800 Subject: [PATCH 18/23] Specific test cases/range can be specified in args --- experiments/gepa_bfcl/data_utils.py | 31 ++++++++++++ experiments/gepa_bfcl/env_utils.py | 1 + experiments/gepa_bfcl/run.py | 73 +++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/experiments/gepa_bfcl/data_utils.py b/experiments/gepa_bfcl/data_utils.py index ed56bdf..4cadbc8 100644 --- a/experiments/gepa_bfcl/data_utils.py +++ b/experiments/gepa_bfcl/data_utils.py @@ -48,3 +48,34 @@ def load_test_cases(subset: str, limit: int | None = None) -> List[BFCLExample]: examples.append(ex.with_inputs("test_id", "question")) return examples + + +def extract_test_number(test_id: str) -> int | None: + try: + return int(test_id.rsplit("_", 1)[-1]) + except ValueError: + return None + + +def parse_test_number_spec(spec: str) -> set[int]: + numbers: set[int] = set() + + for part in spec.split(","): + part = part.strip() + if not part: + continue + + if "-" in part: + start_s, end_s = part.split("-", 1) + start, end = int(start_s), int(end_s) + + if start > end: + raise ValueError( + f"Invalid test number range: {start}-{end}" + ) + + numbers.update(range(start, end + 1)) + else: + numbers.add(int(part)) + + return numbers diff --git a/experiments/gepa_bfcl/env_utils.py b/experiments/gepa_bfcl/env_utils.py index e1566e5..91ae944 100644 --- a/experiments/gepa_bfcl/env_utils.py +++ b/experiments/gepa_bfcl/env_utils.py @@ -49,5 +49,6 @@ def validate_model_environment(models: List[str]) -> None: ) sys.exit(2) + def is_invalid_key(value: str) -> bool: return value.strip() == "" or value.lower().startswith("your_") \ No newline at end of file diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index 82e57bb..5f3d7ac 100644 --- a/experiments/gepa_bfcl/run.py +++ 
b/experiments/gepa_bfcl/run.py @@ -25,7 +25,7 @@ from dspy.teleprompt import GEPA from .agent import BFCLAgent -from .data_utils import load_test_cases +from .data_utils import load_test_cases, extract_test_number, parse_test_number_spec from .metrics import bfcl_metric_with_feedback, build_score_definition from .env_utils import validate_model_environment from .logging_utils import ( @@ -48,9 +48,10 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--shuffle", action="store_true") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--num-tests", type=int, default=None) - - parser.add_argument("--model", default="gpt-5") - parser.add_argument("--reflection-model", default="gpt-5-mini") + parser.add_argument("--test-numbers", type=str, default=None) + + parser.add_argument("--model", default="gpt-5-mini") + parser.add_argument("--reflection-model", default="gpt-5") parser.add_argument("--max-evaluations", type=int, default=20) parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) @@ -118,18 +119,52 @@ def main() -> None: try: print(f"[{utc_now_iso()}] RUN_ID={run_id}") print(f"[{utc_now_iso()}] output_dir={args.output_dir}") + + selected_test_numbers: set[int] | None = None + if args.test_numbers: + selected_test_numbers = parse_test_number_spec(args.test_numbers) # Load dataset t_load = time.perf_counter() all_examples = load_test_cases(args.test_subset, limit=None) - - rng = random.Random(args.seed) + examples = list(all_examples) + + # Explicit numeric test selection + if selected_test_numbers is not None: + before = len(examples) + + matched = [] + matched_numbers = set() + + for e in examples: + num = extract_test_number(e.test_id) + if num in selected_test_numbers: + matched.append(e) + matched_numbers.add(num) + + examples = matched + after = len(examples) + + print( + f"[{utc_now_iso()}] Selected tests by number: " + f"{sorted(matched_numbers)} ({after}/{len(selected_test_numbers)} found" + ) + + # ---- Shuffle & slice ---- + rng = random.Random(args.seed) + if args.shuffle: rng.shuffle(examples) - + if args.num_tests is not None: - examples = examples[: args.num_tests] + if selected_test_numbers is not None: + print( + f"[{utc_now_iso()}] --test-numbers provided; ignoring --num-tests" + ) + else: + examples = examples[: args.num_tests] + train_size = int(0.7 * len(examples)) @@ -152,12 +187,19 @@ def main() -> None: "examples_used_ordered": [e.test_id for e in examples], "train_ids": sorted(train_ids), "dev_ids": sorted(dev_ids), + "test_number_selection": ( + sorted(selected_test_numbers) if selected_test_numbers is not None else None + ), + "selection_mode": ( + "explicit_numbers" if selected_test_numbers is not None + else "first_n" if args.num_tests is not None + else "all" + ), }, indent=2, ), encoding="utf-8", ) - # Initialize global run context global RUN_CTX @@ -184,6 +226,19 @@ def main() -> None: "instruction_file": str(args.instruction_file), "instruction_hash": instruction_hash, "score_definition": score_definition, + "test_selection": { + "mode": ( + "explicit_numbers" if selected_test_numbers is not None + else "first_n" if args.num_tests is not None + else "all" + ), + "test_numbers": ( + sorted(selected_test_numbers) if selected_test_numbers is not None else None + ), + "num_tests": args.num_tests, + "shuffle": args.shuffle, + "seed": args.seed, + }, "models": { "agent_model": args.model, "reflection_model": args.reflection_model From 757eaa54960f6f76a734dccd75476a1564eb8462 Mon Sep 17 00:00:00 2001 
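
# Editor's note (not part of the patches): usage of the --test-numbers selector
# added in PATCH 18. parse_test_number_spec() accepts comma-separated numbers
# and inclusive ranges, and extract_test_number() matches them against the
# numeric suffix of each BFCL test id; when --test-numbers is given,
# --num-tests is ignored. Assumes the repo root is on sys.path.
from experiments.gepa_bfcl.data_utils import extract_test_number, parse_test_number_spec

assert parse_test_number_spec("3,7,10-12") == {3, 7, 10, 11, 12}
assert extract_test_number("multi_turn_base_12") == 12
# e.g. `--test-subset multi_turn_base --test-numbers 3,7,10-12` selects only
# multi_turn_base_3, _7, _10, _11 and _12.
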
From: Parth Kotwal Date: Tue, 13 Jan 2026 14:40:02 -0800 Subject: [PATCH 19/23] Log how each run of a test case is mapped to which instruction --- experiments/gepa_bfcl/agent.py | 48 +++++++++++++++++++++++++- experiments/gepa_bfcl/logging_utils.py | 13 ++++++- experiments/gepa_bfcl/run.py | 7 ++-- 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index d2f88e2..f234c02 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -14,7 +14,8 @@ import dspy from tests.benchmarks.bfcl import evaluator as bfcl_evaluator from tests.utils.fastagent_helpers import MessageSerializer -from .logging_utils import sha256_text +from .logging_utils import sha256_text, RUN_CTX, append_jsonl, utc_now_iso, safe_json + class BFCLExample(dspy.Example): @@ -77,6 +78,22 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: """ Run a single BFCL test case using the current instruction prompt """ + phase = "unknown" + if RUN_CTX is not None: + if test_id in RUN_CTX.train_ids: + phase = "gepa_train" + elif test_id in RUN_CTX.dev_ids: + phase = "gepa_dev" + else: + phase = "baseline" + + test_number = None + try: + test_number = int(test_id.rsplit("_", 1)[-1]) + except Exception: + pass + + # Initialize timing t0 = time.perf_counter() timing: dict[str, float] = {} @@ -166,6 +183,35 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) timing["total_forward_s"] = time.perf_counter() - t0 + + if RUN_CTX is not None: + record = { + "ts": utc_now_iso(), + "run_id": RUN_CTX.run_id, + + "phase": phase, + "test_id": test_id, + "test_number": test_number, + + "instruction": { + "hash": instruction_hash, + }, + + "evaluation": { + "valid": bool( + evaluation.get("validation", {}).get("valid", False) + ) if evaluation else False, + "eval_error": eval_error, + }, + + "run_dir": str(run_dir), + } + + append_jsonl( + RUN_CTX.run_index_path, + safe_json(record) + ) + # Final prediction for the current case return dspy.Prediction( diff --git a/experiments/gepa_bfcl/logging_utils.py b/experiments/gepa_bfcl/logging_utils.py index ff785ac..8b4fe32 100644 --- a/experiments/gepa_bfcl/logging_utils.py +++ b/experiments/gepa_bfcl/logging_utils.py @@ -98,6 +98,7 @@ class RunContext: output_dir: Path metric_calls_path: Path candidate_snapshots_path: Path + run_index_path: Path train_ids: set[str] dev_ids: set[str] score_definition: dict[str, Any] @@ -131,4 +132,14 @@ def try_git_info() -> dict[str, Any]: info["git_commit"] = None info["git_dirty"] = None - return info \ No newline at end of file + return info + +def log_run_index(record: dict[str, Any]) -> None: + """ + Append a single BFCL execution record to run_index.jsonl + """ + global RUN_CTX + if RUN_CTX is None: + return + + append_jsonl(RUN_CTX.run_index_path, safe_json(record)) diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index 5f3d7ac..2ce1fc2 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -114,6 +114,8 @@ def main() -> None: metric_calls_path = args.output_dir / "metric_calls.jsonl" candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" reflection_calls_path = args.output_dir / "reflection_calls.jsonl" + run_index_path = args.output_dir / "run_index.jsonl" + score_definition = build_score_definition() try: @@ -151,7 +153,7 @@ def main() -> None: f"{sorted(matched_numbers)} 
({after}/{len(selected_test_numbers)} found" ) - # ---- Shuffle & slice ---- + # Shuffle & slice rng = random.Random(args.seed) if args.shuffle: @@ -200,7 +202,7 @@ def main() -> None: ), encoding="utf-8", ) - + # Initialize global run context global RUN_CTX RUN_CTX = RunContext( @@ -208,6 +210,7 @@ def main() -> None: output_dir=args.output_dir, metric_calls_path=metric_calls_path, candidate_snapshots_path=candidate_snapshots_path, + run_index_path=run_index_path, train_ids=train_ids, dev_ids=dev_ids, score_definition=score_definition From 7b8ad05e6d69a08e6c3de69a62aea27631d68b14 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Tue, 13 Jan 2026 15:16:46 -0800 Subject: [PATCH 20/23] RunContext wasn't being preserved across files --- experiments/gepa_bfcl/metrics.py | 27 ++++++++++++++++++--------- experiments/gepa_bfcl/run.py | 9 +++------ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/experiments/gepa_bfcl/metrics.py b/experiments/gepa_bfcl/metrics.py index 4271a41..caca0f3 100644 --- a/experiments/gepa_bfcl/metrics.py +++ b/experiments/gepa_bfcl/metrics.py @@ -8,7 +8,8 @@ from typing import Any, Optional, List import dspy from tests.benchmarks.bfcl import loader as bfcl_loader -from .logging_utils import RUN_CTX, append_jsonl, safe_json, utc_now_iso +from . import logging_utils +from .logging_utils import append_jsonl, safe_json, utc_now_iso from .scoring_utils import fn_name, soft_sequence_score, diff_summary @@ -47,6 +48,14 @@ def bfcl_metric_with_feedback( # Extract test id and initialize feedback test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None) feedback_parts: List[str] = [] + ctx = logging_utils.RUN_CTX + + if ctx is None: + raise RuntimeError( + "RUN_CTX is None inside bfcl_metric_with_feedback. " + "This means run.py did not initialize logging_utils.RUN_CTX correctly." 
+ ) + # Load BFCL truth + constraints for feedback gt: list[list[str]] = [] @@ -79,10 +88,10 @@ def bfcl_metric_with_feedback( # Train/dev split split = None - if RUN_CTX and test_id: - if test_id in RUN_CTX.train_ids: + if ctx and test_id: + if ctx.train_ids and test_id in ctx.train_ids: split = "train" - elif test_id in RUN_CTX.dev_ids: + elif ctx.dev_ids and test_id in ctx.dev_ids: split = "dev" else: split = "unknown" @@ -138,10 +147,10 @@ def bfcl_metric_with_feedback( feedback_parts.append(f"RUN_DIR: {run_dir}") # Log the record - if RUN_CTX and test_id: + if ctx and test_id: record = { "ts": utc_now_iso(), - "run_id": RUN_CTX.run_id, + "run_id": ctx.run_id, "test_id": test_id, "split": split, "instruction_hash": getattr(pred, "instruction_hash", None), @@ -161,12 +170,12 @@ def bfcl_metric_with_feedback( else None ), } - append_jsonl(RUN_CTX.metric_calls_path, record) + append_jsonl(ctx.metric_calls_path, record) # Candidate snapshot snap = { "ts": utc_now_iso(), - "run_id": RUN_CTX.run_id, + "run_id": ctx.run_id, "instruction_hash": getattr(pred, "instruction_hash", None), "instruction_text": getattr(pred, "instruction_text", None), "latest_eval": { @@ -176,6 +185,6 @@ def bfcl_metric_with_feedback( "final": final_score, }, } - append_jsonl(RUN_CTX.candidate_snapshots_path, snap) + append_jsonl(ctx.candidate_snapshots_path, snap) return MetricFeedback(score=final_score, feedback="\n".join(feedback_parts)) \ No newline at end of file diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index 2ce1fc2..b060286 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -29,8 +29,6 @@ from .metrics import bfcl_metric_with_feedback, build_score_definition from .env_utils import validate_model_environment from .logging_utils import ( - RUN_CTX, - RunContext, TeeIO, append_jsonl, safe_json, @@ -38,6 +36,7 @@ try_git_info, utc_now_iso, ) +from . import logging_utils def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( @@ -203,9 +202,7 @@ def main() -> None: encoding="utf-8", ) - # Initialize global run context - global RUN_CTX - RUN_CTX = RunContext( + logging_utils.RUN_CTX = logging_utils.RunContext( run_id=run_id, output_dir=args.output_dir, metric_calls_path=metric_calls_path, @@ -213,7 +210,7 @@ def main() -> None: run_index_path=run_index_path, train_ids=train_ids, dev_ids=dev_ids, - score_definition=score_definition + score_definition=score_definition, ) # Load initial instructions From c2b8201f92fb11cadc495affee969da36d4139a9 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Wed, 14 Jan 2026 16:17:17 -0800 Subject: [PATCH 21/23] Evaluation output is sent to the model and logged. 
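
# Editor's sketch (not part of the commit): why PATCH 20 above switches from
# `from .logging_utils import RUN_CTX` to reading logging_utils.RUN_CTX through
# the module object. A from-import binds the value once at import time, so a
# later reassignment in run.py is invisible to the importer; attribute access
# picks it up. The module below is a stand-in used only for illustration.
import types

logging_utils = types.ModuleType("logging_utils_stub")
logging_utils.RUN_CTX = None

# Old style: captures the current value (None) and never sees the update.
RUN_CTX = logging_utils.RUN_CTX

# New style: re-reads the attribute on every call.
def current_ctx():
    return logging_utils.RUN_CTX

logging_utils.RUN_CTX = "RunContext set by run.py"
print(RUN_CTX)        # None
print(current_ctx())  # RunContext set by run.py
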
Ready for final run --- .gitignore | 2 +- experiments/gepa_bfcl/agent.py | 57 ++++++++++++++++++++++++++++---- experiments/gepa_bfcl/run.py | 16 ++------- tests/utils/fastagent_helpers.py | 6 ++-- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 8241a94..5244106 100644 --- a/.gitignore +++ b/.gitignore @@ -64,4 +64,4 @@ site/ # Appworld data data/ -utils/ \ No newline at end of file +/utils/ \ No newline at end of file diff --git a/experiments/gepa_bfcl/agent.py b/experiments/gepa_bfcl/agent.py index f234c02..0a1a5f0 100644 --- a/experiments/gepa_bfcl/agent.py +++ b/experiments/gepa_bfcl/agent.py @@ -106,7 +106,7 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: timing["dspy_trace_anchor_s"] = time.perf_counter() - t_trace except Exception as e: timing["dspy_trace_anchor_s"] = 0.0 - print(f"[TRACE_ANCHOR_ERROR] {type(e).__name__}: {e}") + # print(f"[TRACE_ANCHOR_ERROR] {type(e).__name__}: {e}") # Write current instruction instruction_text = self.get_instruction_text() @@ -153,6 +153,7 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: executable_responses: List[List[str]] = [] evaluation: dict[str, Any] | None = None eval_error: str | None = None + failure_summary: str | None = None t_eval = time.perf_counter() if complete_path.exists(): @@ -160,8 +161,17 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: complete_data = json.loads(complete_path.read_text()) tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) + for turn in tool_calls_by_turn: + for call in turn: + if "function" in call and call["function"]: + call["function"] = self.strip_tool_prefix(call["function"]) + t_fmt = time.perf_counter() executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) + executable_responses = [ + [self.strip_tool_prefix(call) for call in turn] + for turn in executable_responses + ] timing["format_to_executable_s"] = time.perf_counter() - t_fmt t_chk = time.perf_counter() @@ -170,6 +180,17 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: tool_calls_by_turn, executable_responses, ) + + if evaluation is not None: + eval_path = run_dir / "evaluation.json" + eval_path.write_text( + json.dumps(safe_json(evaluation), indent=2), + encoding="utf-8", + ) + + if "validation" in evaluation: + failure_summary = self.summarize_validation_failure(evaluation["validation"]) + timing["bfcl_checker_s"] = time.perf_counter() - t_chk except Exception as e: eval_error = f"{type(e).__name__}: {e}" @@ -180,7 +201,7 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: timing["parse_and_eval_s"] = time.perf_counter() - t_eval tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] - behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) + behavior_summary = self.summarize_behavior_from_calls(tool_calls_by_turn) timing["total_forward_s"] = time.perf_counter() - t0 @@ -202,9 +223,15 @@ def forward(self, test_id: str, question: str) -> dspy.Prediction: evaluation.get("validation", {}).get("valid", False) ) if evaluation else False, "eval_error": eval_error, + "path": str(run_dir / "evaluation.json") if evaluation else None, }, - - "run_dir": str(run_dir), + + "failure_summary": failure_summary, + "irrelevant": bool( + evaluation.get("irrelevance_check", {}).get("irrelevant", False) + ) if evaluation else False, + + "run_dir": str(run_dir) } append_jsonl( @@ -238,8 +265,8 
@@ def get_instruction_text(self) -> str: return "\n".join(str(p) for p in instructions if p) return str(instructions or "") - @staticmethod - def _summarize_behavior_from_calls(tool_calls: List[List[dict[str, Any]]]) -> str: + + def summarize_behavior_from_calls(self, tool_calls: List[List[dict[str, Any]]]) -> str: """ Summarize tool-use behavior for logging and feedback """ @@ -254,4 +281,22 @@ def _summarize_behavior_from_calls(tool_calls: List[List[dict[str, Any]]]) -> st f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\n" f"NUM_TOOLS: {len(tool_seq)}" ) + + def strip_tool_prefix(self, fn: str) -> str: + # vehiclecontrolapi__startEngine -> startEngine + return fn.split("__", 1)[-1] + + + def summarize_validation_failure(self, validation: dict[str, Any]) -> str | None: + if not validation or validation.get("valid", True): + return None + + reasons = [] + + for key in ["missing_calls", "extra_calls", "wrong_order", "argument_mismatches"]: + if key in validation and validation[key]: + reasons.append(f"{key}: {validation[key]}") + + return "; ".join(reasons) if reasons else "validation_failed" + \ No newline at end of file diff --git a/experiments/gepa_bfcl/run.py b/experiments/gepa_bfcl/run.py index b060286..0d425fc 100644 --- a/experiments/gepa_bfcl/run.py +++ b/experiments/gepa_bfcl/run.py @@ -55,18 +55,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--max-evaluations", type=int, default=20) parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) - parser.add_argument( - "--instruction-file", - type=Path, - required=True, - help="Path to initial instruction prompt.", - ) - - parser.add_argument( - "--output-dir", - type=Path, - default=Path("outputs/gepa_on_bfcl"), - ) + parser.add_argument("--instruction-file", type=Path, required=True) + parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa_on_bfcl")) parser.add_argument("--pytest-binary", default="pytest") parser.add_argument("--gepa-scoring-mode", action="store_true") @@ -149,7 +139,7 @@ def main() -> None: print( f"[{utc_now_iso()}] Selected tests by number: " - f"{sorted(matched_numbers)} ({after}/{len(selected_test_numbers)} found" + f"{sorted(matched_numbers)} ({after}/{len(selected_test_numbers)} found)" ) # Shuffle & slice diff --git a/tests/utils/fastagent_helpers.py b/tests/utils/fastagent_helpers.py index 3785ac3..afca2d2 100644 --- a/tests/utils/fastagent_helpers.py +++ b/tests/utils/fastagent_helpers.py @@ -112,9 +112,11 @@ def strip_server_prefix(tool_name: str) -> str: tool_name: Tool name potentially with server prefix Returns: - Tool name without prefix (e.g., 'github-list_issues' -> 'list_issues') + Tool name without prefix (e.g., 'vehiclecontrolapi__list_issues' -> 'list_issues') """ - if "-" in tool_name: + if "__" in tool_name: + return tool_name.split("__", 1)[1] + elif "-" in tool_name: return tool_name.split("-", 1)[1] return tool_name From 60a145098db4adb15af5aa63a365d8f1b891c3d8 Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Sat, 17 Jan 2026 14:16:50 -0800 Subject: [PATCH 22/23] Initial scripts for candidate analysis --- .../gepa_analysis/candidate_snapshots.py | 90 ++ experiments/gepa_analysis/prompt_diff.py | 81 ++ experiments/gepa_analysis/prompt_timeline.py | 94 +++ experiments/gepa_bfcl.py | 789 ------------------ 4 files changed, 265 insertions(+), 789 deletions(-) create mode 100644 experiments/gepa_analysis/candidate_snapshots.py create mode 100644 experiments/gepa_analysis/prompt_diff.py create mode 100644 
experiments/gepa_analysis/prompt_timeline.py delete mode 100644 experiments/gepa_bfcl.py diff --git a/experiments/gepa_analysis/candidate_snapshots.py b/experiments/gepa_analysis/candidate_snapshots.py new file mode 100644 index 0000000..ad9ae98 --- /dev/null +++ b/experiments/gepa_analysis/candidate_snapshots.py @@ -0,0 +1,90 @@ +import json +from pathlib import Path +from datetime import datetime +import pandas as pd + + +def load_candidate_snapshots(path: Path) -> pd.DataFrame: + rows = [] + + with path.open() as f: + for line in f: + record = json.loads(line) + + eval_info = record.get("latest_eval", {}) + + rows.append({ + "ts": pd.to_datetime(record["ts"], utc=True), + "instruction_hash": record["instruction_hash"], + "instruction_text": record["instruction_text"], + "test_id": eval_info.get("test_id"), + "split": eval_info.get("split"), + "hard_valid": eval_info.get("hard_valid"), + "score": eval_info.get("final"), + }) + + return pd.DataFrame(rows) + + +def build_candidate_prompt_table(df: pd.DataFrame) -> pd.DataFrame: + grouped = df.groupby("instruction_hash") + + rows = [] + + for instruction_hash, g in grouped: + instruction_text = g["instruction_text"].iloc[0] + + train_scores = g[g["split"] == "train"]["score"] + dev_scores = g[g["split"] == "dev"]["score"] + + rows.append({ + "instruction_hash": instruction_hash, + "instruction_text": instruction_text, + "first_seen_ts": g["ts"].min(), + "last_seen_ts": g["ts"].max(), + "n_evals": len(g), + "train_pass_rate": train_scores.mean() if not train_scores.empty else None, + "dev_pass_rate": dev_scores.mean() if not dev_scores.empty else None, + "overall_pass_rate": g["score"].mean(), + "instruction_length_chars": len(instruction_text), + "instruction_length_lines": instruction_text.count("\n") + 1, + }) + + candidate_df = pd.DataFrame(rows) + + return candidate_df.sort_values("first_seen_ts").reset_index(drop=True) + + +def main(): + run_dir = Path("./outputs/gepa_on_bfcl/1-14-prefinal") + output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") + output_dir.mkdir(parents=True, exist_ok=True) + + snapshots_path = Path(run_dir / "candidate_snapshots.jsonl") + + df_raw = load_candidate_snapshots(snapshots_path) + candidate_df = build_candidate_prompt_table(df_raw) + + candidate_df.to_csv(output_dir / "candidate_snaps.csv", index=False) + + print("\n=== Candidate Prompt Summary ===") + print(f"Total snapshot rows: {len(df_raw)}") + print(f"Unique prompts: {len(candidate_df)}") + + print("\nTop prompts by dev pass rate:") + print( + candidate_df + .sort_values("dev_pass_rate", ascending=False) + .head(5)[ + [ + "instruction_hash", + "n_evals", + "dev_pass_rate", + "instruction_length_lines", + ] + ] + ) + + +if __name__ == "__main__": + main() diff --git a/experiments/gepa_analysis/prompt_diff.py b/experiments/gepa_analysis/prompt_diff.py new file mode 100644 index 0000000..9fe6f41 --- /dev/null +++ b/experiments/gepa_analysis/prompt_diff.py @@ -0,0 +1,81 @@ +import pandas as pd +from pathlib import Path +import difflib + + +def unified_prompt_diff(base_text: str, new_text: str) -> str: + base_lines = base_text.splitlines() + new_lines = new_text.splitlines() + + diff = difflib.unified_diff( + base_lines, + new_lines, + fromfile="baseline", + tofile="candidate", + lineterm="" + ) + + return "\n".join(diff) + +def main(): + output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") + df = pd.read_csv(output_dir / "candidate_snaps.csv") + + output_md = Path(output_dir / "prompt_diffs.md") + + # Baseline = most evaluated prompt + 
baseline = df.loc[df["n_evals"].idxmax()] + + # Best non-baseline by overall pass rate + best_non_baseline = ( + df.drop(index=baseline.name) + .sort_values("overall_pass_rate", ascending=False) + .iloc[0] + ) + + # Longest prompt (verbosity exploration) + longest_prompt = ( + df.drop(index=baseline.name) + .sort_values("instruction_length_lines", ascending=False) + .iloc[0] + ) + + print("Baseline hash:", baseline["instruction_hash"]) + print("Best non-baseline hash:", best_non_baseline["instruction_hash"]) + print("Longest prompt hash:", longest_prompt["instruction_hash"]) + + with output_md.open("w") as f: + f.write("# Prompt Difference Analysis\n\n") + + def write_section(title, base, other): + f.write(f"## {title}\n\n") + f.write(f"**Baseline hash:** `{base['instruction_hash']}`\n\n") + f.write(f"**Candidate hash:** `{other['instruction_hash']}`\n\n") + f.write( + f"- Overall pass rate: {other['overall_pass_rate']:.3f}\n" + f"- Instruction length (lines): {other['instruction_length_lines']}\n\n" + ) + + diff_text = unified_prompt_diff( + base["instruction_text"], + other["instruction_text"], + ) + + f.write("```diff\n") + f.write(diff_text if diff_text else "(No textual differences)\n") + f.write("\n```\n\n") + + write_section( + "Baseline vs Best Non-Baseline Prompt", + baseline, + best_non_baseline, + ) + + write_section( + "Baseline vs Longest Prompt", + baseline, + longest_prompt, + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/experiments/gepa_analysis/prompt_timeline.py b/experiments/gepa_analysis/prompt_timeline.py new file mode 100644 index 0000000..90777ac --- /dev/null +++ b/experiments/gepa_analysis/prompt_timeline.py @@ -0,0 +1,94 @@ +import matplotlib.pyplot as plt +from pathlib import Path +import pandas as pd +import numpy as np + +def plot_prompt_search_timeline(candidate_df: pd.DataFrame, output_dir: Path): + # Add discovery order + df = candidate_df.copy() + df["discovery_index"] = range(len(df)) + + # Baseline = most evaluated prompt (more robust than "first seen") + baseline_idx = df["n_evals"].idxmax() + baseline = df.loc[baseline_idx] + others = df.drop(index=baseline_idx) + + # Y values: dev pass rate; if NaN, place slightly below 0 to show "no dev eval" + y = df["dev_pass_rate"].copy() + no_dev_mask = y.isna() + y_plot = y.copy() + y_plot[no_dev_mask] = -0.05 # sentinel row for "no dev eval" + + fig, ax = plt.subplots(figsize=(10, 6)) + + # Get colormap normalization based on all instruction lengths + norm = plt.Normalize( + vmin=df["instruction_length_lines"].min(), + vmax=df["instruction_length_lines"].max() + ) + cmap = plt.cm.viridis + + # Plot all non-baseline prompts + scatter = ax.scatter( + df.loc[df.index != baseline_idx, "discovery_index"], + y_plot.loc[df.index != baseline_idx], + c=df.loc[df.index != baseline_idx, "instruction_length_lines"], + cmap="viridis", + norm=norm, + s=80, + alpha=0.9, + ) + + # Plot baseline prompt with viridis color + ax.scatter( + baseline["discovery_index"], + (-0.05 if pd.isna(baseline["dev_pass_rate"]) else baseline["dev_pass_rate"]), + marker="*", + s=250, + c=[baseline["instruction_length_lines"]], + cmap="viridis", + norm=norm, + # edgecolor="black", + linewidth=2, + label=f"Baseline (n={int(baseline['n_evals'])})", + zorder=5, # Ensure it's on top + ) + + # Add trend line for prompts with dev evals + valid_mask = ~no_dev_mask + if valid_mask.sum() > 1: + z = np.polyfit(df.loc[valid_mask, "discovery_index"], + df.loc[valid_mask, "dev_pass_rate"], 1) + p = np.poly1d(z) + 
ax.plot(df.loc[valid_mask, "discovery_index"], + p(df.loc[valid_mask, "discovery_index"]), + "r--", alpha=0.3, linewidth=1.5, label="Trend") + + ax.set_title("GEPA Prompt Exploration (Dev Pass Rate)", + fontsize=13, fontweight='bold') + ax.set_xlabel("Prompt Discovery Order", fontsize=11) + ax.set_ylabel("Dev Pass Rate", fontsize=11) + + # Make the "no dev eval" row interpretable + ax.set_ylim(-0.08, 1.05) + ax.axhline(-0.05, linestyle="--", linewidth=1, color='gray', alpha=0.5) + ax.text( + 0, -0.048, "no dev eval", + fontsize=9, va="bottom", style='italic', color='gray' + ) + + # Add grid for easier reading + ax.grid(True, alpha=0.2, linestyle=':') + + cbar = plt.colorbar(scatter, ax=ax) + cbar.set_label("Instruction Length (lines)", fontsize=10) + + ax.legend(loc="upper right", framealpha=0.9) + + plt.tight_layout() + plt.savefig(output_dir / "prompt_search_timeline.png", dpi=150) + plt.close() + +output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") +candidate_df = pd.read_csv(output_dir / "candidate_snaps.csv") +plot_prompt_search_timeline(candidate_df, output_dir) \ No newline at end of file diff --git a/experiments/gepa_bfcl.py b/experiments/gepa_bfcl.py deleted file mode 100644 index 30b9eb1..0000000 --- a/experiments/gepa_bfcl.py +++ /dev/null @@ -1,789 +0,0 @@ -# NOTE: -# This script performs instruction-only optimization using GEPA over BFCL tests. -# The BFCL agent is invoked via pytest. - -""" -GEPA-based instruction optimization for BFCL tests with first-class logging/artifacts. -Run via: `python experiments/gepa_bfcl.py --instruction-file path/to/instruction.txt [other options]` -""" - -import argparse -import json -import subprocess -import hashlib -import uuid -import os -import platform -import sys -import time -from dataclasses import dataclass -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, Optional - -import dspy -from dspy.teleprompt import GEPA - -# Ensure repo root importable -sys.path.insert(0, str(Path(__file__).parent.parent)) - -from tests.benchmarks.bfcl import loader as bfcl_loader -from tests.benchmarks.bfcl import evaluator as bfcl_evaluator -from tests.utils.fastagent_helpers import MessageSerializer - - -# ------------------------- -# JSON / logging utilities -# ------------------------- - -def utc_now_iso() -> str: - return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace('+00:00', 'Z') - - -def sha256_text(text: str) -> str: - return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest() - - -def safe_json(obj: Any) -> Any: - """Best-effort JSON-serializable conversion.""" - try: - json.dumps(obj) - return obj - except Exception: - if isinstance(obj, dict): - return {str(k): safe_json(v) for k, v in obj.items()} - if isinstance(obj, (list, tuple)): - return [safe_json(x) for x in obj] - if hasattr(obj, "__dict__"): - return safe_json(obj.__dict__) - return repr(obj) - - -def append_jsonl(path: Path, record: dict[str, Any]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("a", encoding="utf-8") as f: - f.write(json.dumps(record, ensure_ascii=False) + "\n") - - -class TeeIO: - """Mirror writes to both the real stream and a file.""" - def __init__(self, real_stream, log_file): - self.real_stream = real_stream - self.log_file = log_file - - def write(self, s): - self.real_stream.write(s) - self.log_file.write(s) - - def flush(self): - self.real_stream.flush() - self.log_file.flush() - - def isatty(self): - return False - - -@dataclass -class RunContext: - 
run_id: str - output_dir: Path - metric_calls_path: Path - candidate_snapshots_path: Path - train_ids: set[str] - dev_ids: set[str] - score_definition: dict[str, Any] - - -RUN_CTX: RunContext | None = None - - -# ------------------------- -# BFCL formatting helpers -# ------------------------- - -def _stringify_question(question: Any) -> str: - """Best-effort stringify for trace anchoring. BFCL is multi-turn; this picks the first user content.""" - if isinstance(question, list) and question: - first = question[0] - if isinstance(first, str): - return first - if isinstance(first, dict): - return str(first.get("content", "")) - if isinstance(first, list) and first: - # BFCL questions often look like [[{role, content}], [{...}], ...] - msg0 = first[0] - if isinstance(msg0, dict): - return str(msg0.get("content", "")) - if isinstance(question, dict): - return str(question.get("content", "")) - if isinstance(question, str): - return question - return "" - - -def _fn_name(executable_call: str) -> str: - if not executable_call: - return "" - idx = executable_call.find("(") - return executable_call[:idx] if idx != -1 else executable_call - - -def _soft_turn_score(gt_turn: list[str], pred_turn: list[str]) -> float: - if gt_turn == pred_turn: - return 1.0 - gt_fns = [_fn_name(x) for x in gt_turn] - pr_fns = [_fn_name(x) for x in pred_turn] - if not gt_fns and not pr_fns: - return 1.0 - if not gt_fns or not pr_fns: - return 0.0 - - gt_set = set(gt_fns) - pr_set = set(pr_fns) - inter = len(gt_set & pr_set) - prec = inter / max(len(pr_set), 1) - rec = inter / max(len(gt_set), 1) - if prec + rec == 0: - return 0.0 - return (2 * prec * rec) / (prec + rec) - - -def _soft_sequence_score(gt: list[list[str]], pred: list[list[str]]) -> float: - if not gt and not pred: - return 1.0 - n = max(len(gt), len(pred), 1) - total = 0.0 - for i in range(n): - gt_turn = gt[i] if i < len(gt) else [] - pr_turn = pred[i] if i < len(pred) else [] - total += _soft_turn_score(gt_turn, pr_turn) - return total / n - - -def _diff_summary(gt: list[list[str]], pred: list[list[str]], max_turns: int = 8, max_calls_per_turn: int = 8) -> str: - lines: list[str] = [] - n = min(max(len(gt), len(pred)), max_turns) - for i in range(n): - gt_turn = gt[i] if i < len(gt) else [] - pr_turn = pred[i] if i < len(pred) else [] - if gt_turn == pr_turn: - lines.append(f"TURN {i+1}: OK (exact match)") - continue - - lines.append(f"TURN {i+1}: MISMATCH") - lines.append(" EXPECTED:") - if gt_turn: - for s in gt_turn[:max_calls_per_turn]: - lines.append(f" - {s}") - if len(gt_turn) > max_calls_per_turn: - lines.append(f" ... (+{len(gt_turn) - max_calls_per_turn} more)") - else: - lines.append(" - (no calls expected)") - - lines.append(" GOT:") - if pr_turn: - for s in pr_turn[:max_calls_per_turn]: - lines.append(f" - {s}") - if len(pr_turn) > max_calls_per_turn: - lines.append(f" ... 
(+{len(pr_turn) - max_calls_per_turn} more)") - else: - lines.append(" - (no calls produced)") - if len(gt) != len(pred): - lines.append(f"TURN COUNT: expected {len(gt)} turns, got {len(pred)} turns") - return "\n".join(lines) - - -# ------------------------- -# DSPy wrappers -# ------------------------- - -class BFCLExample(dspy.Example): - def __init__(self, test_id: str | None = None, question: str | None = None, *, base: dspy.Example | None = None, **kwargs: Any): - if base is not None: - super().__init__(base=base, **kwargs) - else: - super().__init__(test_id=test_id, question=question, **kwargs) - - -class MetricFeedback(dspy.Prediction): - def __init__(self, score: float, feedback: str) -> None: - super().__init__(score=score, feedback=feedback) - - -class BFCLAgent(dspy.Module): - """DSPy module wrapper around pytest-driven BFCL evaluation.""" - - def __init__( - self, - instruction_text: str, - model: str, - base_dir: Path, - pytest_binary: str, - enable_scoring_mode: bool, - ) -> None: - super().__init__() - self.model = model - self.base_dir = base_dir - self.base_dir.mkdir(parents=True, exist_ok=True) - self.pytest_binary = pytest_binary - self.enable_scoring_mode = enable_scoring_mode - self._instruction_path = self.base_dir / "current_instruction.txt" - - instruction_signature = dspy.Signature("prompt_input -> prompt_output", instructions=instruction_text) - self.prompt_predictor = dspy.Predict(instruction_signature) - - def get_instruction_text(self) -> str: - instructions = getattr(self.prompt_predictor.signature, "instructions", "") - if isinstance(instructions, (list, tuple)): - return "\n".join(str(p) for p in instructions if p) - return str(instructions or "") - - @staticmethod - def _summarize_behavior_from_calls(tool_calls_by_turn: list[list[dict[str, Any]]]) -> str: - tool_seq: list[str] = [] - for turn in tool_calls_by_turn: - for call in turn: - fn = call.get("function") - if fn: - tool_seq.append(fn) - return f"TOOLS: {' -> '.join(tool_seq) or 'NONE'}\nNUM_TOOLS: {len(tool_seq)}" - - def forward(self, test_id: str, question: str) -> dspy.Prediction: - # ----- timing breakdown ----- - t0 = time.perf_counter() - timing: dict[str, float] = {} - - # ---- Trace anchor: invoke predictor so GEPA has a component trace ---- - try: - t_a = time.perf_counter() - _ = self.prompt_predictor(prompt_input=question) - timing["dspy_trace_anchor_s"] = time.perf_counter() - t_a - except Exception: - timing["dspy_trace_anchor_s"] = 0.0 - - # Write current instruction - t_w = time.perf_counter() - instruction_text = self.get_instruction_text() - instruction_hash = sha256_text(instruction_text) - self._instruction_path.write_text(instruction_text, encoding="utf-8") - timing["write_instruction_s"] = time.perf_counter() - t_w - - # Unique run dir prevents stale artifacts reuse - run_id = uuid.uuid4().hex[:12] - output_dir = self.base_dir / "runs" / f"{test_id}__{run_id}" - output_dir.mkdir(parents=True, exist_ok=True) - - cmd = [ - self.pytest_binary, - f"tests/benchmarks/bfcl/test_bfcl.py::test_bfcl[{test_id}]", - "--model", - self.model, - "--instruction-file", - str(self._instruction_path), - "--output-dir", - str(output_dir), - "-q", - "-x", - ] - if self.enable_scoring_mode: - cmd.append("--gepa-scoring-mode") - - # Run pytest - t_p = time.perf_counter() - result = subprocess.run(cmd, capture_output=True, text=True) - timing["pytest_run_s"] = time.perf_counter() - t_p - - complete_path = output_dir / "raw" / f"{test_id}_complete.json" - - tool_calls_by_turn: list[list[dict[str, 
Any]]] = [] - executable_responses: list[list[str]] = [] - evaluation: dict[str, Any] | None = None - eval_error: str | None = None - - # Parse + evaluate - t_e = time.perf_counter() - if complete_path.exists(): - try: - complete_data = json.loads(complete_path.read_text()) - tool_calls_by_turn = MessageSerializer.extract_tool_calls_by_turn(complete_data) - - t_fmt = time.perf_counter() - executable_responses = MessageSerializer.format_to_executable(tool_calls_by_turn) - timing["format_to_executable_s"] = time.perf_counter() - t_fmt - - t_chk = time.perf_counter() - evaluation = bfcl_evaluator._run_evaluation(test_id, tool_calls_by_turn, executable_responses) - timing["bfcl_checker_s"] = time.perf_counter() - t_chk - except Exception as e: - eval_error = f"{type(e).__name__}: {e}" - else: - eval_error = "Complete JSON not found (agent may have crashed before serialization)." - timing["parse_and_eval_s"] = time.perf_counter() - t_e - - tools_used = [call.get("function") for turn in tool_calls_by_turn for call in turn if call.get("function")] - behavior_summary = self._summarize_behavior_from_calls(tool_calls_by_turn) - - timing["total_forward_s"] = time.perf_counter() - t0 - - return dspy.Prediction( - test_id=test_id, - instruction_hash=instruction_hash, - instruction_text=instruction_text, - tools_used=tools_used, - behavior=behavior_summary, - executable_responses=executable_responses, - evaluation=evaluation, - eval_error=eval_error, - pytest_stdout=result.stdout, - pytest_stderr=result.stderr, - run_dir=str(output_dir), - timing=timing, - ) - - -# ------------------------- -# Metric (logs every call incrementally) -# ------------------------- - -def bfcl_metric_with_feedback( - gold: dspy.Example, - pred: dspy.Prediction, - trace: Optional[Any] = None, - pred_name: Optional[str] = None, - pred_trace: Optional[Any] = None, -) -> MetricFeedback: - """ - Score definition (explicitly persisted in run_manifest.json): - hard_valid ∈ {0,1} = BFCL checker validation.valid - soft ∈ [0,1] = turn-wise overlap score based on function-name overlap (F1-like) - final = 0.9*hard_valid + 0.1*soft - """ - test_id = getattr(pred, "test_id", None) or getattr(gold, "test_id", None) - feedback_parts: list[str] = [] - - # Load BFCL truth + constraints for feedback - gt: list[list[str]] = [] - excluded: list[str] = [] - involved_classes: list[str] = [] - try: - if test_id: - gt = bfcl_loader.load_ground_truth(test_id) - entry = bfcl_loader.load_test_entry(test_id) - excluded = entry.get("excluded_function", []) or [] - involved_classes = entry.get("involved_classes", []) or [] - except Exception as e: - feedback_parts.append(f"WARNING: could not load BFCL ground truth/entry: {type(e).__name__}: {e}") - - pred_exec: list[list[str]] = getattr(pred, "executable_responses", []) or [] - evaluation: dict[str, Any] | None = getattr(pred, "evaluation", None) - eval_error: str | None = getattr(pred, "eval_error", None) - - hard_valid = False - if evaluation and isinstance(evaluation, dict): - hard_valid = bool(evaluation.get("validation", {}).get("valid", False)) - - soft = _soft_sequence_score(gt, pred_exec) if gt else (1.0 if hard_valid else 0.0) - final_score = (1.0 if hard_valid else 0.0) * 0.9 + soft * 0.1 - - split = None - if RUN_CTX and test_id: - if test_id in RUN_CTX.train_ids: - split = "train" - elif test_id in RUN_CTX.dev_ids: - split = "dev" - else: - split = "unknown" - - feedback_parts.append(f"RESULT: {'PASS' if hard_valid else 'FAIL'}") - feedback_parts.append(f"SCORE_BREAKDOWN: hard={'1.0' if 
hard_valid else '0.0'} soft={soft:.3f} final={final_score:.3f}") - if split: - feedback_parts.append(f"SPLIT: {split}") - - if involved_classes: - feedback_parts.append(f"INVOLVED_CLASSES (servers mounted): {', '.join(involved_classes)}") - if excluded: - feedback_parts.append(f"EXCLUDED_FUNCTIONS: {', '.join(excluded)}") - - if evaluation and isinstance(evaluation, dict): - validation = evaluation.get("validation", {}) - irrelevance = evaluation.get("irrelevance_check", {}) - feedback_parts.append("EVALUATOR_VALIDATION:") - if isinstance(validation, dict): - for k in ["valid", "reason", "error_type", "error_message"]: - if k in validation: - feedback_parts.append(f" {k}: {validation.get(k)}") - else: - feedback_parts.append(f" validation: {validation}") - - if isinstance(irrelevance, dict) and irrelevance: - feedback_parts.append("EVALUATOR_IRRELEVANCE_CHECK:") - for k in ["is_irrelevant", "reason"]: - if k in irrelevance: - feedback_parts.append(f" {k}: {irrelevance.get(k)}") - - if eval_error: - feedback_parts.append(f"EVAL_ERROR: {eval_error}") - - if gt: - feedback_parts.append("EXECUTABLE_DIFF:") - feedback_parts.append(_diff_summary(gt, pred_exec)) - - if excluded and pred_exec: - used_fns = {_fn_name(s) for turn in pred_exec for s in turn} - bad = sorted(set(excluded) & used_fns) - if bad: - feedback_parts.append(f"CONSTRAINT_VIOLATION: used excluded function(s): {', '.join(bad)}") - - if hasattr(pred, "behavior"): - feedback_parts.append("BEHAVIOR_SUMMARY:") - feedback_parts.append(str(pred.behavior)) - - run_dir = getattr(pred, "run_dir", None) - if run_dir: - feedback_parts.append(f"RUN_DIR: {run_dir}") - - # ---- First-class machine-readable metric call record ---- - if RUN_CTX and test_id: - record = { - "ts": utc_now_iso(), - "run_id": RUN_CTX.run_id, - "test_id": test_id, - "split": split, - "instruction_hash": getattr(pred, "instruction_hash", None), - "hard_valid": hard_valid, - "soft": soft, - "final": final_score, - "timing": getattr(pred, "timing", None), - "run_dir": run_dir, - "eval_error": eval_error, - "evaluator_validation": safe_json(evaluation.get("validation")) if isinstance(evaluation, dict) else None, - "evaluator_irrelevance": safe_json(evaluation.get("irrelevance_check")) if isinstance(evaluation, dict) else None, - } - append_jsonl(RUN_CTX.metric_calls_path, record) - - # Opportunistic candidate snapshot (what GEPA is “trying”) - snap = { - "ts": utc_now_iso(), - "run_id": RUN_CTX.run_id, - "instruction_hash": getattr(pred, "instruction_hash", None), - "instruction_text": getattr(pred, "instruction_text", None), - "latest_eval": { - "test_id": test_id, - "split": split, - "hard_valid": hard_valid, - "soft": soft, - "final": final_score, - }, - } - append_jsonl(RUN_CTX.candidate_snapshots_path, snap) - - return MetricFeedback(score=final_score, feedback="\n".join(feedback_parts)) - - -# ------------------------- -# Data loading -# ------------------------- - -def load_test_cases(subset: str, limit: int) -> list[BFCLExample]: - test_ids = bfcl_loader.find_tests_in_category(subset, limit=limit) - examples: list[BFCLExample] = [] - for test_id in test_ids[:limit]: - entry = bfcl_loader.load_test_entry(test_id) - question = _stringify_question(entry.get("question", "")) - ex = BFCLExample(test_id=test_id, question=question) - examples.append(ex.with_inputs("test_id", "question")) - return examples - - -# ------------------------- -# Run manifest + environment capture -# ------------------------- - -def try_git_info() -> dict[str, Any]: - info: dict[str, Any] = 
{} - try: - head = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, check=False) - info["git_commit"] = head.stdout.strip() if head.returncode == 0 else None - st = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True, check=False) - info["git_dirty"] = bool(st.stdout.strip()) - except Exception: - info["git_commit"] = None - info["git_dirty"] = None - return info - - -def build_score_definition() -> dict[str, Any]: - return { - "hard_valid": "BFCL evaluator validation.valid (boolean) from multi_turn_checker", - "soft": "turn-wise function-name overlap F1-like score (ignores args), averaged across turns", - "final": "0.9*hard_valid + 0.1*soft", - "note": "Optimization and candidate scores use `final`. Hard-valid-rate is also reported separately for clarity.", - } - - -# ------------------------- -# Main -# ------------------------- - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--test-subset", default="multi_turn_base") - parser.add_argument("--num-tests", type=int, default=10) - parser.add_argument("--model", default="gpt-5") - parser.add_argument("--reflection-model", default="gpt-5-mini") - parser.add_argument("--max-evaluations", type=int, default=20) - parser.add_argument("--output-dir", type=Path, default=Path("outputs/gepa_on_bfcl")) - parser.add_argument("--auto", choices=["light", "medium", "heavy"], default=None) - parser.add_argument("--instruction-file", type=Path, required=True) - parser.add_argument("--pytest-binary", default="pytest") - parser.add_argument("--gepa-scoring-mode", action="store_true") - args = parser.parse_args() - - args.output_dir.mkdir(parents=True, exist_ok=True) - - # ---- Mirror stdout/stderr to console.log automatically ---- - console_log_path = args.output_dir / "console.log" - console_log_f = console_log_path.open("w", encoding="utf-8") - real_out, real_err = sys.stdout, sys.stderr - sys.stdout = TeeIO(real_out, console_log_f) - sys.stderr = TeeIO(real_err, console_log_f) - - overall_t0 = time.perf_counter() - timings: dict[str, float] = {} - - run_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" - metric_calls_path = args.output_dir / "metric_calls.jsonl" - candidate_snapshots_path = args.output_dir / "candidate_snapshots.jsonl" - - score_def = build_score_definition() - - try: - print(f"[{utc_now_iso()}] RUN_ID={run_id}") - print(f"[{utc_now_iso()}] output_dir={args.output_dir}") - - # Load dataset and split - t_load = time.perf_counter() - examples = load_test_cases(args.test_subset, args.num_tests) - train_size = int(0.7 * len(examples)) - trainset, devset = examples[:train_size], examples[train_size:] - timings["load_dataset_s"] = time.perf_counter() - t_load - - train_ids = {e.test_id for e in trainset} - dev_ids = {e.test_id for e in devset} - - (args.output_dir / "dataset_split.json").write_text( - json.dumps( - { - "run_id": run_id, - "test_subset": args.test_subset, - "num_tests": args.num_tests, - "train_ids": sorted(train_ids), - "dev_ids": sorted(dev_ids), - "train_size": len(train_ids), - "dev_size": len(dev_ids), - }, - indent=2, - ), - encoding="utf-8", - ) - - # Initialize global run context for metric logging - global RUN_CTX - RUN_CTX = RunContext( - run_id=run_id, - output_dir=args.output_dir, - metric_calls_path=metric_calls_path, - candidate_snapshots_path=candidate_snapshots_path, - train_ids=train_ids, - dev_ids=dev_ids, - score_definition=score_def, - ) - - instruction_text = 
args.instruction_file.read_text(encoding="utf-8") - instruction_hash = sha256_text(instruction_text) - - # Manifest: config, hyperparams, environment, git, score definition, dataset split - manifest = { - "run_id": run_id, - "created_at": utc_now_iso(), - "argv": sys.argv, - "args": safe_json(vars(args)), - "instruction_file": str(args.instruction_file), - "instruction_hash": instruction_hash, - "score_definition": score_def, - "models": { - "agent_model": args.model, - "reflection_model": args.reflection_model, - }, - "dataset_split": { - "train_ids": sorted(train_ids), - "dev_ids": sorted(dev_ids), - }, - "environment": { - "python": sys.version, - "platform": platform.platform(), - "cwd": os.getcwd(), - }, - **try_git_info(), - } - (args.output_dir / "run_manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8") - - agent = BFCLAgent( - instruction_text=instruction_text, - model=args.model, - execution_lm=execution_lm, - base_dir=args.output_dir, - pytest_binary=args.pytest_binary, - enable_scoring_mode=args.gepa_scoring_mode, - ) - - # Baseline - t_base = time.perf_counter() - baseline_valid = 0 - baseline_total = len(examples) - baseline_details: list[dict[str, Any]] = [] - for e in examples: - pred = agent(test_id=e.test_id, question=e.question) - valid = False - if getattr(pred, "evaluation", None): - valid = bool(pred.evaluation.get("validation", {}).get("valid", False)) - baseline_valid += 1 if valid else 0 - baseline_details.append( - { - "test_id": e.test_id, - "valid": valid, - "instruction_hash": getattr(pred, "instruction_hash", None), - "run_dir": getattr(pred, "run_dir", None), - "timing": getattr(pred, "timing", None), - "eval_error": getattr(pred, "eval_error", None), - } - ) - timings["baseline_s"] = time.perf_counter() - t_base - - baseline_valid_rate = baseline_valid / max(baseline_total, 1) - (args.output_dir / "baseline.json").write_text( - json.dumps( - { - "run_id": run_id, - "instruction_hash": instruction_hash, - "bfcl_valid_rate": baseline_valid_rate, - "valid": baseline_valid, - "total": baseline_total, - "test_ids": [e.test_id for e in examples], - "model": args.model, - "score_definition": score_def, - "runs": baseline_details, - }, - indent=2, - ), - encoding="utf-8", - ) - print(f"[{utc_now_iso()}] Baseline BFCL valid rate: {baseline_valid_rate:.3f} ({baseline_valid}/{baseline_total})") - - # GEPA - t_gepa = time.perf_counter() - reflection_lm = dspy.LM(args.reflection_model) - execution_lm = dspy.LM(args.model) - - dspy.configure(lm=reflection_lm) - - gepa_kwargs: dict[str, Any] = dict( - metric=bfcl_metric_with_feedback, - reflection_lm=reflection_lm, - track_stats=True, - log_dir=str(args.output_dir / "gepa_logs"), - seed=42, - ) - if args.auto is not None: - gepa_kwargs["auto"] = args.auto - else: - gepa_kwargs["max_full_evals"] = args.max_evaluations - - gepa_kwargs["reflection_lm"] = args.reflection_model - - # Persist GEPA config/hparams exactly - (args.output_dir / "gepa_config.json").write_text(json.dumps(safe_json(gepa_kwargs), indent=2), encoding="utf-8") - - gepa = GEPA(**gepa_kwargs) - optimized_agent = gepa.compile(agent, trainset=trainset, valset=devset) - results = optimized_agent.detailed_results - timings["gepa_compile_s"] = time.perf_counter() - t_gepa - - # Final candidates summary (still useful) - candidates = [] - for i, cand in enumerate(results.candidates): - instr = cand.get_instruction_text() - candidates.append( - { - "candidate_id": i, - "instruction_hash": sha256_text(instr), - "instruction_text": instr, - 
"val_score": results.val_aggregate_scores[i], - "discovered_at_metric_call": results.discovery_eval_counts[i], - "parents": results.parents[i], - } - ) - (args.output_dir / "gepa_candidates.json").write_text(json.dumps(candidates, indent=2), encoding="utf-8") - - # Pareto - best_ids = set().union(*results.per_val_instance_best_candidates) - with open(args.output_dir / "gepa_pareto.txt", "w", encoding="utf-8") as f: - f.write("GEPA Pareto Frontier\n====================\n\n") - for i in sorted(best_ids, key=lambda i: results.val_aggregate_scores[i], reverse=True): - f.write(f"Candidate {i} | score={results.val_aggregate_scores[i]:.3f}\n") - f.write("-" * 40 + "\n") - f.write(results.candidates[i].get_instruction_text() + "\n\n") - - final_instr = optimized_agent.get_instruction_text() - (args.output_dir / "optimized_instructions.txt").write_text(final_instr, encoding="utf-8") - - # Scores file (explicit: which examples and how computed) - scores_payload = { - "run_id": run_id, - "score_definition": score_def, - "dataset_split": { - "train_ids": sorted(train_ids), - "dev_ids": sorted(dev_ids), - }, - "baseline": { - "bfcl_valid_rate_over_all_examples": baseline_valid_rate, - "examples_used": [e.test_id for e in examples], - "valid_count": baseline_valid, - "total_count": baseline_total, - }, - "gepa": { - "objective": "final (0.9*hard_valid + 0.1*soft) aggregated over dev set by GEPA internals", - "val_aggregate_scores": safe_json(results.val_aggregate_scores), - "candidate_count": len(results.candidates), - }, - "note": "For per-evaluation, per-test, per-step details see metric_calls.jsonl (append-only).", - } - (args.output_dir / "scores.json").write_text(json.dumps(scores_payload, indent=2), encoding="utf-8") - - # Metadata + timings - timings["total_wall_s"] = time.perf_counter() - overall_t0 - (args.output_dir / "timings.json").write_text(json.dumps({"run_id": run_id, **timings}, indent=2), encoding="utf-8") - - meta = { - "run_id": run_id, - "baseline_bfcl_valid_rate": baseline_valid_rate, - "final_score": max(results.val_aggregate_scores) if results.val_aggregate_scores else None, - "total_metric_calls": results.total_metric_calls, - "num_full_val_evals": results.num_full_val_evals, - "seed": results.seed, - } - (args.output_dir / "optimization_metadata.json").write_text(json.dumps(meta, indent=2), encoding="utf-8") - - print(f"[{utc_now_iso()}] Done. 
See {args.output_dir}/run_manifest.json, scores.json, metric_calls.jsonl") - - finally: - # Restore streams and close file - sys.stdout.flush() - sys.stderr.flush() - sys.stdout = real_out - sys.stderr = real_err - console_log_f.close() - - -if __name__ == "__main__": - main() From a686189d8da9a0ac719d4c030d17f5b606032b2d Mon Sep 17 00:00:00 2001 From: Parth Kotwal Date: Mon, 19 Jan 2026 02:12:39 -0800 Subject: [PATCH 23/23] Updated candidate analysis to take in output_dir argument and easy to run --- .../gepa_analysis/candidate_snapshots.py | 32 +++++++++++++-- experiments/gepa_analysis/prompt_diff.py | 32 +++++++++++++-- experiments/gepa_analysis/prompt_timeline.py | 39 ++++++++++++++++--- experiments/gepa_analysis/run_all.py | 35 +++++++++++++++++ 4 files changed, 125 insertions(+), 13 deletions(-) create mode 100644 experiments/gepa_analysis/run_all.py diff --git a/experiments/gepa_analysis/candidate_snapshots.py b/experiments/gepa_analysis/candidate_snapshots.py index ad9ae98..8f22c01 100644 --- a/experiments/gepa_analysis/candidate_snapshots.py +++ b/experiments/gepa_analysis/candidate_snapshots.py @@ -1,6 +1,7 @@ -import json from pathlib import Path -from datetime import datetime +import argparse +import json + import pandas as pd @@ -55,9 +56,32 @@ def build_candidate_prompt_table(df: pd.DataFrame) -> pd.DataFrame: return candidate_df.sort_values("first_seen_ts").reset_index(drop=True) +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Build candidate prompt table.") + parser.add_argument( + "--output-dir", + type=str, + default="1-14-prefinal", + help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).", + ) + return parser.parse_args() + +def resolve_run_dir(output_dir_arg: str) -> Path: + arg_path = Path(output_dir_arg) + parts = arg_path.parts + for idx, part in enumerate(parts[:-1]): + if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl": + return arg_path + if part == "outputs" and parts[idx + 1] == "gepa_analysis": + return Path("./outputs/gepa_on_bfcl") / arg_path.name + return Path("./outputs/gepa_on_bfcl") / output_dir_arg + + def main(): - run_dir = Path("./outputs/gepa_on_bfcl/1-14-prefinal") - output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") + args = parse_args() + run_dir = resolve_run_dir(args.output_dir) + run_name = run_dir.name + output_dir = Path("./outputs/gepa_analysis") / run_name output_dir.mkdir(parents=True, exist_ok=True) snapshots_path = Path(run_dir / "candidate_snapshots.jsonl") diff --git a/experiments/gepa_analysis/prompt_diff.py b/experiments/gepa_analysis/prompt_diff.py index 9fe6f41..8ceda5f 100644 --- a/experiments/gepa_analysis/prompt_diff.py +++ b/experiments/gepa_analysis/prompt_diff.py @@ -1,6 +1,8 @@ -import pandas as pd -from pathlib import Path import difflib +from pathlib import Path +import argparse + +import pandas as pd def unified_prompt_diff(base_text: str, new_text: str) -> str: @@ -17,8 +19,30 @@ def unified_prompt_diff(base_text: str, new_text: str) -> str: return "\n".join(diff) +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Generate prompt diffs.") + parser.add_argument( + "--output-dir", + type=str, + default="1-14-prefinal", + help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).", + ) + return parser.parse_args() + +def resolve_analysis_dir(output_dir_arg: str) -> Path: + arg_path = Path(output_dir_arg) + parts = arg_path.parts + for idx, 
part in enumerate(parts[:-1]): + if part == "outputs" and parts[idx + 1] == "gepa_analysis": + return arg_path + if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl": + return Path("./outputs/gepa_analysis") / arg_path.name + return Path("./outputs/gepa_analysis") / output_dir_arg + + def main(): - output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") + args = parse_args() + output_dir = resolve_analysis_dir(args.output_dir) df = pd.read_csv(output_dir / "candidate_snaps.csv") output_md = Path(output_dir / "prompt_diffs.md") @@ -78,4 +102,4 @@ def write_section(title, base, other): ) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/experiments/gepa_analysis/prompt_timeline.py b/experiments/gepa_analysis/prompt_timeline.py index 90777ac..ca60f7b 100644 --- a/experiments/gepa_analysis/prompt_timeline.py +++ b/experiments/gepa_analysis/prompt_timeline.py @@ -1,7 +1,9 @@ -import matplotlib.pyplot as plt +import numpy as np from pathlib import Path +import argparse + +import matplotlib.pyplot as plt import pandas as pd -import numpy as np def plot_prompt_search_timeline(candidate_df: pd.DataFrame, output_dir: Path): # Add discovery order @@ -89,6 +91,33 @@ def plot_prompt_search_timeline(candidate_df: pd.DataFrame, output_dir: Path): plt.savefig(output_dir / "prompt_search_timeline.png", dpi=150) plt.close() -output_dir = Path("./outputs/gepa_analysis/1-14-prefinal") -candidate_df = pd.read_csv(output_dir / "candidate_snaps.csv") -plot_prompt_search_timeline(candidate_df, output_dir) \ No newline at end of file +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Plot prompt search timeline.") + parser.add_argument( + "--output-dir", + type=str, + default="1-14-prefinal", + help="Run directory name or path under outputs/gepa_on_bfcl (analysis lives in outputs/gepa_analysis).", + ) + return parser.parse_args() + +def resolve_analysis_dir(output_dir_arg: str) -> Path: + arg_path = Path(output_dir_arg) + parts = arg_path.parts + for idx, part in enumerate(parts[:-1]): + if part == "outputs" and parts[idx + 1] == "gepa_analysis": + return arg_path + if part == "outputs" and parts[idx + 1] == "gepa_on_bfcl": + return Path("./outputs/gepa_analysis") / arg_path.name + return Path("./outputs/gepa_analysis") / output_dir_arg + + +def main(): + args = parse_args() + output_dir = resolve_analysis_dir(args.output_dir) + candidate_df = pd.read_csv(output_dir / "candidate_snaps.csv") + plot_prompt_search_timeline(candidate_df, output_dir) + + +if __name__ == "__main__": + main() diff --git a/experiments/gepa_analysis/run_all.py b/experiments/gepa_analysis/run_all.py new file mode 100644 index 0000000..d8f0108 --- /dev/null +++ b/experiments/gepa_analysis/run_all.py @@ -0,0 +1,35 @@ +import argparse +import subprocess +import sys +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run all GEPA analysis steps.") + parser.add_argument( + "--output-dir", + type=str, + default="1-14-prefinal", + help="Run directory name or path under outputs/gepa_on_bfcl.", + ) + return parser.parse_args() + + +def run_step(script: str, output_dir: str) -> None: + result = subprocess.run( + [sys.executable, script, "--output-dir", output_dir], + check=False, + ) + if result.returncode != 0: + raise SystemExit(result.returncode) + + +def main() -> None: + args = parse_args() + run_step("experiments/gepa_analysis/candidate_snapshots.py", args.output_dir) + 
run_step("experiments/gepa_analysis/prompt_diff.py", args.output_dir) + run_step("experiments/gepa_analysis/prompt_timeline.py", args.output_dir) + + +if __name__ == "__main__": + main()