From ba5e382d4d7d3ae684c9c0d9635fae7d0c0e749f Mon Sep 17 00:00:00 2001 From: Huiyun Peng Date: Tue, 20 Jan 2026 19:28:02 -0500 Subject: [PATCH 1/5] benchmark agent v1 --- .env.example | 8 + agents/__init__.py | 1 + agents/benchmark.py | 57 +++++ evaluate.py | 18 +- requirements.txt | 3 + tools/__init__.py | 3 +- tools/benchmark.py | 443 +++++++++++++++++++++++++++++++++ workflows/complete_pipeline.py | 65 ++++- 8 files changed, 585 insertions(+), 13 deletions(-) create mode 100644 agents/benchmark.py create mode 100644 tools/benchmark.py diff --git a/.env.example b/.env.example index 888a0fe..d25516b 100644 --- a/.env.example +++ b/.env.example @@ -36,3 +36,11 @@ LANGSMITH_PROJECT="Agentic Code Optimization" # ============================================================================ # Set to true to enable verbose logging # LANGCHAIN_DEBUG=true + +# ============================================================================ +# Benchmark Configuration (optional) +# ============================================================================ +# Command to run baseline/post benchmarks +# Example: +# make sure to run docker compose up -d first +BENCHMARK_CMD="docker run --rm --platform linux/amd64 --network socialnetwork_default -e max_user_index=962 -v DeathStarBench/socialNetwork/wrk2/scripts/social-network/mixed-workload.lua:/scripts/sn_mixed.lua:ro deathstarbench/wrk2-client /usr/local/bin/wrk -D exp -t 12 -c 400 -d 300 -L -s /scripts/sn_mixed.lua http://nginx-thrift:8080/wrk2-api/home-timeline/read -R 200" diff --git a/agents/__init__.py b/agents/__init__.py index dc8919e..f281c37 100644 --- a/agents/__init__.py +++ b/agents/__init__.py @@ -1,5 +1,6 @@ from .analyzer import AnalysisReport, AnalyzerAgent from .base import BaseAgent +from .benchmark import BenchmarkAgent, BenchmarkReport from .correctness import (AnalysisResult, CorrectnessVerdict, orchestrate_code_correctness) from .optimizer import OptimizationReport, OptimizerAgent diff --git a/agents/benchmark.py b/agents/benchmark.py new file mode 100644 index 0000000..924acc1 --- /dev/null +++ b/agents/benchmark.py @@ -0,0 +1,57 @@ +"""BenchmarkAgent for running external benchmark commands.""" + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, Field + +from agents.base import BaseAgent +from tools.benchmark import run_benchmark_comparison + + +class BenchmarkReport(BaseModel): + """Structured output for benchmark execution results.""" + + before: dict = Field(description="Benchmark result before optimization") + after: dict = Field(description="Benchmark result after optimization") + comparison: dict = Field(description="Comparison metrics between before and after") + report_paths: Optional[dict] = Field( + default=None, description="Paths to written benchmark reports if generated" + ) + charts: Optional[dict] = Field( + default=None, description="Chart render results if generated" + ) + summary: str = Field( + description="Short summary of the benchmark outcome and any errors" + ) + + +class BenchmarkAgent(BaseAgent): + """Agent that executes external benchmark commands and reports results.""" + + prompt = """You are an automation agent that runs benchmark commands. 
+ +Input is JSON with: +- command_env: env var for benchmark command (default: BENCHMARK_CMD) +- workdir: working directory for the command +- timeout_seconds: timeout in seconds +- output_dir: optional directory to write benchmark artifacts +- render_charts: whether to render benchmark charts (requires matplotlib) + +## Workflow +1) Call run_benchmark_comparison with the provided inputs. +2) Produce a BenchmarkReport JSON response. +3) If the command errors or times out, reflect it in the summary. + +## Output Constraints +- Always return a single JSON object that conforms to BenchmarkReport. +- Do not emit tool calls in the final response. +""" + + structured_output_type = BenchmarkReport + return_state_field = "benchmark_report" + + tools = [ + run_benchmark_comparison, + ] diff --git a/evaluate.py b/evaluate.py index aadfb2a..75ff064 100644 --- a/evaluate.py +++ b/evaluate.py @@ -14,6 +14,7 @@ import argparse import asyncio +import json import logging import sys import time @@ -28,8 +29,8 @@ # Import all agents from agents import (AnalyzerAgent, BaseAgent, BehaviorSummarizerAgent, - ComponentSummarizerAgent, EnvironmentSummarizerAgent, - OptimizerAgent) + BenchmarkAgent, ComponentSummarizerAgent, + EnvironmentSummarizerAgent, OptimizerAgent) from utils import RunManager # Import all workflows from workflows import orchestrate_complete_pipeline, orchestrate_summarizers @@ -41,6 +42,7 @@ "BehaviorSummarizerAgent": BehaviorSummarizerAgent, "ComponentSummarizerAgent": ComponentSummarizerAgent, "EnvironmentSummarizerAgent": EnvironmentSummarizerAgent, + "BenchmarkAgent": BenchmarkAgent, } WORKFLOWS: dict[str, Callable] = { @@ -102,7 +104,17 @@ async def evaluate_agent(agent_class: Type[BaseAgent], repo_path: str) -> None: try: logger.info(f"Starting agent execution for repository: {repo_path_obj.absolute()}") start_time = time.time() - result = await agent.run(str(repo_path_obj.absolute())) + if agent_class is BenchmarkAgent: + benchmark_dir = run_dir / "benchmark" + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": str(repo_path_obj.absolute()), + "output_dir": str(benchmark_dir), + "render_charts": True, + } + result = await agent.run(json.dumps(payload)) + else: + result = await agent.run(str(repo_path_obj.absolute())) execution_time = time.time() - start_time logger.info(f"Agent execution completed successfully, result length: {len(result) if result else 0}") logger.info(f"RESULT:\n{result}") diff --git a/requirements.txt b/requirements.txt index f2c458d..5874b04 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,6 @@ beautilog>=0.2.4 # Utilities python-json-logger>=2.0.0 colorama>=0.4.6 + +# Benchmark Visualization +matplotlib>=3.8.0 diff --git a/tools/__init__.py b/tools/__init__.py index ae59104..25e5d5d 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -1,7 +1,8 @@ """Tools for agents.""" from .analysis import (build_analysis_bundle, read_code_snippet, read_file, - search_codebase) + search_codebase) +from .benchmark import run_benchmark_command, run_benchmark_comparison from .behavior import (analyze_code_complexity, analyze_code_structure, analyze_data_dependencies, analyze_error_handling_strategy, diff --git a/tools/benchmark.py b/tools/benchmark.py new file mode 100644 index 0000000..d741508 --- /dev/null +++ b/tools/benchmark.py @@ -0,0 +1,443 @@ +"""Tools for running external benchmark commands.""" + +from __future__ import annotations + +import asyncio +import json +import os +import re +import shlex +import subprocess +import time +from 
pathlib import Path +from typing import Any, Dict, List, Optional, Sequence + +from langchain_core.tools import tool + + +def _normalize_command(command: str | Sequence[str]) -> List[str]: + if isinstance(command, str): + return shlex.split(command) + return list(command) + +_TIME_UNITS_MS = { + "us": 0.001, + "µs": 0.001, + "ms": 1.0, + "s": 1000.0, +} + +_DATA_UNITS_MB = { + "KB": 1.0 / 1024.0, + "MB": 1.0, + "GB": 1024.0, +} + + +def _to_ms(value: float, unit: str) -> Optional[float]: + factor = _TIME_UNITS_MS.get(unit) + if factor is None: + return None + return value * factor + + +def _to_mb(value: float, unit: str) -> Optional[float]: + factor = _DATA_UNITS_MB.get(unit) + if factor is None: + return None + return value * factor + + +def parse_wrk_output(output: str) -> Dict[str, Any]: + """Parse wrk/wrk2 output and extract summary metrics.""" + metrics: Dict[str, Any] = { + "latency_ms": {}, + "percentiles_ms": {}, + "socket_errors": {}, + } + + latency_line = re.search( + r"Latency\s+([\d\.]+)([a-zµ]+)\s+([\d\.]+)([a-zµ]+)\s+([\d\.]+)([a-zµ]+)", + output, + re.IGNORECASE, + ) + if latency_line: + avg = _to_ms(float(latency_line.group(1)), latency_line.group(2)) + stdev = _to_ms(float(latency_line.group(3)), latency_line.group(4)) + p99 = _to_ms(float(latency_line.group(5)), latency_line.group(6)) + if avg is not None: + metrics["latency_ms"]["avg"] = avg + if stdev is not None: + metrics["latency_ms"]["stdev"] = stdev + if p99 is not None: + metrics["latency_ms"]["p99"] = p99 + + for line in output.splitlines(): + match = re.match(r"^\s*([\d\.]+)%\s+([\d\.]+)([a-zµ]+)\s*$", line) + if not match: + continue + percentile = match.group(1) + value = _to_ms(float(match.group(2)), match.group(3)) + if value is not None: + metrics["percentiles_ms"][percentile] = value + + rps_match = re.search(r"Requests/sec:\s*([\d\.]+)", output) + if rps_match: + metrics["requests_per_sec"] = float(rps_match.group(1)) + + transfer_match = re.search(r"Transfer/sec:\s*([\d\.]+)\s*([KMG]B)", output) + if transfer_match: + transfer_mb = _to_mb(float(transfer_match.group(1)), transfer_match.group(2)) + if transfer_mb is not None: + metrics["transfer_per_sec_mb"] = transfer_mb + + socket_match = re.search( + r"Socket errors:\s*connect\s+(\d+),\s*read\s+(\d+),\s*write\s+(\d+),\s*timeout\s+(\d+)", + output, + ) + if socket_match: + metrics["socket_errors"] = { + "connect": int(socket_match.group(1)), + "read": int(socket_match.group(2)), + "write": int(socket_match.group(3)), + "timeout": int(socket_match.group(4)), + } + + summary_match = re.search( + r"(\d+)\s+requests in\s+([\d\.]+)([smh]),\s+([\d\.]+)([KMG]B)\s+read", + output, + ) + if summary_match: + metrics["total_requests"] = int(summary_match.group(1)) + duration = float(summary_match.group(2)) + duration_unit = summary_match.group(3) + if duration_unit == "s": + metrics["duration_seconds"] = duration + elif duration_unit == "m": + metrics["duration_seconds"] = duration * 60.0 + elif duration_unit == "h": + metrics["duration_seconds"] = duration * 3600.0 + + data_mb = _to_mb(float(summary_match.group(4)), summary_match.group(5)) + if data_mb is not None: + metrics["data_read_mb"] = data_mb + + return metrics + + +def compare_benchmark_metrics(before: Dict[str, Any], after: Dict[str, Any]) -> Dict[str, Any]: + """Compare benchmark metrics before vs after.""" + def delta(before_val: Optional[float], after_val: Optional[float]) -> Dict[str, Any]: + if before_val is None or after_val is None: + return {"before": before_val, "after": after_val, "delta": 
None, "pct_change": None} + diff = after_val - before_val + pct = (diff / before_val * 100.0) if before_val != 0 else None + return { + "before": before_val, + "after": after_val, + "delta": diff, + "pct_change": pct, + } + + before_metrics = before.get("metrics", {}) + after_metrics = after.get("metrics", {}) + + comparison = { + "requests_per_sec": delta(before_metrics.get("requests_per_sec"), after_metrics.get("requests_per_sec")), + "transfer_per_sec_mb": delta( + before_metrics.get("transfer_per_sec_mb"), + after_metrics.get("transfer_per_sec_mb"), + ), + "latency_avg_ms": delta( + before_metrics.get("latency_ms", {}).get("avg"), + after_metrics.get("latency_ms", {}).get("avg"), + ), + "latency_p50_ms": delta( + before_metrics.get("percentiles_ms", {}).get("50.000"), + after_metrics.get("percentiles_ms", {}).get("50.000"), + ), + "latency_p90_ms": delta( + before_metrics.get("percentiles_ms", {}).get("90.000"), + after_metrics.get("percentiles_ms", {}).get("90.000"), + ), + "latency_p99_ms": delta( + before_metrics.get("percentiles_ms", {}).get("99.000"), + after_metrics.get("percentiles_ms", {}).get("99.000"), + ), + } + return comparison + + +def write_benchmark_report( + output_dir: Path, + before: Dict[str, Any], + after: Dict[str, Any], + comparison: Dict[str, Any], +) -> Dict[str, Any]: + """Write benchmark summary artifacts to disk.""" + output_dir.mkdir(parents=True, exist_ok=True) + report_path = output_dir / "benchmark_report.json" + comparison_path = output_dir / "benchmark_comparison.json" + + report_path.write_text(json.dumps({"before": before, "after": after}, indent=2), encoding="utf-8") + comparison_path.write_text(json.dumps(comparison, indent=2), encoding="utf-8") + + return { + "benchmark_report": report_path.as_posix(), + "benchmark_comparison": comparison_path.as_posix(), + } + + +def render_benchmark_charts( + output_dir: Path, + comparison: Dict[str, Any], + before: Dict[str, Any], + after: Dict[str, Any], +) -> Dict[str, Any]: + """Render benchmark comparison charts (optional, requires matplotlib).""" + try: + import matplotlib.pyplot as plt # type: ignore + except Exception as exc: + return {"error": f"matplotlib_not_installed: {exc}", "charts": []} + + charts = [] + output_dir.mkdir(parents=True, exist_ok=True) + + latency_items = [ + ("p50", comparison.get("latency_p50_ms")), + ("p90", comparison.get("latency_p90_ms")), + ("p99", comparison.get("latency_p99_ms")), + ("avg", comparison.get("latency_avg_ms")), + ] + labels = [ + name + for name, item in latency_items + if item and item["before"] is not None and item["after"] is not None + ] + before_vals = [ + item["before"] + for name, item in latency_items + if item and item["before"] is not None and item["after"] is not None + ] + after_vals = [ + item["after"] + for name, item in latency_items + if item and item["before"] is not None and item["after"] is not None + ] + + if labels: + fig, ax = plt.subplots(figsize=(6, 4)) + x = range(len(labels)) + ax.bar([i - 0.2 for i in x], before_vals, width=0.4, label="Before") + ax.bar([i + 0.2 for i in x], after_vals, width=0.4, label="After") + ax.set_xticks(list(x)) + ax.set_xticklabels(labels) + ax.set_ylabel("Latency (ms)") + ax.set_title("Latency Comparison") + ax.legend() + chart_path = output_dir / "latency_comparison.png" + fig.tight_layout() + fig.savefig(chart_path, dpi=140) + plt.close(fig) + charts.append(chart_path.as_posix()) + + throughput_items = [ + ("rps", comparison.get("requests_per_sec")), + ("MB/s", comparison.get("transfer_per_sec_mb")), + ] + 
labels = [ + name + for name, item in throughput_items + if item and item["before"] is not None and item["after"] is not None + ] + before_vals = [ + item["before"] + for name, item in throughput_items + if item and item["before"] is not None and item["after"] is not None + ] + after_vals = [ + item["after"] + for name, item in throughput_items + if item and item["before"] is not None and item["after"] is not None + ] + + if labels: + fig, ax = plt.subplots(figsize=(6, 4)) + x = range(len(labels)) + ax.bar([i - 0.2 for i in x], before_vals, width=0.4, label="Before") + ax.bar([i + 0.2 for i in x], after_vals, width=0.4, label="After") + ax.set_xticks(list(x)) + ax.set_xticklabels(labels) + ax.set_ylabel("Throughput") + ax.set_title("Throughput Comparison") + ax.legend() + chart_path = output_dir / "throughput_comparison.png" + fig.tight_layout() + fig.savefig(chart_path, dpi=140) + plt.close(fig) + charts.append(chart_path.as_posix()) + + return {"charts": charts} + + +def _run_benchmark_command_impl( + command: str | List[str], + command_env: str, + workdir: str, + timeout_seconds: int, + output_dir: str, + label: str, +) -> Dict[str, Any]: + if not command: + command = os.environ.get(command_env, "") + + use_shell = isinstance(command, str) + argv = _normalize_command(command) if not use_shell else [] + if (use_shell and not command) or (not use_shell and not argv): + return {"error": "empty_command", "command_env": command_env} + + cwd = Path(workdir).resolve() if workdir else Path.cwd().resolve() + if not cwd.exists(): + return {"error": "workdir_not_found", "workdir": str(cwd)} + + start = time.monotonic() + timed_out = False + try: + result = subprocess.run( + command if use_shell else argv, + capture_output=True, + text=True, + cwd=str(cwd), + timeout=timeout_seconds, + shell=use_shell, + executable="/bin/bash" if use_shell else None, + ) + stdout_text = result.stdout or "" + stderr_text = result.stderr or "" + exit_code = result.returncode + except subprocess.TimeoutExpired as exc: + timed_out = True + stdout_text = exc.stdout or "" + stderr_text = exc.stderr or "" + exit_code = None + except FileNotFoundError: + return {"error": "command_not_found", "command": argv} + except Exception as exc: + return {"error": "command_failed", "detail": str(exc)} + + duration = time.monotonic() - start + + metrics = parse_wrk_output(stdout_text) or parse_wrk_output(stderr_text) + + payload: Dict[str, Any] = { + "command": command if use_shell else argv, + "workdir": str(cwd), + "exit_code": exit_code, + "timed_out": timed_out, + "duration_seconds": round(duration, 3), + "stdout": stdout_text, + "stderr": stderr_text, + "metrics": metrics, + } + + if output_dir: + out_dir = Path(output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + stdout_path = out_dir / f"{label}_stdout.txt" + stderr_path = out_dir / f"{label}_stderr.txt" + stdout_path.write_text(stdout_text or "", encoding="utf-8") + stderr_path.write_text(stderr_text or "", encoding="utf-8") + payload["stdout_path"] = stdout_path.as_posix() + payload["stderr_path"] = stderr_path.as_posix() + + return payload + + +@tool +async def run_benchmark_command( + command: str | List[str] = "", + command_env: str = "BENCHMARK_CMD", + workdir: str = "", + timeout_seconds: int = 1800, + output_dir: str = "", + label: str = "benchmark", +) -> str: + """Run an external benchmark command and return stdout/stderr metadata. + + Args: + command (str | List[str]): Command string or argv list to execute. If empty, loads from command_env. 
+ command_env (str): Environment variable containing the command string. Default: BENCHMARK_CMD. + workdir (str): Working directory for the command. Defaults to current directory. + timeout_seconds (int): Timeout in seconds. Default: 1800. + output_dir (str): Optional directory to write stdout/stderr files. + label (str): Label prefix for output files when output_dir is set. + + Returns: + str: JSON string with execution metadata and outputs. + """ + payload = await asyncio.to_thread( + _run_benchmark_command_impl, + command, + command_env, + workdir, + timeout_seconds, + output_dir, + label, + ) + return json.dumps(payload) + + +@tool +async def run_benchmark_comparison( + command_env: str = "BENCHMARK_CMD", + workdir: str = "", + timeout_seconds: int = 1800, + output_dir: str = "", + render_charts: bool = True, +) -> str: + """Run benchmark commands before/after and return comparison payload.""" + command_before = os.environ.get(command_env, "") + command_after = os.environ.get(command_env, "") + + before = await asyncio.to_thread( + _run_benchmark_command_impl, + command_before, + command_env, + workdir, + timeout_seconds, + output_dir, + "before", + ) + after = await asyncio.to_thread( + _run_benchmark_command_impl, + command_after, + command_env, + workdir, + timeout_seconds, + output_dir, + "after", + ) + + if "error" in before: + return json.dumps({"error": "before_failed", "before": before}) + if "error" in after: + return json.dumps({"error": "after_failed", "after": after}) + + comparison = compare_benchmark_metrics(before, after) + payload: Dict[str, Any] = { + "before": before, + "after": after, + "comparison": comparison, + } + + if output_dir: + report_paths = write_benchmark_report( + Path(output_dir), before, after, comparison + ) + payload["report_paths"] = report_paths + if render_charts: + payload["charts"] = render_benchmark_charts( + Path(output_dir), comparison, before, after + ) + + return json.dumps(payload) diff --git a/workflows/complete_pipeline.py b/workflows/complete_pipeline.py index 5e8089b..46b9a58 100644 --- a/workflows/complete_pipeline.py +++ b/workflows/complete_pipeline.py @@ -1,11 +1,12 @@ """LangGraph-based orchestrator for the complete optimization pipeline. -This module implements a four-phase iterative pipeline: +This module implements a five-phase iterative pipeline: 1. PHASE 1 - SUMMARIZATION: Run environment, component, and behavior summarizers in parallel 2. PHASE 2 - ANALYSIS: Analyze summaries to produce optimization guidance with static signals 3. PHASE 3 - OPTIMIZATION: Apply safe code improvements based on analysis 4. PHASE 4 - CORRECTNESS CHECK: Validate applied changes for correctness and quality -5. LOOP: Conditionally loop back to summarization or exit based on iteration count +5. PHASE 5 - BENCHMARK: Run external benchmark command if configured +6. LOOP: Conditionally loop back to summarization or exit based on iteration count The pipeline uses temporary files for inter-agent communication and manages state flow through a LangGraph workflow with iterative refinement. 
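For orientation between the tool implementation above and the pipeline changes that follow, the sketch below (not part of any commit in this series) feeds the new parse_wrk_output helper a made-up wrk2-style summary to show the metric keys it is expected to produce; the sample output text and variable names are illustrative only.

# Illustrative sketch, not part of the patch: exercise the wrk2 parser added in
# tools/benchmark.py. The sample output below is fabricated to match wrk2's format.
from tools.benchmark import parse_wrk_output

SAMPLE_WRK2_OUTPUT = """\
Running 30s test @ http://nginx-thrift:8080/wrk2-api/home-timeline/read
  12 threads and 400 connections
  Thread Stats   Avg      Stdev     Max   +/- Stdev
    Latency     4.12ms    2.30ms   21.45ms   86.57%
  50.000%    3.90ms
  90.000%    7.10ms
  99.000%   15.80ms
  6000 requests in 30.00s, 2.10MB read
Requests/sec:    200.00
Transfer/sec:     71.68KB
"""

metrics = parse_wrk_output(SAMPLE_WRK2_OUTPUT)
# Expected keys: latency_ms, percentiles_ms (keyed by "50.000"/"90.000"/"99.000"),
# requests_per_sec, transfer_per_sec_mb, total_requests, duration_seconds,
# data_read_mb, and socket_errors (empty here because no error line is present).
print(metrics["requests_per_sec"], metrics["percentiles_ms"]["99.000"])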
@@ -22,8 +23,9 @@ from langgraph.graph import END, START, StateGraph from langgraph.graph.state import CompiledStateGraph -from agents import (AnalysisReport, AnalyzerAgent, OptimizationReport, - OptimizerAgent, orchestrate_code_correctness) +from agents import (AnalysisReport, AnalyzerAgent, BenchmarkAgent, + BenchmarkReport, OptimizationReport, OptimizerAgent, + orchestrate_code_correctness) from .summary_orchestrator import orchestrate_summarizers @@ -107,9 +109,15 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: PipelinePhase( "correctness_check", None, - return_fields=["analysis_result", "correctness_verdict"], + return_fields=["analysis_result", "correctness_verdict", "correctness_report"], return_class=tuple, # Composite: (AnalysisResult, CorrectnessVerdict) ), + PipelinePhase( + "benchmark", + None, + agent_class=BenchmarkAgent, + return_class=BenchmarkReport, + ), ] # Dynamically build PipelineState from phase definitions @@ -237,6 +245,33 @@ async def optimize_node(state: PipelineState) -> dict: return {"optimization_report": result} +async def benchmark_node(state: PipelineState) -> dict: + """PHASE 5: Run benchmark command to measure performance impact. + + Uses BenchmarkAgent to execute an external benchmark command configured + via environment variable (BENCHMARK_CMD by default). The benchmark + runs in the repository root unless overridden. + + Args: + state: Pipeline state containing code_path + + Returns: + Dictionary with benchmark_report (JSON string) + """ + logger.info("PHASE 5: Running BenchmarkAgent") + agent = BenchmarkAgent() + + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + } + + result = await agent.run(json.dumps(payload)) + logger.info("PHASE 5: Benchmark complete") + + return {"benchmark_report": result} + + async def correctness_check_node(state: PipelineState) -> dict: """PHASE 4: Run code correctness workflow. @@ -289,6 +324,12 @@ async def correctness_check_node(state: PipelineState) -> dict: return { "analysis_result": correctness_state.get("analysis_result", ""), "correctness_verdict": correctness_state.get("correctness_verdict", ""), + "correctness_report": json.dumps( + { + "analysis_result": correctness_state.get("analysis_result", ""), + "correctness_verdict": correctness_state.get("correctness_verdict", ""), + } + ), } @@ -316,14 +357,15 @@ def should_continue_loop(state: PipelineState) -> Literal["summarization", str]: def build_complete_pipeline() -> CompiledStateGraph: - """Build the complete four-phase optimization pipeline with iteration loop. + """Build the complete five-phase optimization pipeline with iteration loop. Constructs a LangGraph StateGraph from phase definitions that orchestrates: 1. Summarization (parallel environment, component, behavior summaries) 2. Analysis (structured guidance based on summaries with static signals) 3. Optimization (apply safe code improvements) 4. Correctness Check (orchestrates pain analysis and structured verdict) - 5. Loop decision (conditionally loop back to summarization or end) + 5. Benchmark (run external benchmark command) + 6. 
Loop decision (conditionally loop back to summarization or end) The workflow follows a sequential DAG with conditional loop: START → summarization → analysis → optimization → correctness_check → (loop decision) → {summarization | END} @@ -332,7 +374,8 @@ def build_complete_pipeline() -> CompiledStateGraph: - summarization → summary_text - analysis → analysis_report - optimization → optimization_report - - correctness_check → pain_analysis_result, correctness_verdict + - correctness_check → analysis_result, correctness_verdict, correctness_report + - benchmark → benchmark_report Returns: Compiled LangGraph StateGraph ready for invocation @@ -346,6 +389,7 @@ def build_complete_pipeline() -> CompiledStateGraph: "analysis": analyze_node, "optimization": optimize_node, "correctness_check": correctness_check_node, + "benchmark": benchmark_node, } # Add nodes from phase definitions @@ -413,7 +457,8 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: 2. Analysis: Produces structured optimization guidance with static analysis 3. Optimization: Applies safe code improvements 4. Correctness Check: Validates applied changes for correctness and quality - 5. Loop: Conditionally iterates up to MAX_ITERATIONS times + 5. Benchmark: Runs external benchmark command if configured + 6. Loop: Conditionally iterates up to MAX_ITERATIONS times State fields are dynamically generated from phase definitions: - Phase outputs are automatically added to state based on agent.return_state_field @@ -431,6 +476,8 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: - optimization_report: Final OptimizationReport JSON from final iteration - analysis_result: Detailed correctness analysis from final iteration - correctness_verdict: Yes/No verdict from final iteration + - correctness_report: Combined correctness report JSON from final iteration + - benchmark_report: BenchmarkReport JSON from final iteration - iteration_count: Number of iterations completed """ logger.info(f"Starting complete optimization pipeline for {code_path}") From 3a1325a67a5e5d862260302b6c2198a517c50458 Mon Sep 17 00:00:00 2001 From: Huiyun Peng Date: Tue, 20 Jan 2026 19:28:39 -0500 Subject: [PATCH 2/5] benchmark agent version2 --- .env.example | 13 ++- agents/__init__.py | 1 - agents/benchmark.py | 57 ------------ evaluate.py | 24 ++--- tools/__init__.py | 2 +- workflows/complete_pipeline.py | 163 +++++++++++++++++++++------------ 6 files changed, 127 insertions(+), 133 deletions(-) delete mode 100644 agents/benchmark.py diff --git a/.env.example b/.env.example index d25516b..1e27917 100644 --- a/.env.example +++ b/.env.example @@ -42,5 +42,14 @@ LANGSMITH_PROJECT="Agentic Code Optimization" # ============================================================================ # Command to run baseline/post benchmarks # Example: -# make sure to run docker compose up -d first -BENCHMARK_CMD="docker run --rm --platform linux/amd64 --network socialnetwork_default -e max_user_index=962 -v DeathStarBench/socialNetwork/wrk2/scripts/social-network/mixed-workload.lua:/scripts/sn_mixed.lua:ro deathstarbench/wrk2-client /usr/local/bin/wrk -D exp -t 12 -c 400 -d 300 -L -s /scripts/sn_mixed.lua http://nginx-thrift:8080/wrk2-api/home-timeline/read -R 200" +export BENCHMARK_CMD="cd DeathStarBench/socialNetwork && \ +docker compose build user-service && \ +docker compose up -d && \ +docker run --rm --platform linux/amd64 \ + --network socialnetwork_default \ + -e max_user_index=962 \ + -v 
/Users/peng397/Desktop/agentic-code-optimization/DeathStarBench/socialNetwork/wrk2/scripts/social-network/mixed-workload.lua:/tmp/sn_mixed.lua:ro \ + deathstarbench/wrk2-client \ + /usr/local/bin/wrk -D exp -t 12 -c 400 -d 30 -L \ + -s /tmp/sn_mixed.lua \ + http://nginx-thrift:8080/wrk2-api/home-timeline/read -R 200" \ No newline at end of file diff --git a/agents/__init__.py b/agents/__init__.py index f281c37..dc8919e 100644 --- a/agents/__init__.py +++ b/agents/__init__.py @@ -1,6 +1,5 @@ from .analyzer import AnalysisReport, AnalyzerAgent from .base import BaseAgent -from .benchmark import BenchmarkAgent, BenchmarkReport from .correctness import (AnalysisResult, CorrectnessVerdict, orchestrate_code_correctness) from .optimizer import OptimizationReport, OptimizerAgent diff --git a/agents/benchmark.py b/agents/benchmark.py deleted file mode 100644 index 924acc1..0000000 --- a/agents/benchmark.py +++ /dev/null @@ -1,57 +0,0 @@ -"""BenchmarkAgent for running external benchmark commands.""" - -from __future__ import annotations - -from typing import Optional - -from pydantic import BaseModel, Field - -from agents.base import BaseAgent -from tools.benchmark import run_benchmark_comparison - - -class BenchmarkReport(BaseModel): - """Structured output for benchmark execution results.""" - - before: dict = Field(description="Benchmark result before optimization") - after: dict = Field(description="Benchmark result after optimization") - comparison: dict = Field(description="Comparison metrics between before and after") - report_paths: Optional[dict] = Field( - default=None, description="Paths to written benchmark reports if generated" - ) - charts: Optional[dict] = Field( - default=None, description="Chart render results if generated" - ) - summary: str = Field( - description="Short summary of the benchmark outcome and any errors" - ) - - -class BenchmarkAgent(BaseAgent): - """Agent that executes external benchmark commands and reports results.""" - - prompt = """You are an automation agent that runs benchmark commands. - -Input is JSON with: -- command_env: env var for benchmark command (default: BENCHMARK_CMD) -- workdir: working directory for the command -- timeout_seconds: timeout in seconds -- output_dir: optional directory to write benchmark artifacts -- render_charts: whether to render benchmark charts (requires matplotlib) - -## Workflow -1) Call run_benchmark_comparison with the provided inputs. -2) Produce a BenchmarkReport JSON response. -3) If the command errors or times out, reflect it in the summary. - -## Output Constraints -- Always return a single JSON object that conforms to BenchmarkReport. -- Do not emit tool calls in the final response. 
-""" - - structured_output_type = BenchmarkReport - return_state_field = "benchmark_report" - - tools = [ - run_benchmark_comparison, - ] diff --git a/evaluate.py b/evaluate.py index 75ff064..8ba896f 100644 --- a/evaluate.py +++ b/evaluate.py @@ -29,7 +29,7 @@ # Import all agents from agents import (AnalyzerAgent, BaseAgent, BehaviorSummarizerAgent, - BenchmarkAgent, ComponentSummarizerAgent, + ComponentSummarizerAgent, EnvironmentSummarizerAgent, OptimizerAgent) from utils import RunManager # Import all workflows @@ -42,7 +42,6 @@ "BehaviorSummarizerAgent": BehaviorSummarizerAgent, "ComponentSummarizerAgent": ComponentSummarizerAgent, "EnvironmentSummarizerAgent": EnvironmentSummarizerAgent, - "BenchmarkAgent": BenchmarkAgent, } WORKFLOWS: dict[str, Callable] = { @@ -104,17 +103,7 @@ async def evaluate_agent(agent_class: Type[BaseAgent], repo_path: str) -> None: try: logger.info(f"Starting agent execution for repository: {repo_path_obj.absolute()}") start_time = time.time() - if agent_class is BenchmarkAgent: - benchmark_dir = run_dir / "benchmark" - payload = { - "command_env": "BENCHMARK_CMD", - "workdir": str(repo_path_obj.absolute()), - "output_dir": str(benchmark_dir), - "render_charts": True, - } - result = await agent.run(json.dumps(payload)) - else: - result = await agent.run(str(repo_path_obj.absolute())) + result = await agent.run(str(repo_path_obj.absolute())) execution_time = time.time() - start_time logger.info(f"Agent execution completed successfully, result length: {len(result) if result else 0}") logger.info(f"RESULT:\n{result}") @@ -193,7 +182,14 @@ async def evaluate_workflow(workflow_func: Callable, repo_path: str) -> None: try: logger.info(f"Starting workflow execution for repository: {repo_path_obj.absolute()}") start_time = time.time() - result = await workflow_func(str(repo_path_obj.absolute())) + if workflow_func.__name__ == "orchestrate_complete_pipeline": + benchmark_dir = run_dir / "benchmark" + result = await workflow_func( + str(repo_path_obj.absolute()), + benchmark_output_dir=str(benchmark_dir), + ) + else: + result = await workflow_func(str(repo_path_obj.absolute())) execution_time = time.time() - start_time logger.info(f"Workflow execution completed successfully") logger.info(f"RESULT:\n{result}") diff --git a/tools/__init__.py b/tools/__init__.py index 25e5d5d..ed9d742 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -2,7 +2,7 @@ from .analysis import (build_analysis_bundle, read_code_snippet, read_file, search_codebase) -from .benchmark import run_benchmark_command, run_benchmark_comparison +from .benchmark import run_benchmark_command from .behavior import (analyze_code_complexity, analyze_code_structure, analyze_data_dependencies, analyze_error_handling_strategy, diff --git a/workflows/complete_pipeline.py b/workflows/complete_pipeline.py index 46b9a58..0aae513 100644 --- a/workflows/complete_pipeline.py +++ b/workflows/complete_pipeline.py @@ -1,12 +1,13 @@ """LangGraph-based orchestrator for the complete optimization pipeline. -This module implements a five-phase iterative pipeline: -1. PHASE 1 - SUMMARIZATION: Run environment, component, and behavior summarizers in parallel -2. PHASE 2 - ANALYSIS: Analyze summaries to produce optimization guidance with static signals -3. PHASE 3 - OPTIMIZATION: Apply safe code improvements based on analysis -4. PHASE 4 - CORRECTNESS CHECK: Validate applied changes for correctness and quality -5. PHASE 5 - BENCHMARK: Run external benchmark command if configured -6. 
LOOP: Conditionally loop back to summarization or exit based on iteration count +This module implements a six-phase iterative pipeline: +1. PHASE 1 - BENCHMARK (BEFORE): Run external benchmark command to capture baseline +2. PHASE 2 - SUMMARIZATION: Run environment, component, and behavior summarizers in parallel +3. PHASE 3 - ANALYSIS: Analyze summaries to produce optimization guidance with static signals +4. PHASE 4 - OPTIMIZATION: Apply safe code improvements based on analysis +5. PHASE 5 - CORRECTNESS CHECK: Validate applied changes for correctness and quality +6. PHASE 6 - BENCHMARK (AFTER): Run external benchmark command to capture post-change +7. LOOP: Conditionally loop back to benchmark-before or exit based on iteration count The pipeline uses temporary files for inter-agent communication and manages state flow through a LangGraph workflow with iterative refinement. @@ -23,9 +24,10 @@ from langgraph.graph import END, START, StateGraph from langgraph.graph.state import CompiledStateGraph -from agents import (AnalysisReport, AnalyzerAgent, BenchmarkAgent, - BenchmarkReport, OptimizationReport, OptimizerAgent, - orchestrate_code_correctness) +from agents import (AnalysisReport, AnalyzerAgent, OptimizationReport, + OptimizerAgent, orchestrate_code_correctness) +from tools.benchmark import (compare_benchmark_metrics, render_benchmark_charts, + run_benchmark_command, write_benchmark_report) from .summary_orchestrator import orchestrate_summarizers @@ -81,6 +83,7 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: state_fields: dict[str, Type] = { "code_path": str, "iteration_count": int, + "benchmark_output_dir": str, } # Add state fields from each phase's return fields @@ -93,6 +96,7 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: # Define pipeline phases in execution order _PIPELINE_PHASES = [ + PipelinePhase("benchmark_before", None, return_fields=["benchmark_before"]), PipelinePhase("summarization", None, return_fields=["summary_text"]), PipelinePhase( "analysis", @@ -112,18 +116,28 @@ def build_pipeline_state(phases: list[PipelinePhase]) -> Type[TypedDict]: return_fields=["analysis_result", "correctness_verdict", "correctness_report"], return_class=tuple, # Composite: (AnalysisResult, CorrectnessVerdict) ), - PipelinePhase( - "benchmark", - None, - agent_class=BenchmarkAgent, - return_class=BenchmarkReport, - ), + PipelinePhase("benchmark_after", None, return_fields=["benchmark_after", "benchmark_report"]), ] # Dynamically build PipelineState from phase definitions PipelineState = build_pipeline_state(_PIPELINE_PHASES) +async def benchmark_before_node(state: PipelineState) -> dict: + """PHASE 1: Run baseline benchmark before optimization.""" + logger.info("PHASE 1: Running baseline benchmark (before)") + output_dir = state.get("benchmark_output_dir", "") + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + "label": "before", + "output_dir": output_dir, + } + result = await run_benchmark_command.ainvoke(payload) + logger.info("PHASE 1: Baseline benchmark complete") + return {"benchmark_before": result} + + async def summary_node(state: PipelineState) -> dict: """PHASE 1: Run summarization workflow and combine summaries. 
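For reference, a minimal sketch (not part of the commit) of the result that benchmark_before_node stores in state["benchmark_before"]: it assumes the run_benchmark_command tool from patch 1 and a BENCHMARK_CMD environment variable pointing at a runnable command; the helper name capture_baseline is illustrative.

import json
from tools.benchmark import run_benchmark_command

async def capture_baseline(code_path: str, output_dir: str = "") -> dict:
    # Mirrors benchmark_before_node: run the command from BENCHMARK_CMD in the
    # repository root and keep the raw result for later comparison.
    raw = await run_benchmark_command.ainvoke(
        {
            "command_env": "BENCHMARK_CMD",
            "workdir": code_path,
            "label": "before",
            "output_dir": output_dir,
        }
    )
    payload = json.loads(raw)
    # On success the payload carries: command, workdir, exit_code, timed_out,
    # duration_seconds, stdout, stderr, metrics (parsed wrk summary), plus
    # stdout_path/stderr_path when output_dir is set; on failure it carries an
    # "error" key such as "empty_command" or "workdir_not_found".
    return payload

Keeping the raw JSON string in state (rather than a parsed dict) matches how benchmark_after_node later reloads it with json.loads before calling compare_benchmark_metrics.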
@@ -138,7 +152,7 @@ async def summary_node(state: PipelineState) -> dict: Dictionary with combined summary_text and incremented iteration_count """ iteration = state.get("iteration_count", 0) + 1 - logger.info(f"PHASE 1: Running summarization orchestrator (iteration {iteration}/{MAX_ITERATIONS})") + logger.info(f"PHASE 2: Running summarization orchestrator (iteration {iteration}/{MAX_ITERATIONS})") summaries = await orchestrate_summarizers(state["code_path"]) summary_text = ( @@ -150,7 +164,7 @@ async def summary_node(state: PipelineState) -> dict: f"{summaries.get('behavior_summary', '')}\n" ) - logger.info(f"PHASE 1: Summarization complete ({len(summary_text)} chars)") + logger.info(f"PHASE 2: Summarization complete ({len(summary_text)} chars)") return {"summary_text": summary_text, "iteration_count": iteration} @@ -173,7 +187,7 @@ async def analyze_node(state: PipelineState) -> dict: Returns: Dictionary with analysis_report (JSON string) """ - logger.info("PHASE 2: Running AnalyzerAgent") + logger.info("PHASE 3: Running AnalyzerAgent") agent = AnalyzerAgent() with tempfile.TemporaryDirectory(prefix="analysis_inputs_") as temp_dir: @@ -183,7 +197,7 @@ async def analyze_node(state: PipelineState) -> dict: # Write summary to temporary file summary_path.write_text(state.get("summary_text", ""), encoding="utf-8") - logger.info("PHASE 2: Analyzer input files created") + logger.info("PHASE 3: Analyzer input files created") # Create payload with file paths for the agent to read payload = { @@ -192,7 +206,7 @@ async def analyze_node(state: PipelineState) -> dict: } result = await agent.run(json.dumps(payload)) - logger.info("PHASE 2: Analysis complete") + logger.info("PHASE 3: Analysis complete") return {"analysis_report": result} @@ -218,7 +232,7 @@ async def optimize_node(state: PipelineState) -> dict: Returns: Dictionary with optimization_report (JSON string) """ - logger.info("PHASE 3: Running OptimizerAgent") + logger.info("PHASE 4: Running OptimizerAgent") agent = OptimizerAgent() with tempfile.TemporaryDirectory(prefix="optimizer_inputs_") as temp_dir: @@ -230,7 +244,7 @@ async def optimize_node(state: PipelineState) -> dict: analysis_path.write_text(state.get("analysis_report", ""), encoding="utf-8") summary_path.write_text(state.get("summary_text", ""), encoding="utf-8") - logger.info("PHASE 3: Optimizer input files created") + logger.info("PHASE 4: Optimizer input files created") # Create payload with file paths for the agent to read payload = { @@ -240,40 +254,64 @@ async def optimize_node(state: PipelineState) -> dict: } result = await agent.run(json.dumps(payload)) - logger.info("PHASE 3: Optimization complete") + logger.info("PHASE 4: Optimization complete") return {"optimization_report": result} -async def benchmark_node(state: PipelineState) -> dict: - """PHASE 5: Run benchmark command to measure performance impact. - - Uses BenchmarkAgent to execute an external benchmark command configured - via environment variable (BENCHMARK_CMD by default). The benchmark - runs in the repository root unless overridden. 
- - Args: - state: Pipeline state containing code_path +async def benchmark_after_node(state: PipelineState) -> dict: + """PHASE 6: Run benchmark after optimization and compare with baseline.""" + logger.info("PHASE 6: Running post-optimization benchmark (after)") + output_dir = state.get("benchmark_output_dir", "") + payload = { + "command_env": "BENCHMARK_CMD", + "workdir": state["code_path"], + "label": "after", + "output_dir": output_dir, + } + after_raw = await run_benchmark_command.ainvoke(payload) + logger.info("PHASE 6: Post-optimization benchmark complete") - Returns: - Dictionary with benchmark_report (JSON string) - """ - logger.info("PHASE 5: Running BenchmarkAgent") - agent = BenchmarkAgent() + before_raw = state.get("benchmark_before", "") + try: + before = json.loads(before_raw) if before_raw else {} + except json.JSONDecodeError: + before = {} + try: + after = json.loads(after_raw) if after_raw else {} + except json.JSONDecodeError: + after = {} - payload = { + comparison = compare_benchmark_metrics(before, after) + report = { + "benchmark_name": Path(state["code_path"]).name, "command_env": "BENCHMARK_CMD", "workdir": state["code_path"], + "before": before, + "after": after, + "comparison": comparison, } - result = await agent.run(json.dumps(payload)) - logger.info("PHASE 5: Benchmark complete") + report_paths = {} + charts = {} + if output_dir: + report_paths = write_benchmark_report( + Path(output_dir), before, after, comparison + ) + charts = render_benchmark_charts( + Path(output_dir), comparison, before, after + ) + report["report_paths"] = report_paths + report["charts"] = charts - return {"benchmark_report": result} + return { + "benchmark_after": after_raw, + "benchmark_report": json.dumps(report, indent=2), + } async def correctness_check_node(state: PipelineState) -> dict: - """PHASE 4: Run code correctness workflow. + """PHASE 5: Run code correctness workflow. Orchestrates a two-phase correctness check: - Phase 4a: Detailed analysis of applied code changes @@ -285,7 +323,7 @@ async def correctness_check_node(state: PipelineState) -> dict: Returns: Dictionary with analysis_result and correctness_verdict """ - logger.info("PHASE 4: Running code correctness workflow") + logger.info("PHASE 5: Running code correctness workflow") # Parse the optimization report to extract applied changes try: @@ -319,7 +357,7 @@ async def correctness_check_node(state: PipelineState) -> dict: code_snippet=code_snippet, ) - logger.info("PHASE 4: Code correctness workflow complete") + logger.info("PHASE 5: Code correctness workflow complete") return { "analysis_result": correctness_state.get("analysis_result", ""), @@ -357,15 +395,16 @@ def should_continue_loop(state: PipelineState) -> Literal["summarization", str]: def build_complete_pipeline() -> CompiledStateGraph: - """Build the complete five-phase optimization pipeline with iteration loop. + """Build the complete six-phase optimization pipeline with iteration loop. Constructs a LangGraph StateGraph from phase definitions that orchestrates: 1. Summarization (parallel environment, component, behavior summaries) 2. Analysis (structured guidance based on summaries with static signals) 3. Optimization (apply safe code improvements) - 4. Correctness Check (orchestrates pain analysis and structured verdict) - 5. Benchmark (run external benchmark command) - 6. Loop decision (conditionally loop back to summarization or end) + 4. Optimization (apply safe code improvements) + 5. 
Correctness Check (orchestrates pain analysis and structured verdict) + 6. Benchmark (after) (run external benchmark command) + 7. Loop decision (conditionally loop back to benchmark-before or end) The workflow follows a sequential DAG with conditional loop: START → summarization → analysis → optimization → correctness_check → (loop decision) → {summarization | END} @@ -373,9 +412,10 @@ def build_complete_pipeline() -> CompiledStateGraph: State fields are dynamically generated from phase definitions: - summarization → summary_text - analysis → analysis_report + - benchmark_before → benchmark_before - optimization → optimization_report - correctness_check → analysis_result, correctness_verdict, correctness_report - - benchmark → benchmark_report + - benchmark_after → benchmark_after, benchmark_report Returns: Compiled LangGraph StateGraph ready for invocation @@ -385,11 +425,12 @@ def build_complete_pipeline() -> CompiledStateGraph: # Map phase names to node functions node_functions = { + "benchmark_before": benchmark_before_node, "summarization": summary_node, "analysis": analyze_node, "optimization": optimize_node, "correctness_check": correctness_check_node, - "benchmark": benchmark_node, + "benchmark_after": benchmark_after_node, } # Add nodes from phase definitions @@ -426,7 +467,7 @@ def build_complete_pipeline() -> CompiledStateGraph: return workflow.compile() -def _build_initial_state(code_path: str) -> dict[str, Any]: +def _build_initial_state(code_path: str, benchmark_output_dir: str = "") -> dict[str, Any]: """Build initial state with all dynamic fields initialized. Args: @@ -438,6 +479,7 @@ def _build_initial_state(code_path: str) -> dict[str, Any]: initial_state = { "code_path": code_path, "iteration_count": 0, + "benchmark_output_dir": benchmark_output_dir, } # Initialize all return fields from phases with empty strings @@ -448,7 +490,9 @@ def _build_initial_state(code_path: str) -> dict[str, Any]: return initial_state -async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: +async def orchestrate_complete_pipeline( + code_path: str, benchmark_output_dir: str = "" +) -> PipelineState: """Run the complete optimization pipeline end-to-end. Executes the optimization pipeline with iterative refinement through dynamically @@ -456,9 +500,10 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: 1. Summarization: Generates environment, component, and behavior summaries 2. Analysis: Produces structured optimization guidance with static analysis 3. Optimization: Applies safe code improvements - 4. Correctness Check: Validates applied changes for correctness and quality - 5. Benchmark: Runs external benchmark command if configured - 6. Loop: Conditionally iterates up to MAX_ITERATIONS times + 4. Optimization: Applies safe code improvements + 5. Correctness Check: Validates applied changes for correctness and quality + 6. Benchmark (after): Runs external benchmark command if configured + 7. 
Loop: Conditionally iterates up to MAX_ITERATIONS times State fields are dynamically generated from phase definitions: - Phase outputs are automatically added to state based on agent.return_state_field @@ -474,10 +519,12 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: - summary_text: Combined summaries from final iteration - analysis_report: Structured AnalysisReport JSON from final iteration - optimization_report: Final OptimizationReport JSON from final iteration + - benchmark_before: Baseline benchmark JSON from final iteration - analysis_result: Detailed correctness analysis from final iteration - correctness_verdict: Yes/No verdict from final iteration - correctness_report: Combined correctness report JSON from final iteration - - benchmark_report: BenchmarkReport JSON from final iteration + - benchmark_after: Post-optimization benchmark JSON from final iteration + - benchmark_report: Combined benchmark comparison JSON from final iteration - iteration_count: Number of iterations completed """ logger.info(f"Starting complete optimization pipeline for {code_path}") @@ -486,7 +533,7 @@ async def orchestrate_complete_pipeline(code_path: str) -> PipelineState: workflow = build_complete_pipeline() - initial_state = _build_initial_state(code_path) + initial_state = _build_initial_state(code_path, benchmark_output_dir) final_state: PipelineState = await workflow.ainvoke(initial_state) logger.info("Complete optimization pipeline finished") From 8f2b275e925137040ba7e1025e950cdaf32fe55d Mon Sep 17 00:00:00 2001 From: Huiyun Peng Date: Tue, 20 Jan 2026 19:41:50 -0500 Subject: [PATCH 3/5] cleanup --- tools/benchmark.py | 55 ---------------------------------- workflows/complete_pipeline.py | 18 +++++------ 2 files changed, 9 insertions(+), 64 deletions(-) diff --git a/tools/benchmark.py b/tools/benchmark.py index d741508..89593eb 100644 --- a/tools/benchmark.py +++ b/tools/benchmark.py @@ -386,58 +386,3 @@ async def run_benchmark_command( ) return json.dumps(payload) - -@tool -async def run_benchmark_comparison( - command_env: str = "BENCHMARK_CMD", - workdir: str = "", - timeout_seconds: int = 1800, - output_dir: str = "", - render_charts: bool = True, -) -> str: - """Run benchmark commands before/after and return comparison payload.""" - command_before = os.environ.get(command_env, "") - command_after = os.environ.get(command_env, "") - - before = await asyncio.to_thread( - _run_benchmark_command_impl, - command_before, - command_env, - workdir, - timeout_seconds, - output_dir, - "before", - ) - after = await asyncio.to_thread( - _run_benchmark_command_impl, - command_after, - command_env, - workdir, - timeout_seconds, - output_dir, - "after", - ) - - if "error" in before: - return json.dumps({"error": "before_failed", "before": before}) - if "error" in after: - return json.dumps({"error": "after_failed", "after": after}) - - comparison = compare_benchmark_metrics(before, after) - payload: Dict[str, Any] = { - "before": before, - "after": after, - "comparison": comparison, - } - - if output_dir: - report_paths = write_benchmark_report( - Path(output_dir), before, after, comparison - ) - payload["report_paths"] = report_paths - if render_charts: - payload["charts"] = render_benchmark_charts( - Path(output_dir), comparison, before, after - ) - - return json.dumps(payload) diff --git a/workflows/complete_pipeline.py b/workflows/complete_pipeline.py index 0aae513..6691a65 100644 --- a/workflows/complete_pipeline.py +++ b/workflows/complete_pipeline.py @@ -398,21 +398,21 
@@ def build_complete_pipeline() -> CompiledStateGraph: """Build the complete six-phase optimization pipeline with iteration loop. Constructs a LangGraph StateGraph from phase definitions that orchestrates: - 1. Summarization (parallel environment, component, behavior summaries) - 2. Analysis (structured guidance based on summaries with static signals) - 3. Optimization (apply safe code improvements) + 1. Benchmark (before) (run external benchmark command) + 2. Summarization (parallel environment, component, behavior summaries) + 3. Analysis (structured guidance based on summaries with static signals) 4. Optimization (apply safe code improvements) 5. Correctness Check (orchestrates pain analysis and structured verdict) 6. Benchmark (after) (run external benchmark command) 7. Loop decision (conditionally loop back to benchmark-before or end) The workflow follows a sequential DAG with conditional loop: - START → summarization → analysis → optimization → correctness_check → (loop decision) → {summarization | END} + START → benchmark_before → summarization → analysis → optimization → correctness_check → benchmark_after → (loop decision) → {benchmark_before | END} State fields are dynamically generated from phase definitions: + - benchmark_before → benchmark_before - summarization → summary_text - analysis → analysis_report - - benchmark_before → benchmark_before - optimization → optimization_report - correctness_check → analysis_result, correctness_verdict, correctness_report - benchmark_after → benchmark_after, benchmark_report @@ -497,9 +497,9 @@ async def orchestrate_complete_pipeline( Executes the optimization pipeline with iterative refinement through dynamically configured phases: - 1. Summarization: Generates environment, component, and behavior summaries - 2. Analysis: Produces structured optimization guidance with static analysis - 3. Optimization: Applies safe code improvements + 1. Benchmark (before): Captures baseline benchmark metrics + 2. Summarization: Generates environment, component, and behavior summaries + 3. Analysis: Produces structured optimization guidance with static analysis 4. Optimization: Applies safe code improvements 5. Correctness Check: Validates applied changes for correctness and quality 6. Benchmark (after): Runs external benchmark command if configured @@ -516,10 +516,10 @@ async def orchestrate_complete_pipeline( Returns: Final pipeline state containing dynamically generated fields: - code_path: Input repository path + - benchmark_before: Baseline benchmark JSON from final iteration - summary_text: Combined summaries from final iteration - analysis_report: Structured AnalysisReport JSON from final iteration - optimization_report: Final OptimizationReport JSON from final iteration - - benchmark_before: Baseline benchmark JSON from final iteration - analysis_result: Detailed correctness analysis from final iteration - correctness_verdict: Yes/No verdict from final iteration - correctness_report: Combined correctness report JSON from final iteration From 762c4ffba351b90f8cf71bda7aea951cb5250f38 Mon Sep 17 00:00:00 2001 From: Huiyun Peng Date: Tue, 20 Jan 2026 19:45:38 -0500 Subject: [PATCH 4/5] cleanup --- workflows/complete_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/complete_pipeline.py b/workflows/complete_pipeline.py index 6691a65..146cd1e 100644 --- a/workflows/complete_pipeline.py +++ b/workflows/complete_pipeline.py @@ -407,7 +407,7 @@ def build_complete_pipeline() -> CompiledStateGraph: 7. 
Loop decision (conditionally loop back to benchmark-before or end) The workflow follows a sequential DAG with conditional loop: - START → benchmark_before → summarization → analysis → optimization → correctness_check → benchmark_after → (loop decision) → {benchmark_before | END} + START → benchmark_before → summarization → analysis → optimization → correctness_check → benchmark_after → (loop decision) → {summarization | END} State fields are dynamically generated from phase definitions: - benchmark_before → benchmark_before From b44b5c59112285933f293aa516cee1f395ee1b64 Mon Sep 17 00:00:00 2001 From: Fonio <91513797+Fonio22@users.noreply.github.com> Date: Wed, 21 Jan 2026 02:41:59 -0500 Subject: [PATCH 5/5] feat: Remove Semgrep rule files and enhance benchmark tool with JMeter output parsing and TeaStore auto-detection. --- .semgrep/aspnet-endpoints.yml | 70 ----------- .semgrep/cache-queue-grpc.yml | 228 ---------------------------------- .semgrep/cpp-services.yml | 112 ----------------- .semgrep/csharp-clients.yml | 55 -------- .semgrep/database-clients.yml | 175 -------------------------- .semgrep/entityframework.yml | 49 -------- .semgrep/http-clients.yml | 173 -------------------------- .semgrep/jakarta-ee.yml | 33 ----- tools/benchmark.py | 168 ++++++++++++++++++++++--- 9 files changed, 154 insertions(+), 909 deletions(-) delete mode 100644 .semgrep/aspnet-endpoints.yml delete mode 100644 .semgrep/cache-queue-grpc.yml delete mode 100644 .semgrep/cpp-services.yml delete mode 100644 .semgrep/csharp-clients.yml delete mode 100644 .semgrep/database-clients.yml delete mode 100644 .semgrep/entityframework.yml delete mode 100644 .semgrep/http-clients.yml delete mode 100644 .semgrep/jakarta-ee.yml diff --git a/.semgrep/aspnet-endpoints.yml b/.semgrep/aspnet-endpoints.yml deleted file mode 100644 index 0e823a7..0000000 --- a/.semgrep/aspnet-endpoints.yml +++ /dev/null @@ -1,70 +0,0 @@ -rules: - - id: aspnet-http-get-endpoint - patterns: - - pattern: | - [HttpGet(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP GET endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: GET - - - id: aspnet-http-post-endpoint - patterns: - - pattern: | - [HttpPost(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP POST endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: POST - - - id: aspnet-http-put-endpoint - patterns: - - pattern: | - [HttpPut(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP PUT endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: PUT - - - id: aspnet-http-delete-endpoint - patterns: - - pattern: | - [HttpDelete(...)] - $RETURN $METHOD(...) { ... } - message: "Found HTTP DELETE endpoint: $METHOD" - languages: [csharp] - severity: INFO - metadata: - category: endpoint - http_method: DELETE - - - id: aspnet-route-attribute - patterns: - - pattern: | - [Route($ROUTE)] - class $CONTROLLER : ControllerBase { ... } - message: "Found API controller: $CONTROLLER with route $ROUTE" - languages: [csharp] - severity: INFO - metadata: - category: controller - - - id: aspnet-api-controller - patterns: - - pattern: | - [ApiController] - class $NAME : ControllerBase { ... 
} - message: "Found API controller: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: controller diff --git a/.semgrep/cache-queue-grpc.yml b/.semgrep/cache-queue-grpc.yml deleted file mode 100644 index dc5fe89..0000000 --- a/.semgrep/cache-queue-grpc.yml +++ /dev/null @@ -1,228 +0,0 @@ -rules: - # Redis/Cache Clients - - id: cache-java-jedis - pattern-either: - - pattern: new Jedis(...) - - pattern: new JedisPool(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via Jedis" - languages: [java] - severity: INFO - metadata: - category: cache - library: Jedis - type: redis - - - id: cache-java-lettuce - pattern-either: - - pattern: RedisClient.create(...) - - pattern: $CLIENT.connect() - message: "Redis client usage via Lettuce" - languages: [java] - severity: INFO - metadata: - category: cache - library: Lettuce - type: redis - - - id: cache-csharp-stackexchange - pattern-either: - - pattern: ConnectionMultiplexer.Connect(...) - - pattern: $CONN.GetDatabase(...) - - pattern: $DB.StringGet(...) - - pattern: $DB.StringSet(...) - message: "Redis client usage via StackExchange.Redis" - languages: [csharp] - severity: INFO - metadata: - category: cache - library: StackExchange.Redis - type: redis - - - id: cache-go-redis - pattern-either: - - pattern: redis.NewClient(...) - - pattern: $CLIENT.Get(...) - - pattern: $CLIENT.Set(...) - message: "Redis client usage via go-redis" - languages: [go] - severity: INFO - metadata: - category: cache - library: go-redis - type: redis - - - id: cache-python-redis - pattern-either: - - pattern: redis.Redis(...) - - pattern: redis.StrictRedis(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via redis-py" - languages: [python] - severity: INFO - metadata: - category: cache - library: redis-py - type: redis - - - id: cache-ts-ioredis - pattern-either: - - pattern: new Redis(...) - - pattern: $CLIENT.get(...) - - pattern: $CLIENT.set(...) - message: "Redis client usage via ioredis" - languages: [typescript, javascript] - severity: INFO - metadata: - category: cache - library: ioredis - type: redis - - # Message Queue Clients - Kafka - - id: queue-java-kafka-producer - pattern-either: - - pattern: new KafkaProducer(...) - - pattern: $PRODUCER.send(...) - message: "Kafka producer usage" - languages: [java] - severity: INFO - metadata: - category: queue - library: kafka-clients - type: kafka - direction: producer - - - id: queue-java-kafka-consumer - pattern-either: - - pattern: new KafkaConsumer(...) - - pattern: $CONSUMER.subscribe(...) - - pattern: $CONSUMER.poll(...) - message: "Kafka consumer usage" - languages: [java] - severity: INFO - metadata: - category: queue - library: kafka-clients - type: kafka - direction: consumer - - - id: queue-go-kafka - pattern-either: - - pattern: kafka.NewWriter(...) - - pattern: kafka.NewReader(...) - - pattern: $WRITER.WriteMessages(...) - - pattern: $READER.ReadMessage(...) - message: "Kafka usage via kafka-go" - languages: [go] - severity: INFO - metadata: - category: queue - library: kafka-go - type: kafka - - - id: queue-python-kafka - pattern-either: - - pattern: KafkaProducer(...) - - pattern: KafkaConsumer(...) - - pattern: $PRODUCER.send(...) 
- message: "Kafka usage via kafka-python" - languages: [python] - severity: INFO - metadata: - category: queue - library: kafka-python - type: kafka - - # Message Queue Clients - RabbitMQ - - id: queue-java-rabbitmq - pattern-either: - - pattern: new ConnectionFactory() - - pattern: $FACTORY.newConnection() - - pattern: $CONN.createChannel() - - pattern: $CHANNEL.basicPublish(...) - - pattern: $CHANNEL.basicConsume(...) - message: "RabbitMQ usage via amqp-client" - languages: [java] - severity: INFO - metadata: - category: queue - library: amqp-client - type: rabbitmq - - - id: queue-csharp-rabbitmq - pattern-either: - - pattern: new ConnectionFactory() - - pattern: $FACTORY.CreateConnection() - - pattern: $CONN.CreateModel() - - pattern: $CHANNEL.BasicPublish(...) - - pattern: $CHANNEL.BasicConsume(...) - message: "RabbitMQ usage via RabbitMQ.Client" - languages: [csharp] - severity: INFO - metadata: - category: queue - library: RabbitMQ.Client - type: rabbitmq - - - id: queue-python-pika - pattern-either: - - pattern: pika.BlockingConnection(...) - - pattern: $CONN.channel() - - pattern: $CHANNEL.basic_publish(...) - - pattern: $CHANNEL.basic_consume(...) - message: "RabbitMQ usage via pika" - languages: [python] - severity: INFO - metadata: - category: queue - library: pika - type: rabbitmq - - # gRPC Clients - - id: grpc-java-client - pattern-either: - - pattern: ManagedChannelBuilder.forAddress(...) - - pattern: $STUB.newBlockingStub(...) - - pattern: $STUB.newStub(...) - message: "gRPC client usage" - languages: [java] - severity: INFO - metadata: - category: grpc - library: grpc-java - - - id: grpc-csharp-client - pattern-either: - - pattern: new Channel(...) - - pattern: new GrpcChannel(...) - - pattern: new $SERVICE.$ SERVICEClient(...) - message: "gRPC client usage" - languages: [csharp] - severity: INFO - metadata: - category: grpc - library: Grpc.Net.Client - - - id: grpc-go-client - pattern-either: - - pattern: grpc.Dial(...) - - pattern: grpc.DialContext(...) - message: "gRPC client usage" - languages: [go] - severity: INFO - metadata: - category: grpc - library: grpc-go - - - id: grpc-python-client - pattern-either: - - pattern: grpc.insecure_channel(...) - - pattern: grpc.secure_channel(...) - message: "gRPC client usage" - languages: [python] - severity: INFO - metadata: - category: grpc - library: grpcio diff --git a/.semgrep/cpp-services.yml b/.semgrep/cpp-services.yml deleted file mode 100644 index 4b45e36..0000000 --- a/.semgrep/cpp-services.yml +++ /dev/null @@ -1,112 +0,0 @@ -rules: - # Apache Thrift - - id: cpp-grpc-thrift-client - pattern-either: - - pattern: new $CLIENT(...) - - pattern: apache::thrift::protocol::TProtocol - - pattern: apache::thrift::transport::TSocket - - pattern: ThriftClient<$SERVICE> - - pattern: ClientPool> - - pattern: $POOL->Pop() - message: "Apache Thrift client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: grpc - library: thrift - type: rpc - - - id: cpp-service-thrift-server - pattern-either: - - pattern: new apache::thrift::server::TThreadedServer(...) - - pattern: new apache::thrift::server::TNonblockingServer(...) - - pattern: new apache::thrift::server::TSimpleServer(...) - - pattern: new TThreadedServer(...) - - pattern: new TNonblockingServer(...) - # Stack allocation patterns (common in C++) - - pattern: TThreadedServer $SERVER(...); - - pattern: TNonblockingServer $SERVER(...); - - pattern: TSimpleServer $SERVER(...); - - pattern: apache::thrift::server::TThreadedServer $SERVER(...) 
- # Fallback regex for robust detection - - pattern-regex: (TThreadedServer|TNonblockingServer|TSimpleServer)\s - message: "Apache Thrift server definition" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: service - library: thrift - type: rpc - - # Redis - - id: cpp-cache-redis - pattern-either: - - pattern: RedisClient(...) - - pattern: redisConnect(...) - - pattern: redisCommand(...) - - pattern: cpp_redis::client - - pattern: $CLIENT->zadd(...) - - pattern: $CLIENT->zrevrange(...) - - pattern: $CLIENT->get(...) - - pattern: $CLIENT->set(...) - - pattern: $CLIENT->hget(...) - - pattern: $CLIENT->hset(...) - - pattern: sw::redis::Redis - message: "Redis client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: cache - library: hiredis - type: redis - - # Memcached - - id: cpp-cache-memcached - pattern-either: - - pattern: memcached_create(...) - - pattern: memcached_server_push(...) - - pattern: memcached_set(...) - - pattern: memcached_get(...) - message: "Memcached client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: cache - library: libmemcached - type: memcached - - # MongoDB - - id: cpp-database-mongo - pattern-either: - - pattern: mongocxx::client(...) - - pattern: mongocxx::instance(...) - - pattern: $COLL.find(...) - - pattern: $COLL.insert_one(...) - # C Driver patterns found in socialNetwork - - pattern: mongoc_client_pool_pop(...) - - pattern: mongoc_client_get_collection(...) - - pattern: mongoc_collection_find(...) - - pattern: mongoc_collection_find_with_opts(...) - - pattern: mongoc_collection_find_and_modify(...) - message: "MongoDB client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: database - library: mongoc - type: mongodb - - # AMQP / RabbitMQ - - id: cpp-queue-amqp - pattern-either: - - pattern: AmqpClient::Channel(...) - - pattern: AmqpClient::Channel::Create(...) - - pattern: $CHAN->BasicPublish(...) - - pattern: $CHAN->BasicConsume(...) - message: "AMQP client usage" - languages: [cpp, c, c++] - severity: INFO - metadata: - category: queue - library: SimpleAmqpClient - type: rabbitmq diff --git a/.semgrep/csharp-clients.yml b/.semgrep/csharp-clients.yml deleted file mode 100644 index 6409b32..0000000 --- a/.semgrep/csharp-clients.yml +++ /dev/null @@ -1,55 +0,0 @@ -rules: - - id: csharp-httpclient-usage - patterns: - - pattern-either: - - pattern: | - new HttpClient() - - pattern: | - IHttpClientFactory $FACTORY - - pattern: | - CreateClient($NAME) - message: "Found HttpClient usage" - languages: [csharp] - severity: INFO - metadata: - category: http-client - - - id: csharp-redis-cache - patterns: - - pattern-either: - - pattern: | - IDistributedCache $CACHE - - pattern: | - AddStackExchangeRedisCache(...) 
- - pattern: | - ConnectionMultiplexer.Connect($CONN) - message: "Found Redis cache usage" - languages: [csharp] - severity: INFO - metadata: - category: cache - type: redis - - - id: csharp-rabbitmq-client - patterns: - - pattern-either: - - pattern: | - new ConnectionFactory() - - pattern: | - IModel $CHANNEL - message: "Found RabbitMQ usage" - languages: [csharp] - severity: INFO - metadata: - category: queue - type: rabbitmq - - - id: csharp-grpc-client - patterns: - - pattern: | - $CLIENT.CallInvoker - message: "Found gRPC client usage" - languages: [csharp] - severity: INFO - metadata: - category: grpc diff --git a/.semgrep/database-clients.yml b/.semgrep/database-clients.yml deleted file mode 100644 index 25a807b..0000000 --- a/.semgrep/database-clients.yml +++ /dev/null @@ -1,175 +0,0 @@ -rules: - # Java Database Clients - - id: database-java-jdbc - pattern-either: - - pattern: DriverManager.getConnection(...) - - pattern: DataSource.getConnection(...) - - pattern: $CONN.createStatement() - - pattern: $CONN.prepareStatement(...) - message: "Database connection using JDBC" - languages: [java] - severity: INFO - metadata: - category: database - library: JDBC - - - id: database-java-jpa - pattern-either: - - pattern: | - @Entity - class $CLASS { ... } - - pattern: EntityManager.createQuery(...) - - pattern: $EM.persist(...) - - pattern: $EM.find(...) - message: "Database usage via JPA/Hibernate" - languages: [java] - severity: INFO - metadata: - category: database - library: JPA - - # C# Database Clients - - id: database-csharp-ef - pattern-either: - - pattern: | - class $CLASS : DbContext { ... } - - pattern: new DbContext(...) - - pattern: $CTX.Set<$TYPE>() - - pattern: $CTX.SaveChanges(...) - - pattern: $CTX.SaveChangesAsync(...) - message: "Database usage via Entity Framework" - languages: [csharp] - severity: INFO - metadata: - category: database - library: EntityFramework - - - id: database-csharp-sqlclient - pattern-either: - - pattern: new SqlConnection(...) - - pattern: new SqlCommand(...) - - pattern: $CMD.ExecuteReader() - - pattern: $CMD.ExecuteNonQuery() - message: "Database connection using ADO.NET SqlClient" - languages: [csharp] - severity: INFO - metadata: - category: database - library: SqlClient - - - id: database-csharp-npgsql - pattern-either: - - pattern: new NpgsqlConnection(...) - - pattern: new NpgsqlCommand(...) - message: "PostgreSQL connection using Npgsql" - languages: [csharp] - severity: INFO - metadata: - category: database - library: Npgsql - - # Go Database Clients - - id: database-go-sql - pattern-either: - - pattern: sql.Open(...) - - pattern: $DB.Query(...) - - pattern: $DB.Exec(...) - - pattern: $DB.QueryRow(...) - message: "Database connection using database/sql" - languages: [go] - severity: INFO - metadata: - category: database - library: database/sql - - - id: database-go-gorm - pattern-either: - - pattern: gorm.Open(...) - - pattern: $DB.Create(...) - - pattern: $DB.Find(...) - - pattern: $DB.Where(...) - message: "Database usage via GORM" - languages: [go] - severity: INFO - metadata: - category: database - library: GORM - - # Python Database Clients - - id: database-python-sqlalchemy - pattern-either: - - pattern: create_engine(...) - - pattern: sessionmaker(...) - - pattern: $SESSION.query(...) - - pattern: $SESSION.add(...) 
- - pattern: $SESSION.commit() - message: "Database usage via SQLAlchemy" - languages: [python] - severity: INFO - metadata: - category: database - library: SQLAlchemy - - - id: database-python-psycopg - pattern-either: - - pattern: psycopg2.connect(...) - - pattern: $CONN.cursor() - - pattern: $CURSOR.execute(...) - message: "PostgreSQL connection using psycopg2" - languages: [python] - severity: INFO - metadata: - category: database - library: psycopg2 - - - id: database-python-pymongo - pattern-either: - - pattern: MongoClient(...) - - pattern: $CLIENT[$DB][$COLLECTION] - - pattern: $COLLECTION.find(...) - - pattern: $COLLECTION.insert_one(...) - message: "MongoDB connection using pymongo" - languages: [python] - severity: INFO - metadata: - category: database - library: pymongo - - # TypeScript/JavaScript Database Clients - - id: database-ts-typeorm - pattern-either: - - pattern: createConnection(...) - - pattern: getRepository(...) - - pattern: $REPO.find(...) - - pattern: $REPO.save(...) - message: "Database usage via TypeORM" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: TypeORM - - - id: database-ts-mongoose - pattern-either: - - pattern: mongoose.connect(...) - - pattern: mongoose.model(...) - - pattern: $MODEL.find(...) - - pattern: $MODEL.create(...) - message: "MongoDB connection using Mongoose" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: Mongoose - - - id: database-ts-prisma - pattern-either: - - pattern: new PrismaClient() - - pattern: $PRISMA.$ENTITY.findMany(...) - - pattern: $PRISMA.$ENTITY.create(...) - message: "Database usage via Prisma" - languages: [typescript, javascript] - severity: INFO - metadata: - category: database - library: Prisma diff --git a/.semgrep/entityframework.yml b/.semgrep/entityframework.yml deleted file mode 100644 index b8b5535..0000000 --- a/.semgrep/entityframework.yml +++ /dev/null @@ -1,49 +0,0 @@ -rules: - - id: ef-dbcontext-class - patterns: - - pattern: | - class $NAME : DbContext { ... } - message: "Found Entity Framework DbContext: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: database - orm: entity-framework - - - id: ef-dbset-property - patterns: - - pattern: | - public DbSet<$ENTITY> $PROP { get; set; } - message: "Found Entity Framework entity: $ENTITY" - languages: [csharp] - severity: INFO - metadata: - category: entity - orm: entity-framework - - - id: ef-connection-string - patterns: - - pattern-either: - - pattern: | - UseSqlServer($CONN) - - pattern: | - UseNpgsql($CONN) - - pattern: | - UseMySql($CONN) - - pattern: | - UseSqlite($CONN) - message: "Found database connection configuration" - languages: [csharp] - severity: INFO - metadata: - category: database-config - - - id: ef-migration - patterns: - - pattern: | - public partial class $NAME : Migration { ... } - message: "Found Entity Framework migration: $NAME" - languages: [csharp] - severity: INFO - metadata: - category: migration diff --git a/.semgrep/http-clients.yml b/.semgrep/http-clients.yml deleted file mode 100644 index 9a6e27b..0000000 --- a/.semgrep/http-clients.yml +++ /dev/null @@ -1,173 +0,0 @@ -rules: - # Java HTTP Clients - - id: http-client-java-resttemplate - pattern-either: - - pattern: new RestTemplate() - - pattern: $CLIENT.exchange(...) - - pattern: $CLIENT.getForObject(...) - - pattern: $CLIENT.postForObject(...) - - pattern: $CLIENT.put(...) - - pattern: $CLIENT.delete(...) 
- message: "HTTP client call using Spring RestTemplate" - languages: [java] - severity: INFO - metadata: - category: http-client - library: RestTemplate - - - id: http-client-java-webclient - pattern-either: - - pattern: WebClient.create(...) - - pattern: $CLIENT.get().uri(...) - - pattern: $CLIENT.post().uri(...) - - pattern: $CLIENT.put().uri(...) - - pattern: $CLIENT.delete().uri(...) - message: "HTTP client call using Spring WebClient" - languages: [java] - severity: INFO - metadata: - category: http-client - library: WebClient - - - id: http-client-java-okhttp - pattern-either: - - pattern: new OkHttpClient() - - pattern: $CLIENT.newCall(...) - - pattern: new Request.Builder() - message: "HTTP client call using OkHttp" - languages: [java] - severity: INFO - metadata: - category: http-client - library: OkHttp - - - id: http-client-java-apache - pattern-either: - - pattern: HttpClients.createDefault() - - pattern: $CLIENT.execute(...) - message: "HTTP client call using Apache HttpClient" - languages: [java] - severity: INFO - metadata: - category: http-client - library: ApacheHttpClient - - # C# HTTP Clients - - id: http-client-csharp-httpclient - pattern-either: - - pattern: new HttpClient() - - pattern: $CLIENT.GetAsync(...) - - pattern: $CLIENT.PostAsync(...) - - pattern: $CLIENT.PutAsync(...) - - pattern: $CLIENT.DeleteAsync(...) - - pattern: $CLIENT.SendAsync(...) - message: "HTTP client call using HttpClient" - languages: [csharp] - severity: INFO - metadata: - category: http-client - library: HttpClient - - - id: http-client-csharp-restsharp - pattern-either: - - pattern: new RestClient(...) - - pattern: $CLIENT.Execute(...) - - pattern: $CLIENT.ExecuteAsync(...) - message: "HTTP client call using RestSharp" - languages: [csharp] - severity: INFO - metadata: - category: http-client - library: RestSharp - - # Go HTTP Clients - - id: http-client-go-http - pattern-either: - - pattern: http.Get(...) - - pattern: http.Post(...) - - pattern: http.NewRequest(...) - - pattern: $CLIENT.Do(...) - message: "HTTP client call using net/http" - languages: [go] - severity: INFO - metadata: - category: http-client - library: net/http - - - id: http-client-go-resty - pattern-either: - - pattern: resty.New() - - pattern: $CLIENT.R().Get(...) - - pattern: $CLIENT.R().Post(...) - message: "HTTP client call using Resty" - languages: [go] - severity: INFO - metadata: - category: http-client - library: resty - - # Python HTTP Clients - - id: http-client-python-requests - pattern-either: - - pattern: requests.get(...) - - pattern: requests.post(...) - - pattern: requests.put(...) - - pattern: requests.delete(...) - - pattern: requests.request(...) - message: "HTTP client call using requests" - languages: [python] - severity: INFO - metadata: - category: http-client - library: requests - - - id: http-client-python-httpx - pattern-either: - - pattern: httpx.get(...) - - pattern: httpx.post(...) - - pattern: httpx.Client() - - pattern: httpx.AsyncClient() - message: "HTTP client call using httpx" - languages: [python] - severity: INFO - metadata: - category: http-client - library: httpx - - - id: http-client-python-aiohttp - pattern-either: - - pattern: aiohttp.ClientSession() - - pattern: $SESSION.get(...) - - pattern: $SESSION.post(...) - message: "HTTP client call using aiohttp" - languages: [python] - severity: INFO - metadata: - category: http-client - library: aiohttp - - # TypeScript/JavaScript HTTP Clients - - id: http-client-ts-axios - pattern-either: - - pattern: axios.get(...) 
- - pattern: axios.post(...) - - pattern: axios.put(...) - - pattern: axios.delete(...) - - pattern: axios.request(...) - message: "HTTP client call using axios" - languages: [typescript, javascript] - severity: INFO - metadata: - category: http-client - library: axios - - - id: http-client-ts-fetch - pattern-either: - - pattern: fetch(...) - - pattern: window.fetch(...) - message: "HTTP client call using fetch API" - languages: [typescript, javascript] - severity: INFO - metadata: - category: http-client - library: fetch diff --git a/.semgrep/jakarta-ee.yml b/.semgrep/jakarta-ee.yml deleted file mode 100644 index 17fdadf..0000000 --- a/.semgrep/jakarta-ee.yml +++ /dev/null @@ -1,33 +0,0 @@ -rules: - - id: jakarta-persistence-database - patterns: - - pattern-either: - - pattern: EntityManager $EM = ...; - - pattern: private EntityManager $EM; - - pattern-inside: | - import jakarta.persistence.EntityManager; - ... - message: "Jakarta Persistence EntityManager detected" - languages: [java] - severity: INFO - metadata: - category: database - technology: jakarta-persistence - type: SQL - confidence: HIGH - - - id: jakarta-jaxrs-http-client - patterns: - - pattern-either: - - pattern: WebTarget $T = ...; - - pattern: $CLIENT.target(...) - - pattern-inside: | - import jakarta.ws.rs.client.$X; - ... - message: "Jakarta JAX-RS Client detected" - languages: [java] - severity: INFO - metadata: - category: http_client - technology: jakarta-jaxrs - confidence: MEDIUM diff --git a/tools/benchmark.py b/tools/benchmark.py index 89593eb..6b20171 100644 --- a/tools/benchmark.py +++ b/tools/benchmark.py @@ -125,6 +125,49 @@ def parse_wrk_output(output: str) -> Dict[str, Any]: return metrics +def parse_jmeter_output(output: str) -> Dict[str, Any]: + """Parse JMeter CLI output and extract summary metrics.""" + metrics: Dict[str, Any] = { + "latency_ms": {}, + "percentiles_ms": {}, + "requests_per_sec": 0.0, + "error_rate": 0.0, + } + + # Example line: + # summary = 1809335 in 00:14:41 = 2054.3/s Avg: 4 Min: 0 Max: 15003 Err: 1780589 (98.41%) + # summary + 55747 in 00:00:10 = 5653.3/s Avg: 1 Min: 0 Max: 202 Err: 55747 (100.00%) Active: 10 Started: 10 Finished: 0 + matches = re.findall( + r"summary \s*[+=]\s+(\d+)\s+in\s+([\d:]+)\s+=\s+([\d\.]+)/s\s+Avg:\s+(\d+)\s+Min:\s+(\d+)\s+Max:\s+(\d+)\s+Err:\s+(\d+)\s+\(([\d\.]+)%\)", + output, + ) + if matches: + # Take the last match (most recent summary) + last_match = matches[-1] + metrics["total_requests"] = int(last_match[0]) + duration_str = last_match[1] + # simplistic duration parsing hh:mm:ss + parts = list(map(int, duration_str.split(":"))) + duration_seconds = 0.0 + if len(parts) == 3: + duration_seconds = parts[0] * 3600 + parts[1] * 60 + parts[2] + elif len(parts) == 2: + duration_seconds = parts[0] * 60 + parts[1] + metrics["duration_seconds"] = duration_seconds + + metrics["requests_per_sec"] = float(last_match[2]) + metrics["latency_ms"]["avg"] = float(last_match[3]) + metrics["latency_ms"]["min"] = float(last_match[4]) + metrics["latency_ms"]["max"] = float(last_match[5]) + + err_count = int(last_match[6]) + err_pct = float(last_match[7]) + metrics["error_count"] = err_count + metrics["error_rate"] = err_pct + + return metrics + + def compare_benchmark_metrics(before: Dict[str, Any], after: Dict[str, Any]) -> Dict[str, Any]: """Compare benchmark metrics before vs after.""" def delta(before_val: Optional[float], after_val: Optional[float]) -> Dict[str, Any]: @@ -164,6 +207,8 @@ def delta(before_val: Optional[float], after_val: Optional[float]) -> 
Dict[str, before_metrics.get("percentiles_ms", {}).get("99.000"), after_metrics.get("percentiles_ms", {}).get("99.000"), ), + # metrics specific to JMeter + "error_rate": delta(before_metrics.get("error_rate"), after_metrics.get("error_rate")), } return comparison @@ -212,17 +257,17 @@ def render_benchmark_charts( labels = [ name for name, item in latency_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] before_vals = [ - item["before"] + item.get("before") or 0.0 for name, item in latency_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] after_vals = [ - item["after"] + item.get("after") or 0.0 for name, item in latency_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] if labels: @@ -248,17 +293,17 @@ def render_benchmark_charts( labels = [ name for name, item in throughput_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] before_vals = [ - item["before"] + item.get("before") or 0.0 for name, item in throughput_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] after_vals = [ - item["after"] + item.get("after") or 0.0 for name, item in throughput_items - if item and item["before"] is not None and item["after"] is not None + if item and (item.get("before") is not None or item.get("after") is not None) ] if labels: @@ -288,15 +333,45 @@ def _run_benchmark_command_impl( output_dir: str, label: str, ) -> Dict[str, Any]: + cwd = Path(workdir).resolve() if workdir else Path.cwd().resolve() + + # Auto-detection for default command if not command: - command = os.environ.get(command_env, "") + # Check if we are in or near TeaStore + teastore_dir = cwd / "TeaStore" + teastore_parent = cwd.parent / "TeaStore" + + is_teastore = False + if cwd.name == "TeaStore" or (cwd / "examples" / "jmeter").exists(): + is_teastore = True + elif teastore_dir.exists(): + cwd = teastore_dir + is_teastore = True + elif teastore_parent.exists(): + cwd = teastore_parent + is_teastore = True + + if is_teastore: + # Default TeaStore JMeter command + jmeter_script = "examples/jmeter/teastore_browse_nogui.jmx" + if (cwd / jmeter_script).exists(): + command = f"jmeter -n -t {jmeter_script} -Jhostname localhost -Jport 8080 -JnumUser 10 -JrampUp 1" + else: + # Fallback or error if script not found? 
+ pass + else: + # Default to existing behavior (DeathStarBench env var or empty) + command = os.environ.get(command_env, "") + + if not command: + # Double check if command_env is set even if we didn't suspect TeaStore + command = os.environ.get(command_env, "") use_shell = isinstance(command, str) argv = _normalize_command(command) if not use_shell else [] if (use_shell and not command) or (not use_shell and not argv): return {"error": "empty_command", "command_env": command_env} - - cwd = Path(workdir).resolve() if workdir else Path.cwd().resolve() + if not cwd.exists(): return {"error": "workdir_not_found", "workdir": str(cwd)} @@ -319,6 +394,10 @@ def _run_benchmark_command_impl( timed_out = True stdout_text = exc.stdout or "" stderr_text = exc.stderr or "" + if isinstance(stdout_text, bytes): + stdout_text = stdout_text.decode("utf-8", errors="replace") + if isinstance(stderr_text, bytes): + stderr_text = stderr_text.decode("utf-8", errors="replace") exit_code = None except FileNotFoundError: return {"error": "command_not_found", "command": argv} @@ -327,7 +406,12 @@ def _run_benchmark_command_impl( duration = time.monotonic() - start - metrics = parse_wrk_output(stdout_text) or parse_wrk_output(stderr_text) + # Detect parser + cmd_str = command if use_shell else " ".join(argv) + if "jmeter" in cmd_str: + metrics = parse_jmeter_output(stdout_text) + else: + metrics = parse_wrk_output(stdout_text) or parse_wrk_output(stderr_text) payload: Dict[str, Any] = { "command": command if use_shell else argv, @@ -386,3 +470,59 @@ async def run_benchmark_command( ) return json.dumps(payload) + +@tool +async def run_benchmark_comparison( + command_env: str = "BENCHMARK_CMD", + workdir: str = "", + timeout_seconds: int = 1800, + output_dir: str = "", + render_charts: bool = True, +) -> str: + """Run benchmark commands before/after and return comparison payload.""" + command_before = os.environ.get(command_env, "") + command_after = os.environ.get(command_env, "") + + before = await asyncio.to_thread( + _run_benchmark_command_impl, + command_before, + command_env, + workdir, + timeout_seconds, + output_dir, + "before", + ) + after = await asyncio.to_thread( + _run_benchmark_command_impl, + command_after, + command_env, + workdir, + timeout_seconds, + output_dir, + "after", + ) + + if "error" in before: + return json.dumps({"error": "before_failed", "before": before}) + if "error" in after: + return json.dumps({"error": "after_failed", "after": after}) + + comparison = compare_benchmark_metrics(before, after) + payload: Dict[str, Any] = { + "before": before, + "after": after, + "comparison": comparison, + } + + if output_dir: + report_paths = write_benchmark_report( + Path(output_dir), before, after, comparison + ) + payload["report_paths"] = report_paths + if render_charts: + payload["charts"] = render_benchmark_charts( + Path(output_dir), comparison, before, after + ) + + return json.dumps(payload) +
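
The new parse_jmeter_output helper keys off the "summary =" / "summary +" lines that JMeter prints in non-GUI mode and keeps only the last match as the final aggregate. Below is a minimal sanity-check sketch of that behaviour, assuming the repository root is on PYTHONPATH so tools.benchmark imports cleanly; the sample line is the one quoted in the parser's own comment:

    from tools.benchmark import parse_jmeter_output

    sample = (
        "summary = 1809335 in 00:14:41 = 2054.3/s "
        "Avg: 4 Min: 0 Max: 15003 Err: 1780589 (98.41%)"
    )
    metrics = parse_jmeter_output(sample)

    print(metrics["total_requests"])      # 1809335
    print(metrics["duration_seconds"])    # 881  (0*3600 + 14*60 + 41)
    print(metrics["requests_per_sec"])    # 2054.3
    print(metrics["latency_ms"]["avg"])   # 4.0
    print(metrics["error_rate"])          # 98.41

Unlike the wrk parser, this path leaves percentiles_ms empty, so the percentile entries in the comparison have nothing to report on the JMeter side.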
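
run_benchmark_comparison is registered as a tool and returns its payload as a JSON string, so outside of the BenchmarkAgent the simplest way to exercise it is through the tool's ainvoke interface. A rough driver sketch, assuming the @tool decorator used in tools/benchmark.py is LangChain's (so ainvoke accepts a dict of arguments) and that BENCHMARK_CMD is already exported; the output directory name is only an example:

    import asyncio
    import json

    from tools.benchmark import run_benchmark_comparison

    async def main() -> None:
        # The command named by BENCHMARK_CMD is executed twice, labelled "before" and "after".
        raw = await run_benchmark_comparison.ainvoke({
            "command_env": "BENCHMARK_CMD",
            "workdir": ".",
            "timeout_seconds": 1800,
            "output_dir": "benchmark_artifacts",  # illustrative path
            "render_charts": False,               # skip matplotlib charts for a quick run
        })
        payload = json.loads(raw)
        if "error" in payload:
            print("benchmark run failed:", payload)
            return
        # Each comparison entry carries the before/after values used by the chart helpers.
        print(payload["comparison"]["error_rate"])

    asyncio.run(main())

Note that both labels read the same environment variable, so the two runs execute the identical command back to back.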
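
For reference, the pipeline DAG quoted earlier in this series (START → benchmark_before → summarization → analysis → optimization → correctness_check → benchmark_after → loop decision → {summarization | END}) reduces to a linear chain plus one conditional edge. The sketch below is illustrative only: it assumes a LangGraph-style StateGraph, which the surrounding LangChain tooling suggests but these patches do not confirm, and both the node bodies and the should_loop predicate are placeholders rather than the project's real implementations.

    from typing_extensions import TypedDict

    from langgraph.graph import END, START, StateGraph

    PHASES = ["benchmark_before", "summarization", "analysis",
              "optimization", "correctness_check", "benchmark_after"]

    # Each phase writes a state field of the same name (benchmark_before -> benchmark_before, ...);
    # here the fields are spelled out statically instead of being generated from phase definitions.
    class PipelineState(TypedDict, total=False):
        benchmark_before: str
        summarization: str
        analysis: str
        optimization: str
        correctness_check: str
        benchmark_after: str
        needs_another_pass: bool

    def make_node(name: str):
        def node(state: PipelineState) -> dict:
            return {name: f"{name} completed"}  # placeholder phase body
        return node

    def should_loop(state: PipelineState) -> str:
        # Placeholder loop predicate; the real decision logic lives in the workflow module.
        return "loop" if state.get("needs_another_pass") else "end"

    builder = StateGraph(PipelineState)
    for phase in PHASES:
        builder.add_node(phase, make_node(phase))
    builder.add_edge(START, "benchmark_before")
    for src, dst in zip(PHASES, PHASES[1:]):
        builder.add_edge(src, dst)
    builder.add_conditional_edges("benchmark_after", should_loop,
                                  {"loop": "summarization", "end": END})
    graph = builder.compile()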