From cd959182ed1ea109cdcb970c8749d227c58de6eb Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 14:58:23 -0400 Subject: [PATCH 01/55] Enforce UTF-8 for Goose session files. --- src/metacoder/coders/goose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 514dc2b..f3e378c 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput: session_file = Path(session_file_str) break if session_file and session_file.exists(): - with open(session_file, "r") as f: + with open(session_file, "r", encoding="utf-8") as f: ao.structured_messages = [ json.loads(line) for line in f if line.strip() ] From a791ce5875cc759f475b22443ad01f55e8e43e1b Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 15:41:08 -0400 Subject: [PATCH 02/55] Fixes issue #15. Prevents divide by zero errors and cleans up summaries by using consistent printing methods. --- src/metacoder/metacoder.py | 42 ++++++++++++-------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index f62d3df..3ba1e1b 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -421,9 +421,7 @@ def run( coder_config.ai_model.name = model # Show the model configuration - click.echo( - f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})" - ) + click.echo(f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})") if coder_config and coder_config.extensions: for mcp in coder_config.extensions: @@ -481,16 +479,12 @@ def run( click.echo("\nπŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "βœ…" if tool_use.success else "❌" - click.echo( - f" {success} {tool_use.name} with arguments: {tool_use.arguments}" - ) + click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") if tool_use.error: click.echo(f" Error: {tool_use.error}") if verbose and result.structured_messages: - click.echo( - f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)" - ) + click.echo(f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)") for i, msg in enumerate(result.structured_messages): click.echo(f" {i + 1}. 
{msg}") @@ -592,38 +586,28 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) + frac_passed = summary['passed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + frac_failed = summary['failed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + click.echo("\nπŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") - click.echo( - f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" - ) - click.echo( - f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" - ) - if summary["errors"] > 0: - click.echo(f" Errors: {summary['errors']} ⚠️") + click.echo(f" Passed: {summary['passed']} ({frac_passed:.1%})") + click.echo(f" Failed: {summary['failed']} ({frac_failed:.1%})") + click.echo(f" Errors: {summary['errors']} ⚠️") if summary["errors"] else None # Print by-coder summary if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 - ) - click.echo( - f" {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" - ) + coder_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 + click.echo(f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})") # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 - ) - click.echo( - f" {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" - ) + model_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 + click.echo(f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})") click.echo("\nβœ… Evaluation complete!") From 49891a3e01e1dcd43ba9255e933b28a64c84aa7f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 16:19:57 -0400 Subject: [PATCH 03/55] Cleaned up output by using consistent printing methods. 
---
 src/metacoder/coders/claude.py | 2 +-
 src/metacoder/coders/codex.py  | 2 +-
 src/metacoder/coders/gemini.py | 2 +-
 src/metacoder/coders/goose.py  | 2 +-
 src/metacoder/coders/qwen.py   | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py
index ee31b74..1a43295 100644
--- a/src/metacoder/coders/claude.py
+++ b/src/metacoder/coders/claude.py
@@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
             ao.tool_uses = tool_uses
 
         end_time = time.time()
-        logger.info(f"πŸ€– Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py
index 8e9169e..3451ebe 100644
--- a/src/metacoder/coders/codex.py
+++ b/src/metacoder/coders/codex.py
@@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
             if "result" in message:
                 ao.result_text = message["result"]
         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
         ao.total_cost_usd = total_cost_usd
         ao.success = not is_error
         if not ao.success:
diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py
index 20564a9..6af35c4 100644
--- a/src/metacoder/coders/gemini.py
+++ b/src/metacoder/coders/gemini.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
         )
 
         end_time = time.time()
-        logger.info(f"πŸ’Ž Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ’Ž Command took {end_time - start_time:.2f} seconds")
 
         # Parse the output
         ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py
index f3e378c..6b0b5c0 100644
--- a/src/metacoder/coders/goose.py
+++ b/src/metacoder/coders/goose.py
@@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
         result = self.run_process(command, env)
         end_time = time.time()
         ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
-        logger.info(f"πŸ¦† Command took {end_time - start_time} seconds")
+        logger.info(f"πŸ¦† Command took {end_time - start_time:.2f} seconds")
         # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
         session_file: Optional[Path] = None
         for line in result.stdout.split("\n"):
diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py
index 43aefb6..b6f4080 100644
--- a/src/metacoder/coders/qwen.py
+++ b/src/metacoder/coders/qwen.py
@@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
         )
 
         end_time = time.time()
-        print(f"πŸ€– Command took {end_time - start_time} seconds")
+        print(f"πŸ€– Command took {end_time - start_time:.2f} seconds")
 
         # Create output - Qwen CLI doesn't provide structured output
         ao = CoderOutput(

From 46ad344d476b69aabb2023d1f4ff698d0d380452 Mon Sep 17 00:00:00 2001
From: Charles Parker
Date: Fri, 29 Aug 2025 21:47:25 -0400
Subject: [PATCH 04/55] Fixes Issue #18 by implementing metric downgrades to Claude if OpenAI calls fail, and to DummyMetric if Claude fails.
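
In outline, the evaluation-time fallback added here behaves like the sketch below. This is a minimal illustration only, not the patch itself: the select_metric helper is hypothetical, while make_geval and DummyMetric (src/metacoder/evals/runner.py) and ClaudeJudge (src/metacoder/evals/judges.py) are the names introduced in the diff that follows.

    from deepeval.metrics import BaseMetric

    from metacoder.evals.judges import ClaudeJudge
    from metacoder.evals.runner import DummyMetric, make_geval


    def select_metric(openai_quota_ok: bool) -> BaseMetric:
        """Pick the strongest judge that is currently usable (illustrative only)."""
        if openai_quota_ok:
            # Preferred path: GEval with DeepEval's default OpenAI judge (model=None).
            return make_geval()
        try:
            # First downgrade: keep a real GEval metric, judged by Claude instead.
            return make_geval(model=ClaudeJudge("claude-3-5-sonnet-20240620"))
        except Exception:
            # Last resort: a dummy metric so the eval run can still complete.
            return DummyMetric(threshold=0.5)

Scores from the Claude-judged and dummy paths are not directly comparable to the OpenAI-judged ones; the intent is only to keep eval runs from aborting when a provider is unavailable.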
--- src/metacoder/evals/judges.py | 55 ++++++++++++++++ src/metacoder/evals/runner.py | 120 ++++++++++++++++++++++++++-------- 2 files changed, 148 insertions(+), 27 deletions(-) create mode 100644 src/metacoder/evals/judges.py diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py new file mode 100644 index 0000000..7ca74c6 --- /dev/null +++ b/src/metacoder/evals/judges.py @@ -0,0 +1,55 @@ +# metacoder/evals/judges.py + +import os + +from anthropic import Anthropic +from anthropic.types import MessageParam, TextBlockParam, TextBlock +from deepeval.models.base_model import DeepEvalBaseLLM + +class ClaudeJudge(DeepEvalBaseLLM): + """ + Wraps Anthropic's Claude models so they can be used as + the `model` parameter to DeepEval metrics like GEval. + """ + + def __init__( + self, + model_name: str = "claude-3-5-sonnet-20240620", + max_tokens: int = 1024, + temperature: float = 0.0, + ): + super().__init__() + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") + self.client = Anthropic(api_key = api_key) + self.model_name = model_name + self.max_tokens = max_tokens + self.temperature = temperature + + def load_model(self): + return self + + def generate(self, prompt: str) -> str: + # Build typed content blocks and messages to satisfy the SDK's type hints + content: list[TextBlockParam] = [{"type": "text", "text": prompt}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + resp = self.client.messages.create( + model = self.model_name, + max_tokens = self.max_tokens, + temperature = self.temperature, + messages = messages + ) + # anthropic returns a list of content blocks; collect only the text blocks. + parts: list[str] = [] + for block in resp.content: + if isinstance(block, TextBlock): + parts.append(block.text) + return "".join(parts) + + async def a_generate(self, prompt: str) -> str: + # for now just call the sync path + return self.generate(prompt) + + def get_model_name(self) -> str: + return self.model_name diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 67a9619..e709b8c 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -5,30 +5,33 @@ """ import copy +import functools import importlib import logging +import os import time from pathlib import Path -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, cast from pydantic import BaseModel import yaml + from deepeval import evaluate -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase -from deepeval.metrics import GEval -from deepeval.test_case import LLMTestCaseParams +from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import BaseMetric, GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from openai import APIStatusError +from openai.types.chat import ChatCompletionMessageParam from metacoder.coders.base_coder import BaseCoder, CoderOutput from metacoder.registry import AVAILABLE_CODERS from metacoder.evals.eval_model import EvalCase, EvalDataset from metacoder.configuration import AIModelConfig, CoderConfig - logger = logging.getLogger(__name__) - class DummyMetric(BaseMetric): """A dummy metric that always returns a perfect score for testing.""" @@ -58,27 +61,32 @@ def is_successful(self) -> bool: """Check if the metric passed.""" return self.success +def 
make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: + """Creates a GEval instance with the specified model.""" + return GEval( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + # NOTE: you can only provide either criteria or evaluation_steps, and not both + evaluation_steps = [ + "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + threshold = 0.8, + evaluation_params = [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + model = model # may be None (defaults to OpenAI) or a Claude judge + ) + -def get_default_metrics() -> Dict[str, BaseMetric]: - """Get default metrics. Creates instances lazily to avoid network calls during import.""" +def get_default_metrics(model: Optional[DeepEvalBaseLLM] = None) -> Dict[str, BaseMetric]: + """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": GEval( - name="Correctness", - criteria="Determine whether the actual output is factually correct based on the expected output.", - # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps=[ - "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", - "You should also heavily penalize omission of detail", - "Vague language, or contradicting OPINIONS, are OK", - ], - threshold=0.8, - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - ], - ), - "DummyMetric": DummyMetric(threshold=0.5), + "CorrectnessMetric": make_geval(model = model), # Note: GEval defaults to OpenAI if no model is specified. + "DummyMetric": DummyMetric(threshold = 0.5) } @@ -123,6 +131,8 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + if verbose: logging.basicConfig(level=logging.DEBUG) else: @@ -183,6 +193,40 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: additional_metadata=case.additional_metadata, ) + @functools.lru_cache(maxsize=1) + def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: + if not os.getenv("OPENAI_API_KEY"): + logger.warning("OPENAI_API_KEY is not set.") + return False + """ + Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. + Fast probe of the /chat/completions endpoint (the one GEval uses). + Returns False on 429 (insufficient_quota) or any exception. 
+ """ + try: + from openai import OpenAI + # turn off SDK retries for the check so it returns fast + client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail + # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) + raw = [{"role": "user", "content": "ping"}] + messages = cast(List[ChatCompletionMessageParam], raw) + client.chat.completions.create( + model = model, + messages = messages, + max_tokens = 1, + temperature = 0, + ) + return True + except APIStatusError as e: + # 429 insufficient_quota, or other status codes + if e.status_code == 429: + return False + return False + except Exception as e: + # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. + logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + return False + def run_single_eval( self, model_name: str, @@ -236,7 +280,29 @@ def run_single_eval( # Evaluate logger.info(f"Evaluating with {metric_name}") - eval_results = evaluate([test_case], [metric]) + + if isinstance(metric, GEval): + # Assume GEval will hit OpenAI unless we replace it. + if self.use_openai and not self._openai_quota_ok(): + self.use_openai = False + logger.warning("OpenAI quota exhausted; downgrading to Claude...") + from metacoder.evals.judges import ClaudeJudge + try: + # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). + metric = make_geval(model = ClaudeJudge("claude-3-5-sonnet-20240620")) + except Exception as e: + # Fallback: if you can't use Claude, downgrade gracefully. + logger.warning("Claude unavailable (%s); downgrading to DummyMetric.", e) + metric = DummyMetric(threshold = 0.5) + + eval_results = evaluate( + [test_case], + [metric], + async_config = AsyncConfig(run_async=False), # disable async + display_config = DisplayConfig(show_indicator=False, print_results=False, verbose_mode=self.verbose), # hide the spinner + cache_config = CacheConfig(use_cache=False, write_cache=False), + error_config = ErrorConfig(ignore_errors=False, skip_on_missing_params=True) # actually fail on failure + ) # Extract results - the structure varies by deepeval version test_result = eval_results.test_results[0] From fc7ba41bcda2124e2ab1a6034c3896bd4b937975 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 22:15:47 -0400 Subject: [PATCH 05/55] Satisfied ruff's bizarre rules. 
--- src/metacoder/evals/judges.py | 12 +++++--- src/metacoder/evals/runner.py | 57 +++++++++++++++++++++++------------ src/metacoder/metacoder.py | 40 ++++++++++++++++++------ 3 files changed, 76 insertions(+), 33 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 7ca74c6..dc21724 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -4,8 +4,10 @@ from anthropic import Anthropic from anthropic.types import MessageParam, TextBlockParam, TextBlock + from deepeval.models.base_model import DeepEvalBaseLLM + class ClaudeJudge(DeepEvalBaseLLM): """ Wraps Anthropic's Claude models so they can be used as @@ -22,7 +24,7 @@ def __init__( api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") - self.client = Anthropic(api_key = api_key) + self.client = Anthropic(api_key=api_key) self.model_name = model_name self.max_tokens = max_tokens self.temperature = temperature @@ -35,10 +37,10 @@ def generate(self, prompt: str) -> str: content: list[TextBlockParam] = [{"type": "text", "text": prompt}] messages: list[MessageParam] = [{"role": "user", "content": content}] resp = self.client.messages.create( - model = self.model_name, - max_tokens = self.max_tokens, - temperature = self.temperature, - messages = messages + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=messages, ) # anthropic returns a list of content blocks; collect only the text blocks. parts: list[str] = [] diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index e709b8c..40edd3b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -32,6 +32,7 @@ logger = logging.getLogger(__name__) + class DummyMetric(BaseMetric): """A dummy metric that always returns a perfect score for testing.""" @@ -61,32 +62,37 @@ def is_successful(self) -> bool: """Check if the metric passed.""" return self.success + def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: """Creates a GEval instance with the specified model.""" return GEval( name="Correctness", criteria="Determine whether the actual output is factually correct based on the expected output.", # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps = [ + evaluation_steps=[ "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", "You should also heavily penalize omission of detail", "Vague language, or contradicting OPINIONS, are OK", ], - threshold = 0.8, - evaluation_params = [ + threshold=0.8, + evaluation_params=[ LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT, ], - model = model # may be None (defaults to OpenAI) or a Claude judge + model=model, # may be None (defaults to OpenAI) or a Claude judge ) -def get_default_metrics(model: Optional[DeepEvalBaseLLM] = None) -> Dict[str, BaseMetric]: +def get_default_metrics( + model: Optional[DeepEvalBaseLLM] = None, +) -> Dict[str, BaseMetric]: """Get default metrics with the specified model. Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": make_geval(model = model), # Note: GEval defaults to OpenAI if no model is specified. - "DummyMetric": DummyMetric(threshold = 0.5) + "CorrectnessMetric": make_geval( + model=model # Note: GEval defaults to OpenAI if no model is specified. 
+ ), + "DummyMetric": DummyMetric(threshold=0.5), } @@ -131,7 +137,7 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose - self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. if verbose: logging.basicConfig(level=logging.DEBUG) @@ -205,16 +211,17 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: """ try: from openai import OpenAI + # turn off SDK retries for the check so it returns fast client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) raw = [{"role": "user", "content": "ping"}] messages = cast(List[ChatCompletionMessageParam], raw) client.chat.completions.create( - model = model, - messages = messages, - max_tokens = 1, - temperature = 0, + model=model, + messages=messages, + max_tokens=1, + temperature=0, ) return True except APIStatusError as e: @@ -287,21 +294,33 @@ def run_single_eval( self.use_openai = False logger.warning("OpenAI quota exhausted; downgrading to Claude...") from metacoder.evals.judges import ClaudeJudge + try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). - metric = make_geval(model = ClaudeJudge("claude-3-5-sonnet-20240620")) + metric = make_geval( + model=ClaudeJudge("claude-3-5-sonnet-20240620") + ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. - logger.warning("Claude unavailable (%s); downgrading to DummyMetric.", e) - metric = DummyMetric(threshold = 0.5) + logger.warning( + "Claude unavailable (%s); downgrading to DummyMetric.", e + ) + metric = DummyMetric(threshold=0.5) eval_results = evaluate( [test_case], [metric], - async_config = AsyncConfig(run_async=False), # disable async - display_config = DisplayConfig(show_indicator=False, print_results=False, verbose_mode=self.verbose), # hide the spinner - cache_config = CacheConfig(use_cache=False, write_cache=False), - error_config = ErrorConfig(ignore_errors=False, skip_on_missing_params=True) # actually fail on failure + async_config=AsyncConfig(run_async=False), # disable async + display_config=DisplayConfig( + show_indicator=False, # hide the progress meter + print_results=False, + verbose_mode=self.verbose, + ), + cache_config=CacheConfig(use_cache=False, write_cache=False), + error_config=ErrorConfig( + ignore_errors=False, # actually fail on failure + skip_on_missing_params=True, + ), ) # Extract results - the structure varies by deepeval version diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index 3ba1e1b..28113ec 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -421,7 +421,9 @@ def run( coder_config.ai_model.name = model # Show the model configuration - click.echo(f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})") + click.echo( + f"🧠 AI Model: {coder_config.ai_model.name} (provider: {coder_config.ai_model.provider})" + ) if coder_config and coder_config.extensions: for mcp in coder_config.extensions: @@ -479,12 +481,16 @@ def run( click.echo("\nπŸ“‹ Tool uses:") for tool_use in result.tool_uses: success = "βœ…" if tool_use.success else "❌" - click.echo(f" {success} {tool_use.name} with arguments: {tool_use.arguments}") + click.echo( + f" {success} {tool_use.name} 
with arguments: {tool_use.arguments}" + ) if tool_use.error: click.echo(f" Error: {tool_use.error}") if verbose and result.structured_messages: - click.echo(f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)") + click.echo( + f"\nπŸ“‹ Structured messages ({len(result.structured_messages)} total)" + ) for i, msg in enumerate(result.structured_messages): click.echo(f" {i + 1}. {msg}") @@ -586,8 +592,16 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) - frac_passed = summary['passed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 - frac_failed = summary['failed'] / summary['total_evaluations'] if summary['total_evaluations'] else 0 + frac_passed = ( + summary["passed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 + ) + frac_failed = ( + summary["failed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 + ) click.echo("\nπŸ“ˆ Summary:") click.echo(f" Total: {summary['total_evaluations']}") @@ -599,15 +613,23 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - coder_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 - click.echo(f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})") + coder_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 + ) + click.echo( + f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})" + ) # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - model_frac_passed = stats['passed'] / stats['total'] if stats['total'] else 0 - click.echo(f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})") + model_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 + ) + click.echo( + f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})" + ) click.echo("\nβœ… Evaluation complete!") From 54dd3d3f4faf9207dd4f233496f77a488f5075ad Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 29 Aug 2025 23:38:11 -0400 Subject: [PATCH 06/55] Added extra logging and test for goose UTF-8 handling. --- src/metacoder/evals/runner.py | 2 ++ tests/test_coders/test_coders_basic.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 40edd3b..bc40f38 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -227,7 +227,9 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: except APIStatusError as e: # 429 insufficient_quota, or other status codes if e.status_code == 429: + logger.warning(f"OpenAI API Key has insufficient quota: {e}") return False + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index a9498b6..5d9daf1 100644 --- a/tests/test_coders/test_coders_basic.py +++ b/tests/test_coders/test_coders_basic.py @@ -3,6 +3,7 @@ These tests check that each coder can handle a simple arithmetic question. 
""" +import json import tempfile import pytest @@ -164,3 +165,16 @@ def test_dummy_coder_always_works(): assert result is not None assert result.result_text == "you said: Hello, world!" assert result.stdout == "you said: Hello, world!" + + +@pytest.mark.integration +def test_goose_utf8_session_file(tmp_path): + """Test session files with UTF-8 content are read correctly.""" + session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n' + session_file = tmp_path / "test_session.jsonl" + session_file.write_text(session_content, encoding="utf-8") + + with open(session_file, "r", encoding="utf-8") as f: + messages = [json.loads(line) for line in f if line.strip()] + assert len(messages) == 1 + assert "ζ΅‹θ―•" in messages[0]["content"] From 72f586c5d8c0f8745aafcbae7f5c290d5eb2cf5a Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 10:29:22 -0400 Subject: [PATCH 07/55] Added metacoder configuration test cases for claude downgrade and no server combinations to support Issues #18, #19, and #20. --- .../goose_eval_claude_downgrade_test.yaml | 30 +++++++++++++++++++ tests/input/goose_no_server_test.yaml | 30 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 tests/input/goose_eval_claude_downgrade_test.yaml create mode 100644 tests/input/goose_no_server_test.yaml diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml new file mode 100644 index 0000000..6f0eb31 --- /dev/null +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +server_combinations: + - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml new file mode 100644 index 0000000..2dc5551 --- /dev/null +++ b/tests/input/goose_no_server_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +#server_combinations: +# - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 From d7beb19baa08632b61f08fa31596324338e77541 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:07:02 -0400 Subject: [PATCH 08/55] Added unit test for claude downgrade to support Issue #18. Cleaned up logging in runner.py. Added test configuration to support log capture for assertions that downgrade was successful. 
--- src/metacoder/evals/runner.py | 13 +++++++------ tests/conftest.py | 10 ++++++++++ tests/test_evals/test_claude_judge.py | 26 ++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 6 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/test_evals/test_claude_judge.py diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index bc40f38..f31173e 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -202,7 +202,7 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: @functools.lru_cache(maxsize=1) def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: if not os.getenv("OPENAI_API_KEY"): - logger.warning("OPENAI_API_KEY is not set.") + logger.info("OPENAI_API_KEY is not set.") return False """ Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. @@ -227,13 +227,13 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: except APIStatusError as e: # 429 insufficient_quota, or other status codes if e.status_code == 429: - logger.warning(f"OpenAI API Key has insufficient quota: {e}") + logger.info(f"OpenAI API Key has insufficient quota: {e}") return False - logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") + logger.info(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. - logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + logger.info(f"OpenAI preflight failed; treating as no-quota: {e}") return False def run_single_eval( @@ -294,13 +294,14 @@ def run_single_eval( # Assume GEval will hit OpenAI unless we replace it. if self.use_openai and not self._openai_quota_ok(): self.use_openai = False - logger.warning("OpenAI quota exhausted; downgrading to Claude...") + claude_model = "claude-3-5-sonnet-20240620" + logger.warning(f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}") from metacoder.evals.judges import ClaudeJudge try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). metric = make_geval( - model=ClaudeJudge("claude-3-5-sonnet-20240620") + model=ClaudeJudge(claude_model) ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..95f4c37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import logging +import sys + + +def pytest_configure(config): + logging.basicConfig( + level=logging.WARNING, + format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py new file mode 100644 index 0000000..2ddc2ce --- /dev/null +++ b/tests/test_evals/test_claude_judge.py @@ -0,0 +1,26 @@ +import logging +from pathlib import Path + +from metacoder.evals.runner import EvalRunner + + +def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that ClaudeJudge is used when OpenAI is disabled.""" + + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. 
+ # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + runner = EvalRunner() + + try: + dataset = runner.load_dataset(Path("tests/input/goose_eval_claude_downgrade_test.yaml")) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + assert "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text + + finally: + pass From d88ca905eb356ff589e0a2f17ae82e5d4735376f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:09:41 -0400 Subject: [PATCH 09/55] Added unit test for claude downgrade to support Issue #18. Cleaned up logging in runner.py. Added test configuration to support log capture for assertions that downgrade was successful. Addressed ruff warnings. --- src/metacoder/evals/runner.py | 8 ++++---- tests/test_evals/test_claude_judge.py | 9 +++++++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index f31173e..bf231e9 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -295,14 +295,14 @@ def run_single_eval( if self.use_openai and not self._openai_quota_ok(): self.use_openai = False claude_model = "claude-3-5-sonnet-20240620" - logger.warning(f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}") + logger.warning( + f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}" + ) from metacoder.evals.judges import ClaudeJudge try: # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). - metric = make_geval( - model=ClaudeJudge(claude_model) - ) + metric = make_geval(model=ClaudeJudge(claude_model)) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. logger.warning( diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 2ddc2ce..42047bc 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -14,13 +14,18 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): runner = EvalRunner() try: - dataset = runner.load_dataset(Path("tests/input/goose_eval_claude_downgrade_test.yaml")) + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) - assert "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text + assert ( + "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" + in caplog.text + ) finally: pass From e7bba401faddb0b93316f3e3f8d7c0f8341c7c39 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 12:31:12 -0400 Subject: [PATCH 10/55] Added assertion to confirm that ClaudeJudge completed scoring the metric after the downgrade. 
--- tests/test_evals/test_claude_judge.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 42047bc..8e1fbac 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -3,6 +3,8 @@ from metacoder.evals.runner import EvalRunner +logger = logging.getLogger(__name__) + def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" @@ -19,13 +21,21 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + + # Verfiy that the downgrade happened. assert ( "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" in caplog.text ) + # Verify that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" + ) + finally: pass From d27277b49111a4ea6399ed14ade1cded6cbf1db2 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 13:54:27 -0400 Subject: [PATCH 11/55] Added assertion to force test to fail on Exception. Increased logging verbosity temporarily to debug Claude judge unit test on build server. Adjusted logic to work when multiple coders are specified. Improved log messages. --- src/metacoder/evals/runner.py | 46 +++++++++++++++++++++------ tests/conftest.py | 2 +- tests/test_evals/test_claude_judge.py | 31 +++++++++++++++--- 3 files changed, 63 insertions(+), 16 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index bf231e9..d0ae3c6 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -10,6 +10,7 @@ import logging import os import time +import traceback from pathlib import Path from typing import Any, Dict, List, Optional, Type, cast @@ -225,15 +226,20 @@ def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: ) return True except APIStatusError as e: - # 429 insufficient_quota, or other status codes + # 429 insufficient quota or too many requests if e.status_code == 429: - logger.info(f"OpenAI API Key has insufficient quota: {e}") + logger.warning(f"OpenAI API Key has insufficient quota: {e}") return False - logger.info(f"OpenAI API Status Error; treating as no-quota: {e}") + # 401 authentication problem, including invalid API key + if e.status_code == 401: + logger.warning(f"OpenAI API Authentication Error: {e}") + return False + # all other errors + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") return False except Exception as e: - # includes 401 (bad key), 429 (insufficient_quota), network issues, etc. - logger.info(f"OpenAI preflight failed; treating as no-quota: {e}") + # includes network issues, etc. 
+ logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") return False def run_single_eval( @@ -288,27 +294,47 @@ def run_single_eval( test_case = self.create_test_case(case, actual_output) # Evaluate - logger.info(f"Evaluating with {metric_name}") + logger.info( + f"Evaluating {metric_name} using model {metric.model.model_name}" + ) if isinstance(metric, GEval): - # Assume GEval will hit OpenAI unless we replace it. + # Assume GEval will use OpenAI until is disabled. if self.use_openai and not self._openai_quota_ok(): + logger.warning( + f"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + ) self.use_openai = False + + # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. + if not self.use_openai: + from metacoder.evals.judges import ClaudeJudge + claude_model = "claude-3-5-sonnet-20240620" logger.warning( - f"OpenAI API quota exhausted or server unavailable; downgrading to {claude_model}" + f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." ) - from metacoder.evals.judges import ClaudeJudge try: - # Downgrade to Claude judge in order to keep a real metric (even if not directly comparable to OpenAI). + # Downgrade metric model to Claude judge. metric = make_geval(model=ClaudeJudge(claude_model)) + logger.warning( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. + logging.error(traceback.format_exc()) logger.warning( "Claude unavailable (%s); downgrading to DummyMetric.", e ) metric = DummyMetric(threshold=0.5) + logger.warning( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) + + logger.warning( + f"Actual {metric_name} model used: {metric.model.model_name}" + ) eval_results = evaluate( [test_case], diff --git a/tests/conftest.py b/tests/conftest.py index 95f4c37..2416094 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ def pytest_configure(config): logging.basicConfig( - level=logging.WARNING, + level=logging.DEBUG, format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", stream=sys.stdout, ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 8e1fbac..9b798df 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -1,4 +1,6 @@ import logging +import os +import traceback from pathlib import Path from metacoder.evals.runner import EvalRunner @@ -9,9 +11,9 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" - # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. - # (no need to reset, `monkeypatch` automatically reverts after the test) - monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + # # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # # (no need to reset, `monkeypatch` automatically reverts after the test) + # monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") runner = EvalRunner() @@ -24,11 +26,25 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. 
with caplog.at_level(logging.WARNING): - results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["goose"]) + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + # Save the original OPENAI_API_KEY if it exists + # original_api_key = os.getenv("OPENAI_API_KEY") + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + results = runner.run_all_evals( + dataset, workdir=tmp_path, coders=["goose", "dummy"] + ) + + # # Revert the OPENAI_API_KEY to its original value + # if original_api_key is not None: + # monkeypatch.setenv("OPENAI_API_KEY", original_api_key) + # else: + # monkeypatch.delenv("OPENAI_API_KEY", raising=False) # Verfiy that the downgrade happened. assert ( - "OpenAI API quota exhausted or server unavailable; downgrading to claude-3-5-sonnet-20240620" + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." in caplog.text ) @@ -37,5 +53,10 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" ) + except Exception as e: + logger.error(f"An error occurred: {e}") + # traceback.print_exc() + logging.error(traceback.format_exc()) + assert False # force test to fail if an exception is caught here finally: pass From 3f22fc6cfecb87782130b4e83d86664d8c3e67cb Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 14:08:12 -0400 Subject: [PATCH 12/55] Fixed runtime issues related to metric downgrade from CorrectnessMetric to DummyMetric. --- src/metacoder/evals/judges.py | 2 +- src/metacoder/evals/runner.py | 11 +++++------ tests/test_evals/test_claude_judge.py | 14 -------------- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index dc21724..4cff6c3 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -23,7 +23,7 @@ def __init__( super().__init__() api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: - raise RuntimeError("ANTHROPIC_API_KEY is not set in environment.") + raise Exception("ANTHROPIC_API_KEY is not set in environment") self.client = Anthropic(api_key=api_key) self.model_name = model_name self.max_tokens = max_tokens diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index d0ae3c6..10b1b09 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -302,7 +302,7 @@ def run_single_eval( # Assume GEval will use OpenAI until is disabled. if self.use_openai and not self._openai_quota_ok(): logger.warning( - f"OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." ) self.use_openai = False @@ -325,16 +325,15 @@ def run_single_eval( # Fallback: if you can't use Claude, downgrade gracefully. logging.error(traceback.format_exc()) logger.warning( - "Claude unavailable (%s); downgrading to DummyMetric.", e + "Claude unavailable (%s); downgrading {metric_name} to DummyMetric.", + e, ) metric = DummyMetric(threshold=0.5) logger.warning( - f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + f"Successfully downgraded {metric_name} to {metric.name}." 
) - logger.warning( - f"Actual {metric_name} model used: {metric.model.model_name}" - ) + logger.warning(f"Actual metric used: {metric.name}.") eval_results = evaluate( [test_case], diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 9b798df..0c14861 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -1,5 +1,4 @@ import logging -import os import traceback from pathlib import Path @@ -11,10 +10,6 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" - # # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. - # # (no need to reset, `monkeypatch` automatically reverts after the test) - # monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") - runner = EvalRunner() try: @@ -28,20 +23,12 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. # (no need to reset, `monkeypatch` automatically reverts after the test) - # Save the original OPENAI_API_KEY if it exists - # original_api_key = os.getenv("OPENAI_API_KEY") monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") results = runner.run_all_evals( dataset, workdir=tmp_path, coders=["goose", "dummy"] ) - # # Revert the OPENAI_API_KEY to its original value - # if original_api_key is not None: - # monkeypatch.setenv("OPENAI_API_KEY", original_api_key) - # else: - # monkeypatch.delenv("OPENAI_API_KEY", raising=False) - # Verfiy that the downgrade happened. assert ( "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." @@ -55,7 +42,6 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): except Exception as e: logger.error(f"An error occurred: {e}") - # traceback.print_exc() logging.error(traceback.format_exc()) assert False # force test to fail if an exception is caught here finally: From d6e1e448d1cea5b0c3d0dae6bbeec890536bca59 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Sat, 30 Aug 2025 14:20:02 -0400 Subject: [PATCH 13/55] Added test coverage of new evaluation judge functionality. Added test for the quota exhaustion fallback logic. --- tests/test_evals/test_claude_judge.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 0c14861..70bfa07 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -29,20 +29,27 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): dataset, workdir=tmp_path, coders=["goose", "dummy"] ) - # Verfiy that the downgrade happened. + # Test that the quota exhaustion fallback logic worked as expected. assert ( "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." in caplog.text ) - # Verify that the eval completed by checking for a non-zero score. + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the eval completed by checking for a non-zero score. 
assert results[0].score > 0, ( - f"Expected ClaudeJudge to score {results[0].metric_name} for {results[0].case_name}" + f"Expected a {results[0].metric_name} score for {results[0].case_name}." ) except Exception as e: + # Test that fallback logic does not result in an Exception. logger.error(f"An error occurred: {e}") logging.error(traceback.format_exc()) - assert False # force test to fail if an exception is caught here + assert False # This assertion will fail if an Exception is caught here. finally: pass From 882a3d91379835b9e213dbfb7d5744ecbfd2c91b Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 12:51:41 -0400 Subject: [PATCH 14/55] Reduced logging verbosity. Added Anthropic quota check. Added automatic downgrade to DummyMetric on quota check failure. Added notes on potential improvements to unit tests. --- src/metacoder/evals/judges.py | 30 +++++++++- src/metacoder/evals/runner.py | 27 +++++---- tests/conftest.py | 2 +- tests/test_evals/test_claude_judge.py | 84 +++++++++++++++++++++++++++ 4 files changed, 129 insertions(+), 14 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 4cff6c3..a8b8d48 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -1,5 +1,5 @@ # metacoder/evals/judges.py - +import logging import os from anthropic import Anthropic @@ -7,6 +7,8 @@ from deepeval.models.base_model import DeepEvalBaseLLM +logger = logging.getLogger(__name__) + class ClaudeJudge(DeepEvalBaseLLM): """ @@ -55,3 +57,29 @@ async def a_generate(self, prompt: str) -> str: def get_model_name(self) -> str: return self.model_name + + def has_available_quota(self) -> bool: + """ + Try a very lightweight request to check if quota is available. + Returns True if quota exists, False if Anthropic responds with + quota-related errors. + """ + try: + # Use a minimal "ping" request + content: list[TextBlockParam] = [{"type": "text", "text": "ping"}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + self.client.messages.create( + model=self.model_name, + max_tokens=1, # cheapest possible + temperature=0.0, + messages=messages, + ) + return True + except Exception as e: + msg = str(e).lower() + # Check for insufficient quota: + # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits. + if "credit balance is too low" in msg or "400" in msg: + logger.warning(f"ClaudeJudge quota check failed: {e}") + return False + raise diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 10b1b09..0a8dc42 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -308,8 +308,6 @@ def run_single_eval( # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. if not self.use_openai: - from metacoder.evals.judges import ClaudeJudge - claude_model = "claude-3-5-sonnet-20240620" logger.warning( f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." @@ -317,23 +315,28 @@ def run_single_eval( try: # Downgrade metric model to Claude judge. - metric = make_geval(model=ClaudeJudge(claude_model)) - logger.warning( + from metacoder.evals.judges import ClaudeJudge + + judge = ClaudeJudge(claude_model) + + if not judge.has_available_quota(): + raise Exception( + "No Anthropic credits available for ClaudeJudge." 
+ ) + + metric = make_geval(model=judge) + logger.info( f"Successfully downgraded {metric_name} model to {metric.model.model_name}." ) except Exception as e: # Fallback: if you can't use Claude, downgrade gracefully. - logging.error(traceback.format_exc()) + logging.debug(traceback.format_exc()) + logger.debug(e) logger.warning( - "Claude unavailable (%s); downgrading {metric_name} to DummyMetric.", - e, + f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric." ) metric = DummyMetric(threshold=0.5) - logger.warning( - f"Successfully downgraded {metric_name} to {metric.name}." - ) - - logger.warning(f"Actual metric used: {metric.name}.") + logger.warning(f"Downgraded {metric_name} to {metric.name}.") eval_results = evaluate( [test_case], diff --git a/tests/conftest.py b/tests/conftest.py index 2416094..95f4c37 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ def pytest_configure(config): logging.basicConfig( - level=logging.DEBUG, + level=logging.WARNING, format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", stream=sys.stdout, ) diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 70bfa07..9ed23a6 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -9,6 +9,8 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): """Test that ClaudeJudge is used when OpenAI is disabled.""" + # TODO: This test should avoid running the coder and only perform the eval step. + # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process). runner = EvalRunner() @@ -53,3 +55,85 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): assert False # This assertion will fail if an Exception is caught here. finally: pass + + +def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that the CorrectnessMatric is successfully downgraded to DummyMetric if no model is available.""" + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + # Delete the Anthropic API Key from the environment to force ClaudeJudge instantiation to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False) + + # One more OpenAI API test case also needs to be handled (401 errors): + # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail. + # monkeypatch.delenv("OPENAI_API_KEY", raising=False) + + # One more Anthropic API test case also needs to be handled (401 errors): + # Temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail. 
+ # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing") + + # TODO: Also need to test this for Anthropic: + # Provider + # request + # failed + # with status: 400 + # Bad + # Request.Payload: Some(Object + # {"error": Object {"message": String("Your credit balance is too low + # to access the Anthropic API.Please go to Plans & Billing to upgrade or purchase + # credits."), "type": String("invalid_request_error")}, "request_id": String(" + # req_011CSeQZTjJvmcxzrhXuPES4"), "type": Strin + # g("error")}).Returning + # error: RequestFailed( + # "Request failed with status: 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits." + + results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"]) + + # Test that the quota exhaustion fallback logic worked as expected. + assert ( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + in caplog.text + ) + + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric. + assert ( + "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric." + in caplog.text + ) + + # Test that the CorrectnessMetric was successfully downgraded to DummyMetric. + assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text + + # Test that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected a {results[0].metric_name} score for {results[0].case_name}." + ) + + except Exception as e: + # Test that fallback logic does not result in an Exception. + logger.error(f"An error occurred: {e}") + logging.error(traceback.format_exc()) + assert False # This assertion will fail if an Exception is caught here. + finally: + pass From c98c9d7b2645838c404185f8d4baf97fd27f4269 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 18:32:26 -0400 Subject: [PATCH 15/55] Fixed issue #23. Forced processes to be launched with UTF-8 encoding to avoid default encoding errors. --- src/metacoder/coders/base_coder.py | 14 ++++++++- tests/input/goose_eval_test.yaml | 9 ++++-- tests/input/literature_mcp_encoding_test.yaml | 29 +++++++++++++++++++ 3 files changed, 48 insertions(+), 4 deletions(-) create mode 100644 tests/input/literature_mcp_encoding_test.yaml diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index b44c6ec..f19d146 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -173,11 +173,15 @@ def run_process( """ if env is None: env = self.expand_env(self.env) + + # Decode the child process output as UTF-8 (instead of default encoding) process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", # avoid crashes on the occasional bad byte env=env, bufsize=1, universal_newlines=True, @@ -189,7 +193,15 @@ def run_process( # check verbosity level quiet_mode = logger.getEffectiveLevel() <= logging.INFO - def stream_output(pipe, output_lines, stream): + # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do). 
+ for s in (sys.stdout, sys.stderr): + try: + s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+ + except Exception as e: + logger.info(f"{e}") + pass # OK if not available (e.g., redirected or older Python) + + def stream_output(pipe, output_lines, stream): # lines are already str decoded as UTF-8 for line in iter(pipe.readline, ""): if not quiet_mode: print(line.rstrip(), file=stream) diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index 1037215..f41e249 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -7,7 +7,7 @@ coders: goose: {} models: - gpt-4o: + claude-sonnet: provider: anthropic name: claude-sonnet-4-20250514 @@ -34,6 +34,9 @@ cases: MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - + - name: character_encoding_test + metrics: [CorrectnessMetric] + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml new file mode 100644 index 0000000..d0fea1b --- /dev/null +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -0,0 +1,29 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + ols: + name: ols + command: uvx + args: [mcp-ols] + +server_combinations: + - [simple-pubmed] + +cases: +- name: character_encoding_test + metrics: + - CorrectnessMetric + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 From 4761d1977b7763b840257cb863aea307ebc2a8ed Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 18:36:29 -0400 Subject: [PATCH 16/55] Addressed ruff formatting issue. --- src/metacoder/coders/base_coder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index f19d146..f2fa7d9 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -201,7 +201,8 @@ def run_process( logger.info(f"{e}") pass # OK if not available (e.g., redirected or older Python) - def stream_output(pipe, output_lines, stream): # lines are already str decoded as UTF-8 + # lines are already str decoded as UTF-8 + def stream_output(pipe, output_lines, stream): for line in iter(pipe.readline, ""): if not quiet_mode: print(line.rstrip(), file=stream) From 6b64a794593899d643c553200cf35490c9d98dda Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 19:12:44 -0400 Subject: [PATCH 17/55] Added output file check to fail if the output file already exists. Otherwise, create an empty file as UTF-8. Partially addresses Issue #24. 
--- src/metacoder/metacoder.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index 28113ec..5e1d616 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional, Union @@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: output_path = Path(output) workdir_path = Path(workdir) + try: + # Create the output file only if it doesn't exist; fail if it does + with output_path.open("x", encoding="utf-8") as _: + pass + except FileExistsError: + print( + f"Error: '{output_path}' already exists. Please delete it or specify a different filename.", + file=sys.stderr, + ) + sys.exit(1) + # Convert coders tuple to list (empty tuple if not specified) coders_list = list(coders) if coders else None From c436e7fe4698df2c8e3d3e04fa5af8224ade0f9a Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Tue, 2 Sep 2025 19:29:38 -0400 Subject: [PATCH 18/55] Modified save_results to append to existing output file rather than overwrite. Enforced UTF-8 encoding, switched to safe_dump and added document delimiter between records. Also simplified document generation. Fixes issue #24. Added second test case to literature_mcp_encoding_test.yaml for testing. --- src/metacoder/evals/runner.py | 19 +++++++++++-------- tests/input/literature_mcp_encoding_test.yaml | 8 ++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 0a8dc42..21e2671 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -518,18 +518,21 @@ def run_all_evals( def save_results(self, results: List[EvalResult], output_path: Path): """Save evaluation results to file.""" - # Convert to list of dicts - results_data = [] - for result in results: - results_data.append(result.model_dump()) + # output_path.parent.mkdir(parents=True, exist_ok=True) # Not sure if the folder should be created here + data = { + "results": [r.model_dump() for r in results], + "summary": self.generate_summary(results), + } - # Save as YAML - with open(output_path, "w") as f: - yaml.dump( - {"results": results_data, "summary": self.generate_summary(results)}, + # Append a new YAML document to the output file. + with open(output_path, "a", encoding="utf-8", newline="") as f: + yaml.safe_dump( + data, f, + explicit_start=True, # writes '---' to mark a new document default_flow_style=False, sort_keys=False, + allow_unicode=True, ) def generate_summary(self, results: List[EvalResult]) -> Dict[str, Any]: diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml index d0fea1b..f7b5b75 100644 --- a/tests/input/literature_mcp_encoding_test.yaml +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -27,3 +27,11 @@ cases: input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? expected_output: 'The paper says No but it is retracted so the results should not be trusted.' threshold: 0.9 +- name: "disease" + metrics: [CorrectnessMetric] + input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? 
Give me disease names and MONDO IDs" + expected_output: | + MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) + MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) + MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) + threshold: 0.7 From b0b1c8b0b8833ee02cc9ac7c12ce7943e7b24516 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Wed, 3 Sep 2025 14:28:44 -0400 Subject: [PATCH 19/55] Updated ClaudeJudge model to claude-sonnet-4-20250514. --- src/metacoder/evals/judges.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index a8b8d48..24b4277 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -16,9 +16,17 @@ class ClaudeJudge(DeepEvalBaseLLM): the `model` parameter to DeepEval metrics like GEval. """ + # Note: Anthropic models can be listed via: + # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" + # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]} + # Current list (September 3, 2025): + # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, + # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, + # claude-3-opus-20240229 + def __init__( self, - model_name: str = "claude-3-5-sonnet-20240620", + model_name: str = "claude-sonnet-4-20250514", max_tokens: int = 1024, temperature: float = 0.0, ): From a7e71e3f4e2d177cccefb53788a94eddefb9b34d Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Wed, 3 Sep 2025 17:02:19 -0400 Subject: [PATCH 20/55] Revert "Modified save_results to append to existing output file rather than overwrite. Enforced UTF-8 encoding, switched to safe_dump and added document delimiter between records. Also simplified document generation. Fixes issue #24. Added second test case to literature_mcp_encoding_test.yaml for testing." This reverts commit c436e7fe4698df2c8e3d3e04fa5af8224ade0f9a. --- src/metacoder/evals/runner.py | 19 ++++++++----------- tests/input/literature_mcp_encoding_test.yaml | 8 -------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 21e2671..0a8dc42 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -518,21 +518,18 @@ def run_all_evals( def save_results(self, results: List[EvalResult], output_path: Path): """Save evaluation results to file.""" - # output_path.parent.mkdir(parents=True, exist_ok=True) # Not sure if the folder should be created here - data = { - "results": [r.model_dump() for r in results], - "summary": self.generate_summary(results), - } + # Convert to list of dicts + results_data = [] + for result in results: + results_data.append(result.model_dump()) - # Append a new YAML document to the output file. 
- with open(output_path, "a", encoding="utf-8", newline="") as f: - yaml.safe_dump( - data, + # Save as YAML + with open(output_path, "w") as f: + yaml.dump( + {"results": results_data, "summary": self.generate_summary(results)}, f, - explicit_start=True, # writes '---' to mark a new document default_flow_style=False, sort_keys=False, - allow_unicode=True, ) def generate_summary(self, results: List[EvalResult]) -> Dict[str, Any]: diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml index f7b5b75..d0fea1b 100644 --- a/tests/input/literature_mcp_encoding_test.yaml +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -27,11 +27,3 @@ cases: input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? expected_output: 'The paper says No but it is retracted so the results should not be trusted.' threshold: 0.9 -- name: "disease" - metrics: [CorrectnessMetric] - input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs" - expected_output: | - MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) - MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) - MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 From 7e143da3479909ed45f766cafbfb2190b9d3b4ab Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 13:31:22 -0400 Subject: [PATCH 21/55] Added UTF-8 encoding to prevent character mangling during YAML export on Windows (where the default codepage is cp1252). --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 0a8dc42..f1c7126 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -524,7 +524,7 @@ def save_results(self, results: List[EvalResult], output_path: Path): results_data.append(result.model_dump()) # Save as YAML - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: yaml.dump( {"results": results_data, "summary": self.generate_summary(results)}, f, From 37cbb2f38ecbc672cda8c765d4ed0129ba7e839f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:02:15 -0400 Subject: [PATCH 22/55] Added support for grouping test case eval results with 'group' key in config. Fixes Issue #27. --- src/metacoder/evals/eval_model.py | 3 ++ src/metacoder/evals/runner.py | 2 + .../goose_eval_claude_downgrade_test.yaml | 2 +- tests/input/goose_eval_test.yaml | 45 ++++++++++++------- tests/input/goose_no_server_test.yaml | 2 +- 5 files changed, 36 insertions(+), 18 deletions(-) diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py index d7dab3e..471c13d 100644 --- a/src/metacoder/evals/eval_model.py +++ b/src/metacoder/evals/eval_model.py @@ -21,6 +21,9 @@ class EvalCase(BaseModel): """ name: str = Field(..., description="Unique identifier for the test case") + group: Optional[str] = Field( + default="Default", description="Test category for result grouping." 
+ ) metrics: List[str] = Field( ..., description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)", diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index f1c7126..95c74bf 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -121,6 +121,7 @@ class EvalResult(BaseModel): model: str coder: str case_name: str + case_group: str metric_name: str score: float passed: bool @@ -386,6 +387,7 @@ def run_single_eval( model=model_name, coder=coder_name, case_name=case.name, + case_group=case.group, metric_name=metric_name, score=score if score is not None else 0.0, passed=passed, diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml index 6f0eb31..7d305ce 100644 --- a/tests/input/goose_eval_claude_downgrade_test.yaml +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -16,7 +16,7 @@ servers: command: uvx args: [mcp-simple-pubmed] env: - PUBMED_EMAIL: ctparker@lbl.gov + PUBMED_EMAIL: cjmungall@lbl.gov server_combinations: - [mcp-simple-pubmed] diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index f41e249..73b0615 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -11,32 +11,45 @@ models: provider: anthropic name: claude-sonnet-4-20250514 +# Refer to metacoder/src/mcps/registry/scilit.yaml for the list of available MCPs. servers: - mcp-simple-pubmed: + artl: + name: artl + command: uvx + args: [artl-mcp] + simple-pubmed: name: pubmed command: uvx args: [mcp-simple-pubmed] env: PUBMED_EMAIL: cjmungall@lbl.gov - ols-mcp: + ols: name: ols command: uvx args: [mcp-ols] server_combinations: - - [mcp-simple-pubmed, ols-mcp] +# - [artl, simple-pubmed, ols] + - [artl] + - [simple-pubmed] +# - [ols] cases: - - name: "disease" - metrics: [CorrectnessMetric] - input: "According to PMID:35743164, What 3 diseases are associated with ITPR1 mutations? Give me disease names and MONDO IDs" - expected_output: | - MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) - MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) - MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - - name: character_encoding_test - metrics: [CorrectnessMetric] - input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? - expected_output: 'The paper says No but it is retracted so the results should not be trusted.' - threshold: 0.9 +- name: PMID_28027860_Full_Text +# group: Text extraction # should default to "Default" + metrics: + - CorrectnessMetric + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: "Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 + +# Per convo with Charles, Justin, Mark: this test case is kind of tricky and it seems +# like an extremely difficult case that even a good LLM + MCP might not pass. We've +# made some edits to give the LLM + MCP a fair chance +- name: PMC8086273_Retraction + group: Summarization + metrics: + - CorrectnessMetric + input: "Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses?" + expected_output: "The paper says No but it is retracted so the results should not be trusted." 
+ threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml index 2dc5551..a027f80 100644 --- a/tests/input/goose_no_server_test.yaml +++ b/tests/input/goose_no_server_test.yaml @@ -16,7 +16,7 @@ servers: command: uvx args: [mcp-simple-pubmed] env: - PUBMED_EMAIL: ctparker@lbl.gov + PUBMED_EMAIL: cjmungall@lbl.gov #server_combinations: # - [mcp-simple-pubmed] From bdec2e3667f45dbf5c3ac3443564fab6ec73669c Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:13:35 -0400 Subject: [PATCH 23/55] Updated test_runner.py to include Default case_group in EvalResults to address validation errors in test suite. --- tests/test_evals/test_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_evals/test_runner.py b/tests/test_evals/test_runner.py index d1f0c3e..838ab60 100644 --- a/tests/test_evals/test_runner.py +++ b/tests/test_evals/test_runner.py @@ -174,6 +174,7 @@ def test_generate_summary(self): model="model1", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.9, passed=True, @@ -182,6 +183,7 @@ def test_generate_summary(self): model="model1", coder="coder1", case_name="case2", + case_group="Default", metric_name="metric1", score=0.3, passed=False, @@ -190,6 +192,7 @@ def test_generate_summary(self): model="model2", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.8, passed=True, @@ -225,6 +228,7 @@ def test_save_and_load_results(self, tmp_path): model="model1", coder="coder1", case_name="case1", + case_group="Default", metric_name="metric1", score=0.9, passed=True, From 93860972bf83ff4e716ddcacfa91d1d7c90c7cd4 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:18:35 -0400 Subject: [PATCH 24/55] Updated Anthropic fallback mode from claude-3-5-sonnet-20240620 to claude-sonnet-4-20250514. --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 95c74bf..45f5a2b 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -309,7 +309,7 @@ def run_single_eval( # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. if not self.use_openai: - claude_model = "claude-3-5-sonnet-20240620" + claude_model = "claude-sonnet-4-20250514" logger.warning( f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." ) From 0d855bc03e47e50948ab896e21f93481641eb8a4 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 4 Sep 2025 14:23:27 -0400 Subject: [PATCH 25/55] Corrected test cases to match the expected Anthropic model. --- src/metacoder/evals/judges.py | 4 ---- tests/test_evals/test_claude_judge.py | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py index 24b4277..cc20e32 100644 --- a/src/metacoder/evals/judges.py +++ b/src/metacoder/evals/judges.py @@ -19,10 +19,6 @@ class ClaudeJudge(DeepEvalBaseLLM): # Note: Anthropic models can be listed via: # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... 
]} - # Current list (September 3, 2025): - # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, - # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, - # claude-3-opus-20240229 def __init__( self, diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py index 9ed23a6..f33f2a5 100644 --- a/tests/test_evals/test_claude_judge.py +++ b/tests/test_evals/test_claude_judge.py @@ -20,7 +20,7 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. - # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. @@ -39,7 +39,7 @@ def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): # Test that the new evaluation judge was correctly selected for the metric model downgrade. assert ( - "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-" in caplog.text ) @@ -68,7 +68,7 @@ def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): ) # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. - # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # One enhancement might be to introduce metric_model=claude-sonnet-4-20250514 to each result at eval time. # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. with caplog.at_level(logging.WARNING): # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. @@ -112,7 +112,7 @@ def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): # Test that the new evaluation judge was correctly selected for the metric model downgrade. assert ( - "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-" in caplog.text ) From 9d9bca0467d1e4a9074e7a9264148b95151d9d09 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:26:12 -0400 Subject: [PATCH 26/55] Removed unnecessary duplicate path element in work directory. Readability improvement to support fix for Issue #29. Adding as individual commit in case it needs to be rolled back. --- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 45f5a2b..de3323c 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -258,7 +258,7 @@ def run_single_eval( # Create coder instance coder = create_coder( coder_name, - workdir=str(workdir / f"{model_name}_{coder_name}_{case.name}"), + workdir=str(workdir), config=coder_config, ) From bd474c9e614bbc539cac8cb49b0762ebee74a77d Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:42:57 -0400 Subject: [PATCH 27/55] Fix Issue #30. 
Goose supports an environment variable to disable using the system keyring for secrets (GOOSE_DISABLE_KEYRING). --- src/metacoder/coders/goose.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 6b0b5c0..d8723ea 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -145,6 +145,8 @@ def run(self, input_text: str) -> CoderOutput: env = self.expand_env(self.env) self.prepare_workdir() with change_directory(self.workdir): + # disable keyring (prevents errors on MacOS and Linux) + env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand env["HOME"] = "." From 142b8b8afca15be90fdd9a392fabb3c1ebf4de1f Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:47:56 -0400 Subject: [PATCH 28/55] Partially addresses Issue #29 Windows compatibility. Uses os.cwd() instead of unix-specific "." to specify current working directory. --- src/metacoder/coders/goose.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index d8723ea..ea91efc 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path import time import logging @@ -149,7 +150,7 @@ def run(self, input_text: str) -> CoderOutput: env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand - env["HOME"] = "." + env["HOME"] = os.getcwd() text = self.expand_prompt(input_text) command = ["goose", "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From b5faef3f22b75a5f552e99c0640f5ff07146d80c Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 17:51:05 -0400 Subject: [PATCH 29/55] Uses safer XDG_CONFIG_HOME instead of changing HOME environment variable to avoid interfering with unix environment (shell history, etc.). Separate commit in case this needs to be rolled back. --- src/metacoder/coders/goose.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index ea91efc..991ba09 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -150,7 +150,7 @@ def run(self, input_text: str) -> CoderOutput: env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand - env["HOME"] = os.getcwd() + env["XDG_CONFIG_HOME"] = os.getcwd() text = self.expand_prompt(input_text) command = ["goose", "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From 6d6ba8d59c756677e72f8c53dd9dc1914aa06de6 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:01:32 -0400 Subject: [PATCH 30/55] Changed informational log message to make it clear that a directory path is not being referenced, but rather a server combination. 
--- src/metacoder/evals/runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index de3323c..a80060a 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -484,7 +484,7 @@ def run_all_evals( else " (no servers)" ) logger.info( - f"Progress: {current}/{total_combinations} - {coder_name}/{model_name}/{case.name}{server_desc}" + f"Progress: {current}/{total_combinations} ({coder_name} | {model_name} | {case.name}{server_desc})" ) # Create unique workdir for this combination From 80772c2d99f5930f5f5bc3a29a6dbce5abf889c0 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:14:51 -0400 Subject: [PATCH 31/55] The Goose executable is now detected in a cross-platform way, and the full path information is propagated into the logs for easier debugging of the environment. --- src/metacoder/coders/goose.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 991ba09..931ef12 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -20,6 +20,13 @@ logger = logging.getLogger(__name__) +def find_goose() -> Path: + loc = shutil.which("goose") + if not loc: + raise FileNotFoundError("goose not found on PATH") + return Path(loc).resolve() + + class GooseCoder(BaseCoder): """ Note that running goose involves simulating a home directory in @@ -146,13 +153,16 @@ def run(self, input_text: str) -> CoderOutput: env = self.expand_env(self.env) self.prepare_workdir() with change_directory(self.workdir): + goose_path = find_goose() + logger.info(f"Using goose executable at: {goose_path}") + # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" # important - ensure that only local config files are used # we assue chdir has been called beforehand env["XDG_CONFIG_HOME"] = os.getcwd() text = self.expand_prompt(input_text) - command = ["goose", "run", "-t", text] + command = [str(goose_path), "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") # time the command start_time = time.time() From ef6337c187ca8a45df4130be6994f1fb42209902 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 18:25:53 -0400 Subject: [PATCH 32/55] Moved hard-coded values into variables in preparation for cross-platform support. Adjusted log level and cleaned up comment for readability. --- src/metacoder/coders/goose.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 931ef12..e4ad2a3 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -154,13 +154,18 @@ def run(self, input_text: str) -> CoderOutput: self.prepare_workdir() with change_directory(self.workdir): goose_path = find_goose() - logger.info(f"Using goose executable at: {goose_path}") + logger.debug(f"Using goose executable at: {goose_path}") # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" - # important - ensure that only local config files are used - # we assue chdir has been called beforehand - env["XDG_CONFIG_HOME"] = os.getcwd() + # Important: + # (1) ensure that only local config files are used; + # (2) assume chdir has been called beforehand. 
+ cwd = os.getcwd() + local_home_path = Path(cwd) + home_env_var = "XDG_CONFIG_HOME" + env[home_env_var] = str(local_home_path) + text = self.expand_prompt(input_text) command = [str(goose_path), "run", "-t", text] logger.info(f"πŸ¦† Running command: {' '.join(command)}") From 87d556d1133e96c4e6cafd0a6c9272b920f37a1e Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Fri, 5 Sep 2025 19:03:08 -0400 Subject: [PATCH 33/55] Added OS-specific Goose config folder structures. Replaced hard-coded paths with function call to generate correct path. Added OS-specific home directory environment variables. Added logging of Goose config path for confirmation with base coder. --- src/metacoder/coders/goose.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index e4ad2a3..382542c 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -1,5 +1,6 @@ import json import os +import platform from pathlib import Path import time import logging @@ -27,6 +28,14 @@ def find_goose() -> Path: return Path(loc).resolve() +def get_goose_config_path() -> str: + # OS-specific config layout + if platform.system().lower().startswith("win"): + return "Block\\goose\\config\\" + + return ".config/goose/" + + class GooseCoder(BaseCoder): """ Note that running goose involves simulating a home directory in @@ -140,7 +149,7 @@ def default_config_objects(self) -> list[CoderConfigObject]: return [ CoderConfigObject( file_type=FileType.YAML, - relative_path=".config/goose/config.yaml", + relative_path=get_goose_config_path() + "config.yaml", content=config_content, ) ] @@ -156,14 +165,29 @@ def run(self, input_text: str) -> CoderOutput: goose_path = find_goose() logger.debug(f"Using goose executable at: {goose_path}") + # Build environment with redirected config + # disable keyring (prevents errors on MacOS and Linux) env["GOOSE_DISABLE_KEYRING"] = "1" + # Important: # (1) ensure that only local config files are used; # (2) assume chdir has been called beforehand. cwd = os.getcwd() local_home_path = Path(cwd) - home_env_var = "XDG_CONFIG_HOME" + + # OS-specific config layout + goose_config_dir = local_home_path / get_goose_config_path() + # OS-specific home directory environment variable + if platform.system().lower().startswith("win"): + home_env_var = "APPDATA" + else: + home_env_var = "XDG_CONFIG_HOME" + + goose_cfg_path = goose_config_dir / "config.yaml" + + logger.info(f"Goose config: {goose_cfg_path}\n") + env[home_env_var] = str(local_home_path) text = self.expand_prompt(input_text) From a9113817bf69d0f2d6b06386a24f006037fa7805 Mon Sep 17 00:00:00 2001 From: Charles Parker Date: Thu, 18 Sep 2025 14:10:47 -0700 Subject: [PATCH 34/55] Refactored OS environment detection to create relative paths for coder configs consistently. 
--- src/metacoder/coders/base_coder.py | 8 ++- src/metacoder/coders/goose.py | 91 ++++++++++++++++++++----- tests/test_goose_paths.py | 105 +++++++++++++++++++++++++++++ 3 files changed, 187 insertions(+), 17 deletions(-) create mode 100644 tests/test_goose_paths.py diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index f2fa7d9..408a794 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -365,7 +365,8 @@ def prepare_workdir(self): if self.config_objects is None: self.config_objects = self.default_config_objects() - logger.info(f"πŸ“ Preparing workdir: {self.workdir}") + logger.info(f"πŸ“ Preparing workdir (relative): {self.workdir}") + logger.info(f" (resolved): {Path(self.workdir).resolve()}") with change_directory(self.workdir): # clear old config objects for path, _type in self.default_config_paths().items(): @@ -379,7 +380,10 @@ def prepare_workdir(self): path.unlink() logger.debug(f"πŸ”§ Writing config objects: {self.config_objects}") for config_object in self.config_objects: - path = Path(config_object.relative_path) + rel = Path(config_object.relative_path) + if rel.is_absolute(): + raise ValueError(f"Config object path must be relative: {rel}") + path = rel path.parent.mkdir(parents=True, exist_ok=True) logger.info( f"πŸ”§ Writing config object: {config_object.relative_path} type={config_object.file_type}" diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 382542c..d6c4b35 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -28,12 +28,71 @@ def find_goose() -> Path: return Path(loc).resolve() -def get_goose_config_path() -> str: - # OS-specific config layout +def get_home_env_var() -> str: + """ + Determine the environment variable Goose should treat as "home" + for locating configuration files. + + Windows: + Goose expects its configuration under: + %APPDATA%\\Block\\goose\\config\\ + Therefore, we override APPDATA to point into the working directory. + + Unix-like (Linux, macOS): + Goose follows the XDG Base Directory spec: + - If $XDG_CONFIG_HOME is set, config goes under: + $XDG_CONFIG_HOME/goose/config.yaml + - Otherwise it falls back to: + $HOME/.config/goose/config.yaml + + We mirror this behavior by checking whether XDG_CONFIG_HOME is set + in the environment. If it is set, return "XDG_CONFIG_HOME"; + otherwise, return "HOME". + + Returns: + str: The environment variable name that should be overridden to + redirect Goose’s config into the working directory. + """ if platform.system().lower().startswith("win"): - return "Block\\goose\\config\\" + return "APPDATA" + + if "XDG_CONFIG_HOME" in os.environ and os.environ["XDG_CONFIG_HOME"]: + return "XDG_CONFIG_HOME" + return "HOME" + + +def get_goose_config_path() -> Path: + """ + Get the relative config path (from the simulated home directory) + where Goose expects its configuration, based on the home + environment variable chosen by get_home_env_var(). + + Returns: + pathlib.Path: The relative config directory path. 
+ + Behavior: + - If get_home_env_var() == "APPDATA": + Path -> "Block/goose/config/" + (matches %APPDATA%\\Block\\goose\\config\\ on Windows) + + - If get_home_env_var() == "HOME": + Path -> ".config/goose/" + (matches $HOME/.config/goose/ on Unix-like systems) + + - If get_home_env_var() == "XDG_CONFIG_HOME": + Path -> "goose/" + (matches $XDG_CONFIG_HOME/goose/ on Unix-like systems) + """ + home_env_var = get_home_env_var() - return ".config/goose/" + if home_env_var == "APPDATA": + return Path("Block/goose/config/") + elif home_env_var == "HOME": + return Path(".config/goose/") + elif home_env_var == "XDG_CONFIG_HOME": + return Path("goose/") + else: + raise RuntimeError(f"Unhandled home env var: {home_env_var}") class GooseCoder(BaseCoder): @@ -66,6 +125,11 @@ def mcp_config_to_goose_extension(self, mcp: MCPConfig) -> dict: "type": "stdio" if mcp.type == MCPType.STDIO else mcp.type.value, } + is_stdio = mcp.type == MCPType.STDIO + + if is_stdio and not mcp.command: + raise ValueError("STDIO MCP configuration requires 'command'.") + if mcp.description: extension["description"] = mcp.description @@ -146,10 +210,12 @@ def default_config_objects(self) -> list[CoderConfigObject]: config_content["extensions"] = extensions + cfg_rel = get_goose_config_path() / "config.yaml" + return [ CoderConfigObject( file_type=FileType.YAML, - relative_path=get_goose_config_path() + "config.yaml", + relative_path=str(cfg_rel), content=config_content, ) ] @@ -177,18 +243,13 @@ def run(self, input_text: str) -> CoderOutput: local_home_path = Path(cwd) # OS-specific config layout - goose_config_dir = local_home_path / get_goose_config_path() - # OS-specific home directory environment variable - if platform.system().lower().startswith("win"): - home_env_var = "APPDATA" - else: - home_env_var = "XDG_CONFIG_HOME" + home_env_var = get_home_env_var() + env[home_env_var] = str(local_home_path) + goose_config_dir = local_home_path / get_goose_config_path() goose_cfg_path = goose_config_dir / "config.yaml" - - logger.info(f"Goose config: {goose_cfg_path}\n") - - env[home_env_var] = str(local_home_path) + logger.info(f"Goose home var: {home_env_var} -> {env[home_env_var]}") + logger.info(f"Goose config (expected at): {goose_cfg_path}") text = self.expand_prompt(input_text) command = [str(goose_path), "run", "-t", text] diff --git a/tests/test_goose_paths.py b/tests/test_goose_paths.py new file mode 100644 index 0000000..5a5fb53 --- /dev/null +++ b/tests/test_goose_paths.py @@ -0,0 +1,105 @@ +from pathlib import Path +import pytest + +from metacoder.coders.goose import get_home_env_var, get_goose_config_path + + +def _norm(p: Path | str) -> str: + """Normalize path separators & strip trailing slashes for stable compares.""" + s = str(p).replace("\\", "/") + return s[:-1] if s.endswith("/") else s + + +@pytest.mark.parametrize( + "platform_name, xdg_value, expected_env", + [ + ("Windows", None, "APPDATA"), + ("Linux", None, "HOME"), + ("Darwin", None, "HOME"), + ("Linux", "/custom/xdg", "XDG_CONFIG_HOME"), + ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME"), + ], +) +def test_env_var_selection(monkeypatch, platform_name, xdg_value, expected_env): + # Simulate platform + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + # Simulate XDG presence/absence + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + actual = get_home_env_var() + assert actual == expected_env + + 
+@pytest.mark.parametrize( + "platform_name, xdg_value, expected_env, expected_rel_dir", + [ + ("Windows", None, "APPDATA", "Block/goose/config"), + ("Linux", None, "HOME", ".config/goose"), + ("Darwin", None, "HOME", ".config/goose"), + ("Linux", "/custom/xdg", "XDG_CONFIG_HOME", "goose"), + ("Darwin", "/Users/alice/.conf", "XDG_CONFIG_HOME", "goose"), + ], +) +def test_config_path_matches_env( + monkeypatch, platform_name, xdg_value, expected_env, expected_rel_dir +): + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + env_var = get_home_env_var() + rel_path = get_goose_config_path() + + assert env_var == expected_env + assert _norm(rel_path) == expected_rel_dir + + +@pytest.mark.parametrize( + "platform_name, xdg_value, workdir, expected_effective_dir", + [ + ("Windows", None, "C:/tmp/work", "C:/tmp/work/Block/goose/config/config.yaml"), + ("Linux", None, "/tmp/work", "/tmp/work/.config/goose/config.yaml"), + ( + "Darwin", + None, + "/Users/alice/work", + "/Users/alice/work/.config/goose/config.yaml", + ), + ("Linux", "/custom/xdg", "/tmp/work", "/tmp/work/goose/config.yaml"), + ( + "Darwin", + "/Users/alice/.conf", + "/Users/alice/work", + "/Users/alice/work/goose/config.yaml", + ), + ], +) +def test_effective_config_location( + monkeypatch, platform_name, xdg_value, workdir, expected_effective_dir +): + import platform as _platform + + monkeypatch.setattr(_platform, "system", lambda: platform_name) + + if xdg_value is not None: + monkeypatch.setenv("XDG_CONFIG_HOME", xdg_value) + else: + monkeypatch.delenv("XDG_CONFIG_HOME", raising=False) + + local_home_path = Path(workdir) + + goose_config_dir = local_home_path / get_goose_config_path() + goose_cfg_file = goose_config_dir / "config.yaml" + + assert _norm(goose_cfg_file) == _norm(expected_effective_dir) From a7429aad61794d69d1331bb69b628f89bc4f5bb0 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 24 Nov 2025 15:57:27 -0500 Subject: [PATCH 35/55] Add custom GEval support with criteria, rubric, and evaluation_steps - Add MetricConfig class with support for custom criteria, rubric, and evaluation_steps - Add RubricItem class for structured rubric definitions - Implement make_custom_geval() to create GEval instances from MetricConfig - Add model_validator to enforce mutual exclusivity of criteria/evaluation_steps - Update EvalCase to accept both string and MetricConfig metric specifications - Fix make_geval() bug: remove duplicate criteria parameter (keep evaluation_steps only) - Improve error handling in Claude coder (warning instead of exception) - Add comprehensive test suite for MetricConfig validation Addresses limitations with generic evaluation criteria not suitable for: - Exact text extraction tasks - Specific metadata field retrieval - Binary decision evaluations The custom GEval feature allows test-specific evaluation criteria for better accuracy and reliability. evaluation_steps provides more control than criteria according to DeepEval documentation. 
Tests: Added test_custom_geval.py with 7 test cases covering all validation scenarios --- src/metacoder/coders/claude.py | 3 +- src/metacoder/evals/eval_model.py | 77 ++++++++++++++++++++++-- src/metacoder/evals/runner.py | 97 +++++++++++++++++++++++++++---- tests/test_custom_geval.py | 70 ++++++++++++++++++++++ 4 files changed, 232 insertions(+), 15 deletions(-) create mode 100644 tests/test_custom_geval.py diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py index 1a43295..d380032 100644 --- a/src/metacoder/coders/claude.py +++ b/src/metacoder/coders/claude.py @@ -260,5 +260,6 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: f"Claude authentication failed. Try setting ANTHROPIC_AUTH_TOKEN environment variable or run 'claude setup-token'. " f"For custom endpoints, also set ANTHROPIC_BASE_URL. Original error: {ao.stderr} // {ao}" ) - raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}") + # Don't raise for other errors - let evaluation continue and mark test as failed + logger.warning(f"Claude returned error (test will be marked as failed): {ao.result_text}") return ao diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py index 471c13d..d3905cb 100644 --- a/src/metacoder/evals/eval_model.py +++ b/src/metacoder/evals/eval_model.py @@ -1,8 +1,49 @@ -from typing import Any, Optional, List, Dict -from pydantic import BaseModel, Field +from typing import Any, Optional, List, Dict, Union +from pydantic import BaseModel, Field, field_validator, model_validator from metacoder.configuration import AIModelConfig, MCPConfig +class RubricItem(BaseModel): + """A single rubric scoring guideline.""" + + score: float = Field(..., description="Score value (typically 0.0 or 1.0)") + criteria: str = Field(..., description="Criteria for this score") + + +class MetricConfig(BaseModel): + """Configuration for a metric with optional custom rubric.""" + + name: str = Field(..., description="Metric name (e.g., CorrectnessMetric)") + rubric: Optional[List[RubricItem]] = Field( + default=None, description="Custom rubric for evaluation" + ) + criteria: Optional[str] = Field( + default=None, description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)" + ) + evaluation_steps: Optional[List[str]] = Field( + default=None, description="Custom evaluation steps (mutually exclusive with criteria)" + ) + + @model_validator(mode='after') + def validate_mutual_exclusivity(self): + """Ensure criteria and evaluation_steps are mutually exclusive and at least one is provided.""" + # Check mutual exclusivity + if self.criteria is not None and self.evaluation_steps is not None: + raise ValueError( + "Cannot specify both 'criteria' and 'evaluation_steps'. " + "Use one or the other. evaluation_steps provides more control, " + "while criteria auto-generates steps." + ) + + # Check that at least one is provided + if self.criteria is None and self.evaluation_steps is None and self.rubric is None: + raise ValueError( + "Must provide at least one of: criteria, evaluation_steps, or rubric" + ) + + return self + + class EvalCase(BaseModel): """ A single evaluation test case. 
@@ -18,15 +59,43 @@ class EvalCase(BaseModel): expected_output: "Example Paper Title" threshold: 0.9 ``` + + Example with custom rubric: + ```yaml + name: "retraction_check" + metrics: + - name: CorrectnessMetric + rubric: + - score: 0.0 + criteria: "Output indicates paper not retracted" + - score: 1.0 + criteria: "Output indicates paper is retracted" + input: "Is PMC4831113 retracted?" + expected_output: "Yes" + ``` + + Example with custom evaluation_steps: + ```yaml + name: "exact_text_extraction" + metrics: + - name: CorrectnessMetric + evaluation_steps: + - "Check whether the actual output contains the exact text from expected output" + - "Heavily penalize any deviation, paraphrasing, or added explanations" + - "The text must be verbatim, not approximate" + input: "What is the first sentence of section 2?" + expected_output: "This is the exact sentence." + threshold: 0.9 + ``` """ name: str = Field(..., description="Unique identifier for the test case") group: Optional[str] = Field( default="Default", description="Test category for result grouping." ) - metrics: List[str] = Field( + metrics: List[Union[str, MetricConfig]] = Field( ..., - description="List of metric names to apply (e.g., CorrectnessMetric, FaithfulnessMetric)", + description="List of metric names or metric configurations with custom rubrics", ) input: str = Field( ..., description="The prompt or question to send to the AI coder" diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index a80060a..f65ca03 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -28,7 +28,7 @@ from metacoder.coders.base_coder import BaseCoder, CoderOutput from metacoder.registry import AVAILABLE_CODERS -from metacoder.evals.eval_model import EvalCase, EvalDataset +from metacoder.evals.eval_model import EvalCase, EvalDataset, MetricConfig from metacoder.configuration import AIModelConfig, CoderConfig logger = logging.getLogger(__name__) @@ -65,11 +65,14 @@ def is_successful(self) -> bool: def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: - """Creates a GEval instance with the specified model.""" + """Creates a GEval instance with the specified model. + + Uses evaluation_steps (not criteria) for more reliable scoring across runs. + """ return GEval( name="Correctness", - criteria="Determine whether the actual output is factually correct based on the expected output.", # NOTE: you can only provide either criteria or evaluation_steps, and not both + # Using evaluation_steps for more control and reliability evaluation_steps=[ "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", "You should also heavily penalize omission of detail", @@ -85,6 +88,63 @@ def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: ) +def make_custom_geval( + metric_config: MetricConfig, model: Optional[DeepEvalBaseLLM] = None +) -> GEval: + """Creates a GEval instance with custom criteria/rubric/evaluation_steps from MetricConfig. + + Args: + metric_config: Configuration with custom evaluation parameters + model: Optional LLM model (defaults to OpenAI GPT-4) + + Returns: + Configured GEval instance + + Note: + criteria and evaluation_steps are mutually exclusive (enforced by MetricConfig validator). + evaluation_steps provides more control and reliability, while criteria auto-generates steps. 
+ """ + from deepeval.metrics.g_eval.utils import Rubric + + # Convert rubric if provided + rubrics = [] + if metric_config.rubric: + for item in metric_config.rubric: + rubrics.append( + Rubric( + score_range=(item.score, item.score), + expected_outcome=item.criteria, + ) + ) + + # Build kwargs for GEval + kwargs = { + "name": metric_config.name, + "evaluation_params": [ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + "model": model, + } + + # Add evaluation_steps OR criteria (mutually exclusive) + # Note: Pydantic validator already ensures mutual exclusivity + if metric_config.evaluation_steps: + kwargs["evaluation_steps"] = metric_config.evaluation_steps + elif metric_config.criteria: + kwargs["criteria"] = metric_config.criteria + else: + # Default criteria if only rubric provided + kwargs["criteria"] = "Evaluate the actual output based on the rubric criteria." + + # Add rubric if provided + if rubrics: + kwargs["rubric"] = rubrics + + return GEval(**kwargs) + + def get_default_metrics( model: Optional[DeepEvalBaseLLM] = None, ) -> Dict[str, BaseMetric]: @@ -282,14 +342,31 @@ def run_single_eval( execution_time = time.time() - start_time # Run each metric - for metric_name in case.metrics: - default_metrics = get_default_metrics() - if metric_name in default_metrics: - metric = default_metrics[metric_name] + for metric_item in case.metrics: + # Handle both string metrics and MetricConfig objects + if isinstance(metric_item, str): + # Original behavior: string metric name + metric_name = metric_item + metric_config = None + else: + # New behavior: MetricConfig object with potential custom rubric + metric_name = metric_item.name + metric_config = metric_item + + # Create the metric instance + if metric_config and (metric_config.rubric or metric_config.criteria or metric_config.evaluation_steps): + # Use custom configuration if provided + logger.info(f"Using custom configuration for {metric_name}") + metric = make_custom_geval(metric_config, model=None) else: - # Get metric class and instantiate - metric_class = self.get_metric_class(metric_name) - metric = metric_class(threshold=case.threshold) # type: ignore + # Use default metric behavior + default_metrics = get_default_metrics() + if metric_name in default_metrics: + metric = default_metrics[metric_name] + else: + # Get metric class and instantiate + metric_class = self.get_metric_class(metric_name) + metric = metric_class(threshold=case.threshold) # type: ignore # Create test case test_case = self.create_test_case(case, actual_output) diff --git a/tests/test_custom_geval.py b/tests/test_custom_geval.py new file mode 100644 index 0000000..83cdbf1 --- /dev/null +++ b/tests/test_custom_geval.py @@ -0,0 +1,70 @@ +"""Test custom GEval metric configuration.""" + +import pytest +from metacoder.evals.eval_model import MetricConfig, RubricItem + + +def test_evaluation_steps_only(): + """Test creating MetricConfig with only evaluation_steps.""" + m = MetricConfig( + name="CorrectnessMetric", evaluation_steps=["Check if output is correct"] + ) + assert m.evaluation_steps == ["Check if output is correct"] + assert m.criteria is None + assert m.rubric is None + + +def test_criteria_only(): + """Test creating MetricConfig with only criteria.""" + m = MetricConfig(name="CorrectnessMetric", criteria="Check correctness") + assert m.criteria == "Check correctness" + assert m.evaluation_steps is None + assert m.rubric is None + + +def test_rubric_only(): + """Test creating MetricConfig with only 
rubric.""" + rubric = [ + RubricItem(score=0.0, criteria="Wrong"), + RubricItem(score=1.0, criteria="Correct"), + ] + m = MetricConfig(name="CorrectnessMetric", rubric=rubric) + assert len(m.rubric) == 2 + assert m.criteria is None + assert m.evaluation_steps is None + + +def test_criteria_and_evaluation_steps_mutually_exclusive(): + """Test that providing both criteria and evaluation_steps raises ValueError.""" + with pytest.raises(ValueError, match="Cannot specify both"): + MetricConfig( + name="CorrectnessMetric", + criteria="Check correctness", + evaluation_steps=["Step 1"], + ) + + +def test_requires_at_least_one(): + """Test that at least one of criteria/evaluation_steps/rubric is required.""" + with pytest.raises(ValueError, match="Must provide at least one"): + MetricConfig(name="CorrectnessMetric") + + +def test_criteria_with_rubric(): + """Test that criteria can be combined with rubric.""" + rubric = [RubricItem(score=0.0, criteria="Wrong")] + m = MetricConfig( + name="CorrectnessMetric", criteria="Check correctness", rubric=rubric + ) + assert m.criteria == "Check correctness" + assert len(m.rubric) == 1 + + +def test_evaluation_steps_with_rubric(): + """Test that evaluation_steps can be combined with rubric.""" + rubric = [RubricItem(score=1.0, criteria="Correct")] + m = MetricConfig( + name="CorrectnessMetric", evaluation_steps=["Step 1"], rubric=rubric + ) + assert m.evaluation_steps == ["Step 1"] + assert len(m.rubric) == 1 From 3e90e9bac9fa2e2fd3f62637ca82f7ecfe0b4cca Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 24 Nov 2025 16:05:52 -0500 Subject: [PATCH 36/55] Fix ruff linting error: remove unused field_validator import --- src/metacoder/evals/eval_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py index d3905cb..6a60f03 100644 --- a/src/metacoder/evals/eval_model.py +++ b/src/metacoder/evals/eval_model.py @@ -1,5 +1,5 @@ from typing import Any, Optional, List, Dict, Union -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, Field, model_validator from metacoder.configuration import AIModelConfig, MCPConfig From 3bf28166ea4c6911ede726644878300faa1dcc15 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 26 Nov 2025 17:32:26 -0500 Subject: [PATCH 37/55] Fix GeminiCoder CLI invocation to use positional arguments The previous implementation used echo|gemini which fails because the gemini CLI expects prompts as positional arguments, not stdin. Changes: - Changed from: echo "prompt" | gemini - Changed to: gemini "prompt" - Avoids shell escaping issues - Works correctly with MCP servers Testing verified basic invocation and MCP config work correctly. 
--- src/metacoder/coders/gemini.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 6af35c4..5fab08d 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -136,9 +136,9 @@ def run(self, input_text: str) -> CoderOutput: text = self.expand_prompt(input_text) # Build the command - # The gemini CLI uses conversational interface, so we need to handle it differently - # For now, we'll use echo to pipe the prompt - command = ["sh", "-c", f'echo "{text}" | gemini'] + # The gemini CLI accepts the prompt as a positional argument + # We pass it directly as an argument to avoid shell escaping issues + command = ["gemini", text] logger.info("πŸ’Ž Running command: gemini with prompt") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") From 808e0ae2803e2c7abe05b4971fbf5a93ab80faba Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 1 Dec 2025 11:46:39 -0500 Subject: [PATCH 38/55] Copy OAuth credentials to workdir for gemini CLI Gemini CLI requires OAuth credentials which are normally stored in ~/.gemini/oauth_creds.json. Since we set HOME=. in the run() method to read MCP config from the workdir, we need to also copy the OAuth credentials to workdir/.gemini/oauth_creds.json so gemini can find them. --- src/metacoder/coders/gemini.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 5fab08d..b350019 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -4,6 +4,8 @@ import logging import shutil import re +import json +import os from typing import Any from metacoder.coders.base_coder import ( @@ -117,6 +119,25 @@ def default_config_objects(self) -> list[CoderConfigObject]: ) ) + # Copy OAuth credentials from user's home directory if they exist + # Gemini CLI uses OAuth for authentication, and credentials are stored + # in ~/.gemini/oauth_creds.json. Since we set HOME="." in run(), + # we need to copy these credentials to the workdir. + oauth_creds_path = Path(os.path.expanduser("~/.gemini/oauth_creds.json")) + if oauth_creds_path.exists(): + try: + with open(oauth_creds_path, "r") as f: + oauth_creds = json.load(f) + config_objects.append( + CoderConfigObject( + file_type=FileType.JSON, + relative_path=".gemini/oauth_creds.json", + content=oauth_creds, + ) + ) + except (IOError, json.JSONDecodeError) as e: + logger.warning(f"Failed to load OAuth credentials from {oauth_creds_path}: {e}") + # Add GEMINI.md if present in config # This could contain instructions specific to the task From b2bcfffb7e625c8ff7630b91c8a52f6eea77fb19 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 1 Dec 2025 12:03:06 -0500 Subject: [PATCH 39/55] Remove HOME="." override - gemini works fine without it The gemini CLI reads .gemini/settings.json from the current directory and OAuth credentials from ~/.gemini/oauth_creds.json. Setting HOME="." breaks OAuth authentication while providing no benefit. 
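Assuming the user has already authenticated the gemini CLI locally, the workdir
is expected to end up with roughly this layout (a sketch, not an exhaustive
listing):

    workdir/
      .gemini/
        settings.json      # MCP server configuration written by GeminiCoder
        oauth_creds.json   # copied from ~/.gemini/oauth_creds.json
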
--- .claude/settings.local.json | 16 ++ .deepeval/.deepeval_telemetry.txt | 4 + .../.config/goose/config.yaml | 34 ++++ src/metacoder/coders/gemini.py | 24 --- test_gemini_fix.py | 183 ++++++++++++++++++ 5 files changed, 237 insertions(+), 24 deletions(-) create mode 100644 .claude/settings.local.json create mode 100644 .deepeval/.deepeval_telemetry.txt create mode 100644 eval_workdir/gpt-4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml create mode 100644 test_gemini_fix.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..974117e --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,16 @@ +{ + "permissions": { + "allow": [ + "Bash(gh pr view:*)", + "Bash(gh pr checkout:*)", + "Bash(git stash:*)", + "Bash(uv sync:*)", + "Bash(make test:*)", + "Bash(uv run pytest:*)", + "Bash(uv run:*)", + "Bash(make:*)" + ], + "deny": [], + "ask": [] + } +} \ No newline at end of file diff --git a/.deepeval/.deepeval_telemetry.txt b/.deepeval/.deepeval_telemetry.txt new file mode 100644 index 0000000..e16ea39 --- /dev/null +++ b/.deepeval/.deepeval_telemetry.txt @@ -0,0 +1,4 @@ +DEEPEVAL_ID=8f34a7cd-a262-4e39-90b4-f3833cfcfd73 +DEEPEVAL_STATUS=old +DEEPEVAL_LAST_FEATURE=evaluation +DEEPEVAL_EVALUATION_STATUS=old diff --git a/eval_workdir/gpt-4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml b/eval_workdir/gpt-4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml new file mode 100644 index 0000000..b4d5a3a --- /dev/null +++ b/eval_workdir/gpt-4o_goose_disease_mcp-simple-pubmed_ols-mcp/gpt-4o_goose_disease/.config/goose/config.yaml @@ -0,0 +1,34 @@ +GOOSE_MODEL: claude-sonnet-4-20250514 +GOOSE_PROVIDER: anthropic +extensions: + developer: + bundled: true + display_name: Developer + enabled: true + name: developer + timeout: 300 + type: builtin + ols: + args: + - mcp-ols + bundled: null + cmd: uvx + enabled: true + env_keys: [] + envs: {} + name: ols + timeout: 300 + type: stdio + pubmed: + args: + - mcp-simple-pubmed + bundled: null + cmd: uvx + enabled: true + env_keys: + - PUBMED_EMAIL + envs: + PUBMED_EMAIL: cjmungall@lbl.gov + name: pubmed + timeout: 300 + type: stdio diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index b350019..d29bc6c 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -4,8 +4,6 @@ import logging import shutil import re -import json -import os from typing import Any from metacoder.coders.base_coder import ( @@ -119,25 +117,6 @@ def default_config_objects(self) -> list[CoderConfigObject]: ) ) - # Copy OAuth credentials from user's home directory if they exist - # Gemini CLI uses OAuth for authentication, and credentials are stored - # in ~/.gemini/oauth_creds.json. Since we set HOME="." in run(), - # we need to copy these credentials to the workdir. 
- oauth_creds_path = Path(os.path.expanduser("~/.gemini/oauth_creds.json")) - if oauth_creds_path.exists(): - try: - with open(oauth_creds_path, "r") as f: - oauth_creds = json.load(f) - config_objects.append( - CoderConfigObject( - file_type=FileType.JSON, - relative_path=".gemini/oauth_creds.json", - content=oauth_creds, - ) - ) - except (IOError, json.JSONDecodeError) as e: - logger.warning(f"Failed to load OAuth credentials from {oauth_creds_path}: {e}") - # Add GEMINI.md if present in config # This could contain instructions specific to the task @@ -151,9 +130,6 @@ def run(self, input_text: str) -> CoderOutput: self.prepare_workdir() with change_directory(self.workdir): - # Gemini expects HOME to be current directory for config - env["HOME"] = "." - text = self.expand_prompt(input_text) # Build the command diff --git a/test_gemini_fix.py b/test_gemini_fix.py new file mode 100644 index 0000000..a9ee6c4 --- /dev/null +++ b/test_gemini_fix.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +""" +Test script to verify the GeminiCoder fix works correctly. +This tests that gemini CLI is invoked with positional arguments instead of stdin piping. +""" +import tempfile +import shutil +from pathlib import Path +import sys +import json + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent / "src")) + +from metacoder.coders.gemini import GeminiCoder +from metacoder.configuration import RunConfig, MCPConfig, MCPType + +def test_gemini_simple_prompt(): + """Test that gemini can handle a simple prompt correctly.""" + print("=" * 60) + print("TEST 1: Simple arithmetic prompt") + print("=" * 60) + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "gemini_test" + workdir.mkdir() + + # Create minimal config + config = RunConfig( + workdir=workdir, + extensions=[], + ) + + coder = GeminiCoder(config=config, workdir=workdir) + + # Test with simple prompt + result = coder.run("What is 2+2? 
Just give the number.") + + print(f"Success: {result.success}") + print(f"Stdout: {result.stdout}") + print(f"Stderr: {result.stderr}") + print(f"Result text: {result.result_text}") + + if not result.success: + print("❌ TEST FAILED: Command did not succeed") + return False + + if "4" not in result.result_text: + print(f"❌ TEST FAILED: Expected '4' in result, got: {result.result_text}") + return False + + print("βœ… TEST PASSED: Simple prompt works") + return True + +def test_gemini_with_mcp(): + """Test that gemini works with MCP server configured.""" + print("\n" + "=" * 60) + print("TEST 2: Gemini with MCP server (artl)") + print("=" * 60) + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "gemini_mcp_test" + workdir.mkdir() + + # Create config with MCP + mcp = MCPConfig( + name="artl", + type=MCPType.STDIO, + command="uvx", + args=["artl-mcp"], + enabled=True, + ) + + config = RunConfig( + workdir=workdir, + extensions=[mcp], + ) + + coder = GeminiCoder(config=config, workdir=workdir) + + # Verify settings.json was created + settings_file = workdir / ".gemini" / "settings.json" + if not settings_file.exists(): + print(f"❌ TEST FAILED: settings.json not created at {settings_file}") + return False + + with open(settings_file) as f: + settings = json.load(f) + + if "mcpServers" not in settings: + print(f"❌ TEST FAILED: mcpServers not in settings.json: {settings}") + return False + + if "artl" not in settings["mcpServers"]: + print(f"❌ TEST FAILED: artl server not configured: {settings}") + return False + + print(f"βœ… MCP configuration verified: {json.dumps(settings, indent=2)}") + + # Test with simple prompt (MCP may not be available in test env, but command should work) + result = coder.run("What is 3+3? Just give the number.") + + print(f"Success: {result.success}") + print(f"Stdout: {result.stdout}") + + if not result.success: + print("❌ TEST FAILED: Command with MCP config did not succeed") + return False + + print("βœ… TEST PASSED: Gemini works with MCP configuration") + return True + +def test_gemini_special_characters(): + """Test that gemini handles prompts with special characters correctly.""" + print("\n" + "=" * 60) + print("TEST 3: Prompt with special characters") + print("=" * 60) + + with tempfile.TemporaryDirectory() as tmpdir: + workdir = Path(tmpdir) / "gemini_special_test" + workdir.mkdir() + + config = RunConfig( + workdir=workdir, + extensions=[], + ) + + coder = GeminiCoder(config=config, workdir=workdir) + + # Test with special characters that would break shell escaping + test_prompt = 'Say "Hello!" with quotes and $special chars' + result = coder.run(test_prompt) + + print(f"Success: {result.success}") + print(f"Stdout: {result.stdout[:200]}") # First 200 chars + + if not result.success: + print("❌ TEST FAILED: Special characters broke the command") + return False + + print("βœ… TEST PASSED: Special characters handled correctly") + return True + +def main(): + # Check if gemini is available + if not shutil.which("gemini"): + print("⚠️ WARNING: gemini CLI not found. 
Install with: npm install -g @google/gemini-cli") + sys.exit(1) + + print("Testing GeminiCoder fix...\n") + + tests = [ + test_gemini_simple_prompt, + test_gemini_with_mcp, + test_gemini_special_characters, + ] + + results = [] + for test in tests: + try: + results.append(test()) + except Exception as e: + print(f"❌ TEST FAILED with exception: {e}") + import traceback + traceback.print_exc() + results.append(False) + + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + passed = sum(results) + total = len(results) + print(f"Passed: {passed}/{total}") + + if all(results): + print("\nβœ… ALL TESTS PASSED - Fix is working correctly!") + sys.exit(0) + else: + print("\n❌ SOME TESTS FAILED - Fix needs more work") + sys.exit(1) + +if __name__ == "__main__": + main() From 997e9a9cf77cccc057b4621b0a940db573d347df Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 1 Dec 2025 17:10:34 -0500 Subject: [PATCH 40/55] Fix: Prevent process hangs by closing stdin in run_process() The gemini CLI (and potentially other CLIs) were hanging because they wait for stdin input. This fix adds stdin=PIPE to the Popen call and immediately closes it, which is equivalent to redirecting from /dev/null. This prevents any subprocess from blocking while waiting for stdin input, without affecting functionality since run_process() doesn't support passing stdin data - all coders pass prompts via command-line arguments. Fixes the hanging issue reported with GeminiCoder. --- src/metacoder/coders/base_coder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index 408a794..5d16645 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -179,6 +179,7 @@ def run_process( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + stdin=subprocess.PIPE, # Provide stdin to prevent hanging on processes that read it text=True, encoding="utf-8", errors="replace", # avoid crashes on the occasional bad byte @@ -186,6 +187,9 @@ def run_process( bufsize=1, universal_newlines=True, ) + # Close stdin immediately since we don't write to it + # This prevents subprocesses from hanging while waiting for input + process.stdin.close() stdout_lines: list[str] = [] stderr_lines: list[str] = [] From 009b4c61ec5055d52d2a1149f1965341295b181a Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 1 Dec 2025 17:21:53 -0500 Subject: [PATCH 41/55] Fix ruff formatting issues --- src/metacoder/coders/claude.py | 4 +++- src/metacoder/evals/eval_model.py | 14 ++++++++++---- src/metacoder/evals/runner.py | 6 +++++- test_gemini_fix.py | 11 ++++++++++- 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py index d380032..8cc625b 100644 --- a/src/metacoder/coders/claude.py +++ b/src/metacoder/coders/claude.py @@ -261,5 +261,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: f"For custom endpoints, also set ANTHROPIC_BASE_URL. 
Original error: {ao.stderr} // {ao}" ) # Don't raise for other errors - let evaluation continue and mark test as failed - logger.warning(f"Claude returned error (test will be marked as failed): {ao.result_text}") + logger.warning( + f"Claude returned error (test will be marked as failed): {ao.result_text}" + ) return ao diff --git a/src/metacoder/evals/eval_model.py b/src/metacoder/evals/eval_model.py index 6a60f03..86acae9 100644 --- a/src/metacoder/evals/eval_model.py +++ b/src/metacoder/evals/eval_model.py @@ -18,13 +18,15 @@ class MetricConfig(BaseModel): default=None, description="Custom rubric for evaluation" ) criteria: Optional[str] = Field( - default=None, description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)" + default=None, + description="Custom criteria for evaluation (mutually exclusive with evaluation_steps)", ) evaluation_steps: Optional[List[str]] = Field( - default=None, description="Custom evaluation steps (mutually exclusive with criteria)" + default=None, + description="Custom evaluation steps (mutually exclusive with criteria)", ) - @model_validator(mode='after') + @model_validator(mode="after") def validate_mutual_exclusivity(self): """Ensure criteria and evaluation_steps are mutually exclusive and at least one is provided.""" # Check mutual exclusivity @@ -36,7 +38,11 @@ def validate_mutual_exclusivity(self): ) # Check that at least one is provided - if self.criteria is None and self.evaluation_steps is None and self.rubric is None: + if ( + self.criteria is None + and self.evaluation_steps is None + and self.rubric is None + ): raise ValueError( "Must provide at least one of: criteria, evaluation_steps, or rubric" ) diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index f65ca03..46d4142 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -354,7 +354,11 @@ def run_single_eval( metric_config = metric_item # Create the metric instance - if metric_config and (metric_config.rubric or metric_config.criteria or metric_config.evaluation_steps): + if metric_config and ( + metric_config.rubric + or metric_config.criteria + or metric_config.evaluation_steps + ): # Use custom configuration if provided logger.info(f"Using custom configuration for {metric_name}") metric = make_custom_geval(metric_config, model=None) diff --git a/test_gemini_fix.py b/test_gemini_fix.py index a9ee6c4..d85ca3a 100644 --- a/test_gemini_fix.py +++ b/test_gemini_fix.py @@ -3,6 +3,7 @@ Test script to verify the GeminiCoder fix works correctly. This tests that gemini CLI is invoked with positional arguments instead of stdin piping. 
""" + import tempfile import shutil from pathlib import Path @@ -15,6 +16,7 @@ from metacoder.coders.gemini import GeminiCoder from metacoder.configuration import RunConfig, MCPConfig, MCPType + def test_gemini_simple_prompt(): """Test that gemini can handle a simple prompt correctly.""" print("=" * 60) @@ -52,6 +54,7 @@ def test_gemini_simple_prompt(): print("βœ… TEST PASSED: Simple prompt works") return True + def test_gemini_with_mcp(): """Test that gemini works with MCP server configured.""" print("\n" + "=" * 60) @@ -110,6 +113,7 @@ def test_gemini_with_mcp(): print("βœ… TEST PASSED: Gemini works with MCP configuration") return True + def test_gemini_special_characters(): """Test that gemini handles prompts with special characters correctly.""" print("\n" + "=" * 60) @@ -141,10 +145,13 @@ def test_gemini_special_characters(): print("βœ… TEST PASSED: Special characters handled correctly") return True + def main(): # Check if gemini is available if not shutil.which("gemini"): - print("⚠️ WARNING: gemini CLI not found. Install with: npm install -g @google/gemini-cli") + print( + "⚠️ WARNING: gemini CLI not found. Install with: npm install -g @google/gemini-cli" + ) sys.exit(1) print("Testing GeminiCoder fix...\n") @@ -162,6 +169,7 @@ def main(): except Exception as e: print(f"❌ TEST FAILED with exception: {e}") import traceback + traceback.print_exc() results.append(False) @@ -179,5 +187,6 @@ def main(): print("\n❌ SOME TESTS FAILED - Fix needs more work") sys.exit(1) + if __name__ == "__main__": main() From 45fb77c7ed2fae4e98ee47ace35d94880f56df0e Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Mon, 1 Dec 2025 17:26:27 -0500 Subject: [PATCH 42/55] Remove debug test file that was causing import errors --- test_gemini_fix.py | 192 --------------------------------------------- 1 file changed, 192 deletions(-) delete mode 100644 test_gemini_fix.py diff --git a/test_gemini_fix.py b/test_gemini_fix.py deleted file mode 100644 index d85ca3a..0000000 --- a/test_gemini_fix.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script to verify the GeminiCoder fix works correctly. -This tests that gemini CLI is invoked with positional arguments instead of stdin piping. -""" - -import tempfile -import shutil -from pathlib import Path -import sys -import json - -# Add src to path -sys.path.insert(0, str(Path(__file__).parent / "src")) - -from metacoder.coders.gemini import GeminiCoder -from metacoder.configuration import RunConfig, MCPConfig, MCPType - - -def test_gemini_simple_prompt(): - """Test that gemini can handle a simple prompt correctly.""" - print("=" * 60) - print("TEST 1: Simple arithmetic prompt") - print("=" * 60) - - with tempfile.TemporaryDirectory() as tmpdir: - workdir = Path(tmpdir) / "gemini_test" - workdir.mkdir() - - # Create minimal config - config = RunConfig( - workdir=workdir, - extensions=[], - ) - - coder = GeminiCoder(config=config, workdir=workdir) - - # Test with simple prompt - result = coder.run("What is 2+2? 
Just give the number.") - - print(f"Success: {result.success}") - print(f"Stdout: {result.stdout}") - print(f"Stderr: {result.stderr}") - print(f"Result text: {result.result_text}") - - if not result.success: - print("❌ TEST FAILED: Command did not succeed") - return False - - if "4" not in result.result_text: - print(f"❌ TEST FAILED: Expected '4' in result, got: {result.result_text}") - return False - - print("βœ… TEST PASSED: Simple prompt works") - return True - - -def test_gemini_with_mcp(): - """Test that gemini works with MCP server configured.""" - print("\n" + "=" * 60) - print("TEST 2: Gemini with MCP server (artl)") - print("=" * 60) - - with tempfile.TemporaryDirectory() as tmpdir: - workdir = Path(tmpdir) / "gemini_mcp_test" - workdir.mkdir() - - # Create config with MCP - mcp = MCPConfig( - name="artl", - type=MCPType.STDIO, - command="uvx", - args=["artl-mcp"], - enabled=True, - ) - - config = RunConfig( - workdir=workdir, - extensions=[mcp], - ) - - coder = GeminiCoder(config=config, workdir=workdir) - - # Verify settings.json was created - settings_file = workdir / ".gemini" / "settings.json" - if not settings_file.exists(): - print(f"❌ TEST FAILED: settings.json not created at {settings_file}") - return False - - with open(settings_file) as f: - settings = json.load(f) - - if "mcpServers" not in settings: - print(f"❌ TEST FAILED: mcpServers not in settings.json: {settings}") - return False - - if "artl" not in settings["mcpServers"]: - print(f"❌ TEST FAILED: artl server not configured: {settings}") - return False - - print(f"βœ… MCP configuration verified: {json.dumps(settings, indent=2)}") - - # Test with simple prompt (MCP may not be available in test env, but command should work) - result = coder.run("What is 3+3? Just give the number.") - - print(f"Success: {result.success}") - print(f"Stdout: {result.stdout}") - - if not result.success: - print("❌ TEST FAILED: Command with MCP config did not succeed") - return False - - print("βœ… TEST PASSED: Gemini works with MCP configuration") - return True - - -def test_gemini_special_characters(): - """Test that gemini handles prompts with special characters correctly.""" - print("\n" + "=" * 60) - print("TEST 3: Prompt with special characters") - print("=" * 60) - - with tempfile.TemporaryDirectory() as tmpdir: - workdir = Path(tmpdir) / "gemini_special_test" - workdir.mkdir() - - config = RunConfig( - workdir=workdir, - extensions=[], - ) - - coder = GeminiCoder(config=config, workdir=workdir) - - # Test with special characters that would break shell escaping - test_prompt = 'Say "Hello!" with quotes and $special chars' - result = coder.run(test_prompt) - - print(f"Success: {result.success}") - print(f"Stdout: {result.stdout[:200]}") # First 200 chars - - if not result.success: - print("❌ TEST FAILED: Special characters broke the command") - return False - - print("βœ… TEST PASSED: Special characters handled correctly") - return True - - -def main(): - # Check if gemini is available - if not shutil.which("gemini"): - print( - "⚠️ WARNING: gemini CLI not found. 
Install with: npm install -g @google/gemini-cli" - ) - sys.exit(1) - - print("Testing GeminiCoder fix...\n") - - tests = [ - test_gemini_simple_prompt, - test_gemini_with_mcp, - test_gemini_special_characters, - ] - - results = [] - for test in tests: - try: - results.append(test()) - except Exception as e: - print(f"❌ TEST FAILED with exception: {e}") - import traceback - - traceback.print_exc() - results.append(False) - - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - passed = sum(results) - total = len(results) - print(f"Passed: {passed}/{total}") - - if all(results): - print("\nβœ… ALL TESTS PASSED - Fix is working correctly!") - sys.exit(0) - else: - print("\n❌ SOME TESTS FAILED - Fix needs more work") - sys.exit(1) - - -if __name__ == "__main__": - main() From f46e1ed34eddfa59991635dab754e6a2c7fe99f2 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 2 Dec 2025 08:03:06 -0500 Subject: [PATCH 43/55] Add model selection support to GeminiCoder - Add -m flag support to pass model parameter to gemini CLI - Follows same pattern as other coders (e.g., opencode) - Tested with gemini-2.5-flash model --- src/metacoder/coders/gemini.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index d29bc6c..40e4a69 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -135,7 +135,13 @@ def run(self, input_text: str) -> CoderOutput: # Build the command # The gemini CLI accepts the prompt as a positional argument # We pass it directly as an argument to avoid shell escaping issues - command = ["gemini", text] + command = ["gemini"] + + # Add model parameter if specified + if self.params and self.params.get("model"): + command.extend(["-m", self.params["model"]]) + + command.append(text) logger.info("πŸ’Ž Running command: gemini with prompt") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") From 55093c8d918a7c5a728502d43f1ed56325858096 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Tue, 2 Dec 2025 16:52:51 -0500 Subject: [PATCH 44/55] Fix gemini CLI invocation with MCP servers using -p flag When MCP servers are configured in .gemini/settings.json, the gemini CLI 0.18.0 ignores positional arguments and enters interactive mode instead of processing the prompt. This causes the evaluation to hang indefinitely. The fix changes from using a positional argument: gemini "prompt text" To using the -p flag: gemini -p "prompt text" This workaround allows gemini to correctly process prompts even when MCP servers are configured. Tested with artl-mcp server configured and verified that prompts are now processed correctly instead of hanging. 
--- src/metacoder/coders/gemini.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 40e4a69..aded671 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -133,15 +133,16 @@ def run(self, input_text: str) -> CoderOutput: text = self.expand_prompt(input_text) # Build the command - # The gemini CLI accepts the prompt as a positional argument - # We pass it directly as an argument to avoid shell escaping issues + # Use -p flag instead of positional argument to work around bug with MCP servers + # See: https://github.com/google-gemini/gemini-cli/issues/XXX command = ["gemini"] # Add model parameter if specified if self.params and self.params.get("model"): command.extend(["-m", self.params["model"]]) - command.append(text) + # Use -p flag for prompt (works with MCP servers, positional doesn't) + command.extend(["-p", text]) logger.info("πŸ’Ž Running command: gemini with prompt") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") From 7140ed2e0b644b59bd049a9a30721344d6176f84 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 3 Dec 2025 11:03:49 -0500 Subject: [PATCH 45/55] Add --include-directories flag for MCP workspace access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini CLI restricts MCP tool file access to workspace directories. Without --include-directories, MCP tools like read_file fail with: "File path must be within one of the workspace directories" This causes gemini to ask for clarification instead of completing tasks in headless mode with -p flag. Solution: Pass --include-directories pointing to the workdir, allowing MCP tools to access files in the evaluation workspace. 
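With this change the assembled command looks roughly like the following
(model name and path are illustrative):

    gemini -m gemini-2.5-flash --include-directories /abs/path/to/workdir -p "<prompt>"
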
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/metacoder/coders/gemini.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index aded671..af4a78a 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -141,6 +141,10 @@ def run(self, input_text: str) -> CoderOutput: if self.params and self.params.get("model"): command.extend(["-m", self.params["model"]]) + # Add workspace directory so MCP tools can access files + # Without this, gemini will error with "File path must be within workspace directories" + command.extend(["--include-directories", str(self.workdir)]) + # Use -p flag for prompt (works with MCP servers, positional doesn't) command.extend(["-p", text]) From 873b7fce9211bd90e5411915f7a683b64b8edfce Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 3 Dec 2025 13:49:41 -0500 Subject: [PATCH 46/55] Fix --include-directories to use absolute path - resolve() needed --- src/metacoder/coders/gemini.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index af4a78a..0e836eb 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -143,7 +143,7 @@ def run(self, input_text: str) -> CoderOutput: # Add workspace directory so MCP tools can access files # Without this, gemini will error with "File path must be within workspace directories" - command.extend(["--include-directories", str(self.workdir)]) + command.extend(["--include-directories", str(Path(self.workdir).resolve())]) # Use -p flag for prompt (works with MCP servers, positional doesn't) command.extend(["-p", text]) From 74cf1212fa44b072fb9fc273a8e791abc1698e99 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Wed, 3 Dec 2025 14:14:43 -0500 Subject: [PATCH 47/55] Fix path duplication - use Path.cwd() instead of Path(workdir).resolve() --- src/metacoder/coders/gemini.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 0e836eb..7bb2307 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -143,7 +143,8 @@ def run(self, input_text: str) -> CoderOutput: # Add workspace directory so MCP tools can access files # Without this, gemini will error with "File path must be within workspace directories" - command.extend(["--include-directories", str(Path(self.workdir).resolve())]) + # Use Path.cwd() since we're already inside the workdir from change_directory() + command.extend(["--include-directories", str(Path.cwd())]) # Use -p flag for prompt (works with MCP servers, positional doesn't) command.extend(["-p", text]) From 67b6ab0fd38fe11f1384b63c9bc39fdc1c6ba80b Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 4 Dec 2025 08:30:48 -0500 Subject: [PATCH 48/55] Use stdin=DEVNULL instead of PIPE+close() to prevent stdin issues --- src/metacoder/coders/base_coder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index 5d16645..608ff45 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -179,7 +179,7 @@ def run_process( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - stdin=subprocess.PIPE, # Provide stdin to prevent hanging on processes that read it + stdin=subprocess.DEVNULL, # Use DEVNULL instead of PIPE to prevent interactive 
prompts text=True, encoding="utf-8", errors="replace", # avoid crashes on the occasional bad byte @@ -187,9 +187,6 @@ def run_process( bufsize=1, universal_newlines=True, ) - # Close stdin immediately since we don't write to it - # This prevents subprocesses from hanging while waiting for input - process.stdin.close() stdout_lines: list[str] = [] stderr_lines: list[str] = [] From 28d16fa8f4cfd844373552a265249c01ff2d5d4d Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 4 Dec 2025 10:28:59 -0500 Subject: [PATCH 49/55] Fix GeminiCoder hanging by adding --output-format text flag The gemini CLI was running in interactive mode by default, waiting for additional input after processing the prompt. Adding --output-format text enables non-interactive/headless mode, allowing gemini to exit after completing the prompt. This fixes the evaluation hanging issue where gemini would output 'Okay, my setup is complete. I'm ready for your first command.' and wait indefinitely for stdin input. --- src/metacoder/coders/gemini.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 7bb2307..4f4d225 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -149,6 +149,9 @@ def run(self, input_text: str) -> CoderOutput: # Use -p flag for prompt (works with MCP servers, positional doesn't) command.extend(["-p", text]) + # Use text output format to prevent interactive mode (non-interactive/headless mode) + command.extend(["--output-format", "text"]) + logger.info("πŸ’Ž Running command: gemini with prompt") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") start_time = time.time() From fbcff9d7772cccc821cad44c2f7fadccf5f35b34 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Thu, 4 Dec 2025 17:24:35 -0500 Subject: [PATCH 50/55] Fix gemini CLI hanging by using positional arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The gemini CLI was hanging indefinitely because it was entering interactive mode when invoked with the -p flag. This fix switches to using positional arguments instead, which causes gemini to exit after processing the prompt. Changes: - Modified gemini.py to pass prompt as positional argument - Removed -p flag usage - Updated log message to reflect new approach πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/metacoder/coders/gemini.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 4f4d225..fd8dc92 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -124,7 +124,7 @@ def default_config_objects(self) -> list[CoderConfigObject]: def run(self, input_text: str) -> CoderOutput: """ - Run gemini with the given input text. + Run gemini with the given input text as a positional argument. 
""" env = self.expand_env(self.env) self.prepare_workdir() @@ -132,9 +132,7 @@ def run(self, input_text: str) -> CoderOutput: with change_directory(self.workdir): text = self.expand_prompt(input_text) - # Build the command - # Use -p flag instead of positional argument to work around bug with MCP servers - # See: https://github.com/google-gemini/gemini-cli/issues/XXX + # Build the command - use positional argument for prompt command = ["gemini"] # Add model parameter if specified @@ -142,17 +140,12 @@ def run(self, input_text: str) -> CoderOutput: command.extend(["-m", self.params["model"]]) # Add workspace directory so MCP tools can access files - # Without this, gemini will error with "File path must be within workspace directories" - # Use Path.cwd() since we're already inside the workdir from change_directory() command.extend(["--include-directories", str(Path.cwd())]) - # Use -p flag for prompt (works with MCP servers, positional doesn't) - command.extend(["-p", text]) + # Add prompt as positional argument (simpler and works in non-interactive mode) + command.append(text) - # Use text output format to prevent interactive mode (non-interactive/headless mode) - command.extend(["--output-format", "text"]) - - logger.info("πŸ’Ž Running command: gemini with prompt") + logger.info("πŸ’Ž Running command: gemini with positional argument") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") start_time = time.time() From d12acec2c6a593941bd98757c990ed596b8b6b5d Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 5 Dec 2025 15:18:38 -0500 Subject: [PATCH 51/55] Revert positional argument change - use -p flag with --output-format text --- src/metacoder/coders/gemini.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index fd8dc92..4f4d225 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -124,7 +124,7 @@ def default_config_objects(self) -> list[CoderConfigObject]: def run(self, input_text: str) -> CoderOutput: """ - Run gemini with the given input text as a positional argument. + Run gemini with the given input text. 
""" env = self.expand_env(self.env) self.prepare_workdir() @@ -132,7 +132,9 @@ def run(self, input_text: str) -> CoderOutput: with change_directory(self.workdir): text = self.expand_prompt(input_text) - # Build the command - use positional argument for prompt + # Build the command + # Use -p flag instead of positional argument to work around bug with MCP servers + # See: https://github.com/google-gemini/gemini-cli/issues/XXX command = ["gemini"] # Add model parameter if specified @@ -140,12 +142,17 @@ def run(self, input_text: str) -> CoderOutput: command.extend(["-m", self.params["model"]]) # Add workspace directory so MCP tools can access files + # Without this, gemini will error with "File path must be within workspace directories" + # Use Path.cwd() since we're already inside the workdir from change_directory() command.extend(["--include-directories", str(Path.cwd())]) - # Add prompt as positional argument (simpler and works in non-interactive mode) - command.append(text) + # Use -p flag for prompt (works with MCP servers, positional doesn't) + command.extend(["-p", text]) - logger.info("πŸ’Ž Running command: gemini with positional argument") + # Use text output format to prevent interactive mode (non-interactive/headless mode) + command.extend(["--output-format", "text"]) + + logger.info("πŸ’Ž Running command: gemini with prompt") logger.debug(f"πŸ’Ž Full command: {' '.join(command)}") start_time = time.time() From 369b9724f76ea4f75b979ddd2b36b899cacc040f Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 5 Dec 2025 16:03:55 -0500 Subject: [PATCH 52/55] Fix CodexCoder to use codex CLI instead of claude MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CodexCoder class was incorrectly using the claude CLI command instead of the codex CLI. This fixes the command to properly invoke codex with the correct arguments: - codex exec --json --dangerously-bypass-approvals-and-sandbox πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/metacoder/coders/codex.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 3451ebe..7442b0e 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -88,16 +88,16 @@ def default_config_objects(self) -> list[CoderConfigObject]: def run(self, input_text: str) -> CoderOutput: """ - Run claude code with the given input text. + Run codex with the given input text. """ env = self.expand_env(self.env) # important - ensure that only local config files are used - # we assue chdir has been called beforehand + # we assume chdir has been called beforehand env["HOME"] = "." 
text = self.expand_prompt(input_text) - command = ["claude", "-p", "--verbose", "--output-format", "stream-json", text] + command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text] - print(f"πŸ€– Running command: {' '.join(command)}") + print(f"πŸ“ Running command: {' '.join(command)}") # time the command start_time = time.time() ao = self.run_process(command, env) @@ -115,9 +115,9 @@ def run(self, input_text: str) -> CoderOutput: if "result" in message: ao.result_text = message["result"] end_time = time.time() - print(f"πŸ€– Command took {end_time - start_time:.2f} seconds") + print(f"πŸ“ Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: - raise ValueError(f"Claude failed with error: {ao.stderr} // {ao}") + raise ValueError(f"Codex failed with error: {ao.stderr} // {ao}") return ao From 794b32fded4c97024381afc7f699900a10a397f6 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 5 Dec 2025 16:13:45 -0500 Subject: [PATCH 53/55] Fix codex.py: add prepare_workdir and change_directory for proper MCP config setup --- src/metacoder/coders/codex.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 7442b0e..2e4cdfc 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -9,6 +9,7 @@ CoderConfigObject, CoderOutput, FileType, + change_directory, ) @@ -91,16 +92,18 @@ def run(self, input_text: str) -> CoderOutput: Run codex with the given input text. """ env = self.expand_env(self.env) - # important - ensure that only local config files are used - # we assume chdir has been called beforehand - env["HOME"] = "." - text = self.expand_prompt(input_text) - command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text] + self.prepare_workdir() - print(f"πŸ“ Running command: {' '.join(command)}") - # time the command - start_time = time.time() - ao = self.run_process(command, env) + with change_directory(self.workdir): + # important - ensure that only local config files are used + env["HOME"] = "." 
+ text = self.expand_prompt(input_text) + command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text] + + print(f"πŸ“ Running command: {' '.join(command)}") + # time the command + start_time = time.time() + ao = self.run_process(command, env) # parse the jsonl output ao.structured_messages = [ json.loads(line) for line in ao.stdout.split("\n") if line From 1ac0504e5dff9bab03ee7f70fd6746b33e8eb5f8 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 5 Dec 2025 16:25:48 -0500 Subject: [PATCH 54/55] Add MCP support to CodexCoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add supports_mcp() classmethod returning True - Add default_config_paths() for AGENTS.md and .codex/config.toml - Add mcp_config_to_codex_format() to convert MCPConfig - Add _generate_toml_config() to create TOML config string - Update default_config_objects() to create .codex/config.toml with MCP servers - Update docstring with proper Codex documentation πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/metacoder/coders/codex.py | 154 ++++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 55 deletions(-) diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 2e4cdfc..cc4a31f 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -3,6 +3,7 @@ import time import logging import shutil +from typing import Any from metacoder.coders.base_coder import ( BaseCoder, @@ -11,6 +12,7 @@ FileType, change_directory, ) +from metacoder.configuration import ConfigFileRole, MCPConfig, MCPType logger = logging.getLogger(__name__) @@ -18,8 +20,29 @@ class CodexCoder(BaseCoder): """ - For AWS bedrock, you may need to copy ~/.aws/ + OpenAI Codex CLI integration. + Codex-specific configuration: + + You can provide the following files in your configuration directory: + + - `AGENTS.md` - Primary instructions for the assistant + - `.codex/config.toml` - Configuration including MCP servers + + MCP Support: + + Codex CLI supports MCP (Model Context Protocol) servers through the + mcp_servers configuration in .codex/config.toml. When MCPs are configured + through Metacoder, they will be automatically added to the config file. + + The Codex CLI expects MCP servers to be configured in TOML format: + + [mcp_servers.server_name] + command = "uvx" + args = ["mcp-server-name"] + env = { "API_KEY" = "value" } + + Note: Requires codex CLI to be installed. 
""" @classmethod @@ -27,65 +50,86 @@ def is_available(cls) -> bool: """Check if codex command is available.""" return shutil.which("codex") is not None + @classmethod + def supports_mcp(cls) -> bool: + """CodexCoder supports MCP extensions.""" + return True + + @classmethod + def default_config_paths(cls) -> dict[Path, ConfigFileRole]: + return { + Path("AGENTS.md"): ConfigFileRole.PRIMARY_INSTRUCTION, + Path(".codex/config.toml"): ConfigFileRole.CONFIG, + } + @property def instructions_path(self) -> Path: return Path("AGENTS.md") - def default_config_objects(self) -> list[CoderConfigObject]: - """ - extensions: - developer: - bundled: true - display_name: Developer - enabled: true - name: developer - timeout: 300 - type: builtin - pdfreader: - args: - - mcp-read-pdf - bundled: null - cmd: uvx - description: Read large and complex PDF documents - enabled: true - env_keys: [] - envs: {} - name: pdfreader - timeout: 300 - type: stdio - """ - return [ - CoderConfigObject( - file_type=FileType.YAML, - relative_path=".config/goose/config.yaml", - content={ - "GOOSE_MODEL": "gpt-4o", - "GOOSE_PROVIDER": "openai", - "extensions": { - "developer": { - "bundled": True, - "display_name": "Developer", - "enabled": True, - "name": "developer", - "timeout": 300, - "type": "builtin", - }, - "pdfreader": { - "args": ["mcp-read-pdf"], - "bundled": None, - "cmd": "uvx", - "description": "Read large and complex PDF documents", - "enabled": True, - "env_keys": [], - "envs": {}, - "name": "pdfreader", - "timeout": 300, - "type": "stdio", - }, - }, - }, + def mcp_config_to_codex_format(self, mcp: MCPConfig) -> dict[str, Any]: + """Convert MCPConfig to Codex's MCP server format.""" + server_config: dict[str, Any] = {} + + # For stdio type MCPs + if mcp.type == MCPType.STDIO and mcp.command: + server_config["command"] = mcp.command + if mcp.args: + server_config["args"] = mcp.args + if mcp.env: + server_config["env"] = mcp.env + + # For HTTP type MCPs + elif mcp.type == MCPType.HTTP: + raise NotImplementedError( + "HTTP MCPs are not supported for Codex wrapper yet" ) - ] + + return server_config + + def _generate_toml_config(self, mcp_servers: dict[str, dict[str, Any]]) -> str: + """Generate TOML configuration string for Codex config.toml.""" + lines = [] + + for server_name, server_config in mcp_servers.items(): + lines.append(f"[mcp_servers.{server_name}]") + for key, value in server_config.items(): + if key == "command": + lines.append(f'command = "{value}"') + elif key == "args": + args_str = ", ".join(f'"{arg}"' for arg in value) + lines.append(f"args = [{args_str}]") + elif key == "env": + env_parts = [] + for env_key, env_val in value.items(): + env_parts.append(f'"{env_key}" = "{env_val}"') + env_str = ", ".join(env_parts) + lines.append(f"env = {{ {env_str} }}") + lines.append("") + + return "\n".join(lines) + + def default_config_objects(self) -> list[CoderConfigObject]: + """Generate config objects including MCP configuration.""" + config_objects = [] + + # Create .codex/config.toml if we have MCP extensions + if self.config and self.config.extensions: + mcp_servers = {} + for mcp in self.config.extensions: + if mcp.enabled: + mcp_servers[mcp.name] = self.mcp_config_to_codex_format(mcp) + + if mcp_servers: + toml_content = self._generate_toml_config(mcp_servers) + config_objects.append( + CoderConfigObject( + file_type=FileType.TEXT, + relative_path=".codex/config.toml", + content=toml_content, + ) + ) + + return config_objects def run(self, input_text: str) -> CoderOutput: """ From 
8d45b4d7c3f3d6eb752fff016575e5757b519759 Mon Sep 17 00:00:00 2001 From: Justin Reese Date: Fri, 5 Dec 2025 17:21:32 -0500 Subject: [PATCH 55/55] Fix CodexCoder 401 Unauthorized error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove HOME=. override that was breaking authentication. Codex CLI reads .codex/config.toml from the current directory automatically, so overriding HOME is unnecessary and prevents codex from accessing OpenAI API credentials. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/metacoder/coders/codex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index cc4a31f..ad45c0c 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -139,8 +139,8 @@ def run(self, input_text: str) -> CoderOutput: self.prepare_workdir() with change_directory(self.workdir): - # important - ensure that only local config files are used - env["HOME"] = "." + # Codex reads .codex/config.toml from current directory automatically. + # Do NOT set HOME=. as this breaks authentication (401 Unauthorized). text = self.expand_prompt(input_text) command = ["codex", "exec", "--json", "--dangerously-bypass-approvals-and-sandbox", text]