huggingface · evalstate · Jan 26, 2026 · Jan 26, 2026
diff --git a/README.md b/README.md
@@ -189,8 +189,8 @@ sonnet
 
 ```json
 [
-  {"input": "Write a commit for adding login", "expected": {"contains": "feat"}},
-  {"input": "Fix the null pointer bug", "expected": {"contains": "fix"}}
+  {"input": "Write a commit for adding login", "expected": {"contains": ["feat", "login"]}},
+  {"input": "Fix the null pointer bug", "expected": {"contains": ["fix", "bug"]}}
 ]
 ```
 

diff --git a/src/upskill/agent_cards/test_gen.md b/src/upskill/agent_cards/test_gen.md
@@ -1,6 +1,7 @@
 ---
 type: agent
 # note that this takes precedence over cli switch. you can set model string directly.
+#model: opus?structured=tool_use
 model: opus?reasoning=1024
 description: Generate test cases for evaluating skills.
 ---

diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py
@@ -20,6 +20,7 @@
 from upskill.models import (
     ConversationStats,
     EvalResults,
+    ExpectedSpec,
     Skill,
     TestCase,
     TestResult,
@@ -60,7 +61,7 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge
 
 def check_expected(
     output: str,
-    expected: dict | None,
+    expected: ExpectedSpec,
     workspace: Path | None = None,
     test_case: TestCase | None = None,
 ) -> tuple[bool, ValidationResult | None]:
@@ -87,13 +88,10 @@ def check_expected(
             )
             return result.passed, result
 
-    # Legacy: simple contains check
-    if not expected:
-        return True, None
-
-    if "contains" in expected:
-        if expected["contains"].lower() not in output.lower():
-            return False, None
+    required = expected.contains
+    output_lower = output.lower()
+    if any(item.lower() not in output_lower for item in required):
+        return False, None
 
     return True, None
 
@@ -108,8 +106,8 @@ async def _run_test_with_evaluator(
 ) -> TestResult:
     """Run a single test case using a provided evaluator agent."""
     user_content = test_case.input
-    if test_case.context and "files" in test_case.context:
-        for filename, content in test_case.context["files"].items():
+    if test_case.context and test_case.context.files:
+        for filename, content in test_case.context.files.items():
             user_content += f"\n\n```{filename}\n{content}\n```"
 
     # Determine if we need workspace isolation
@@ -143,11 +141,17 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult:
             # Check expected with custom validator support
             if workspace and test_case.validator:
                 success, validation_result = check_expected(
-                    output or "", test_case.expected, workspace, test_case
+                    output or "",
+                    test_case.expected,
+                    workspace,
+                    test_case,
                 )
             else:
                 success, validation_result = check_expected(
-                    output or "", test_case.expected
+                    output or "",
+                    test_case.expected,
+                    None,
+                    test_case,
                 )
 
             return TestResult(

diff --git a/src/upskill/generate.py b/src/upskill/generate.py
@@ -22,19 +22,19 @@
   "cases": [
     {
       "input": "Write a commit message for adding a new login feature",
-      "expected": {"contains": "feat"}
+      "expected": {"contains": ["feat", "login"]}
     },
     {
       "input": "Write a commit message for fixing a null pointer bug in the user service",
-      "expected": {"contains": "fix"}
+      "expected": {"contains": ["fix", "bug"]}
     },
     {
       "input": "Write a commit message for updating the README documentation",
-      "expected": {"contains": "docs"}
+      "expected": {"contains": ["docs", "readme"]}
     },
     {
       "input": "Write a commit message for a breaking API change",
-      "expected": {"contains": "BREAKING"}
+      "expected": {"contains": ["BREAKING", "api"]}
     }
   ]
 }
@@ -48,15 +48,15 @@
   "cases": [
     {
       "input": "Write code to fetch data from an API with retry logic",
-      "expected": {"contains": "retry"}
+      "expected": {"contains": ["retry", "error"]}
     },
     {
       "input": "How should I handle a 500 error from an API?",
-      "expected": {"contains": "backoff"}
+      "expected": {"contains": ["backoff", "500"]}
     },
     {
       "input": "Write error handling for a requests.get call",
-      "expected": {"contains": "except"}
+      "expected": {"contains": ["except", "requests"]}
     }
   ]
 }
@@ -72,13 +72,8 @@
     "## Your Task\n\n"
     f"Task: {TASK_PLACEHOLDER}\n\n"
     "Generate test cases that verify the agent can apply the skill correctly.\n\n"
-    "Output ONLY a valid JSON object (no markdown code blocks):\n"
-    "{\n"
-    '  "cases": [\n'
-    '    {"input": "prompt/question for the agent",\n'
-    '     "expected": {"contains": "substring that should appear in good response"}}\n'
-    "  ]\n"
-    "}\n\n"
+
+    "Each TestCase MUST include at least a list of expected strings in the expected field.\n"
     "Focus on practical scenarios that test understanding of the core concepts."
 )
 
@@ -113,9 +108,6 @@ async def generate_skill(
     model: str | None = None,
 ) -> Skill:
     """Generate a skill from a task description using FastAgent."""
-    # config = config or Config.load()
-    # model = model or config.model
-    # config_path = config.effective_fastagent_config
 
     prompt = f"Create a skill document that teaches an AI agent how to: {task}"
     if examples:
@@ -142,19 +134,31 @@ async def generate_tests(
     model: str | None = None,
 ) -> list[TestCase]:
     """Generate synthetic test cases from a task description using FastAgent."""
-    # config = config or Config.load()
-    # model = model or config.model
-    # config_path = config.effective_fastagent_config
 
     prompt = TEST_GENERATION_PROMPT.replace(TASK_PLACEHOLDER, task)
 
-
     result, _ = await generator.structured(prompt, TestCaseSuite)
-
     if result is None:
         raise ValueError("Test generator did not return structured test cases.")
 
-    return result.cases
+    cases = result.cases
+    invalid_expected = 0
+    for tc in cases:
+        expected_values = [value.strip() for value in tc.expected.contains if value.strip()]
+        if len(expected_values) < 2:
+            invalid_expected += 1
+
+    print(
+        "Generated test cases:",
+        f"total={len(cases)}",
+        f"invalid_expected={invalid_expected}",
+    )
+    if invalid_expected:
+        print(
+            "Warning: some test cases are missing at least two expected strings; "
+            "review generated tests."
+        )
+    return cases
 
 
 async def refine_skill(
@@ -259,4 +263,3 @@ async def improve_skill(
         source_task=f"Improved from {skill.name}: {instructions}",
         base_skill=skill,
     )
-
diff --git a/src/upskill/models.py b/src/upskill/models.py
@@ -7,7 +7,7 @@
 from datetime import datetime
 from pathlib import Path
 
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 
 class SkillMetadata(BaseModel):
@@ -33,24 +33,51 @@ class ValidationResult(BaseModel):
     error_message: str | None = None
 
 
+class ExpectedSpec(BaseModel):
+    """Expected output checks for a test case."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    contains: list[str]
+
+    @field_validator("contains", mode="before")
+    @classmethod
+    def coerce_contains(cls, value: str | list[str]) -> list[str]:
+        if isinstance(value, str):
+            return [value]
+        return value
+
+
+class TestCaseContext(BaseModel):
+    """Context payloads provided to the evaluator."""
+
+    model_config = ConfigDict(extra="forbid")
+
+    files: dict[str, str] | None = None
+
+
 class TestCase(BaseModel):
     """A test case for skill evaluation."""
 
+    model_config = ConfigDict(extra="forbid")
+
     input: str  # Task/prompt to give the agent
-    context: dict | None = None  # Files, env vars, etc.
-    expected: dict | None = None  # Expected output checks
+    context: TestCaseContext | None = None  # Files, env vars, etc.
+    expected: ExpectedSpec  # Expected output checks
 
     # Custom validator support
     output_file: str | None = None  # File to validate instead of agent output
     validator: str | None = None  # Validator name (e.g., "hf_eval_yaml")
-    validator_config: dict | None = None  # Config passed to validator
+    validator_config: dict[str, str | int | float | bool] | None = None
 
 
 
 
 class TestCaseSuite(BaseModel):
     """Structured container for a list of test cases."""
 
+    model_config = ConfigDict(extra="forbid")
+
     cases: list[TestCase] = Field(default_factory=list)