diff --git a/README.md b/README.md index c08a612..7066642 100644 --- a/README.md +++ b/README.md @@ -189,8 +189,8 @@ sonnet ```json [ - {"input": "Write a commit for adding login", "expected": {"contains": "feat"}}, - {"input": "Fix the null pointer bug", "expected": {"contains": "fix"}} + {"input": "Write a commit for adding login", "expected": {"contains": ["feat", "login"]}}, + {"input": "Fix the null pointer bug", "expected": {"contains": ["fix", "bug"]}} ] ``` diff --git a/src/upskill/agent_cards/test_gen.md b/src/upskill/agent_cards/test_gen.md index 3df6b34..0aba7f1 100644 --- a/src/upskill/agent_cards/test_gen.md +++ b/src/upskill/agent_cards/test_gen.md @@ -1,6 +1,7 @@ --- type: agent # note that this takes precedence over cli switch. you can set model string directly. +#model: opus?structured=tool_use model: opus?reasoning=1024 description: Generate test cases for evaluating skills. --- diff --git a/src/upskill/evaluate.py b/src/upskill/evaluate.py index 9b92652..08b4300 100644 --- a/src/upskill/evaluate.py +++ b/src/upskill/evaluate.py @@ -20,6 +20,7 @@ from upskill.models import ( ConversationStats, EvalResults, + ExpectedSpec, Skill, TestCase, TestResult, @@ -60,7 +61,7 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge def check_expected( output: str, - expected: dict | None, + expected: ExpectedSpec, workspace: Path | None = None, test_case: TestCase | None = None, ) -> tuple[bool, ValidationResult | None]: @@ -87,13 +88,10 @@ def check_expected( ) return result.passed, result - # Legacy: simple contains check - if not expected: - return True, None - - if "contains" in expected: - if expected["contains"].lower() not in output.lower(): - return False, None + required = expected.contains + output_lower = output.lower() + if any(item.lower() not in output_lower for item in required): + return False, None return True, None @@ -108,8 +106,8 @@ async def _run_test_with_evaluator( ) -> TestResult: """Run a single test case 
using a provided evaluator agent.""" user_content = test_case.input - if test_case.context and "files" in test_case.context: - for filename, content in test_case.context["files"].items(): + if test_case.context and test_case.context.files: + for filename, content in test_case.context.files.items(): user_content += f"\n\n```{filename}\n{content}\n```" # Determine if we need workspace isolation @@ -143,11 +141,17 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult: # Check expected with custom validator support if workspace and test_case.validator: success, validation_result = check_expected( - output or "", test_case.expected, workspace, test_case + output or "", + test_case.expected, + workspace, + test_case, ) else: success, validation_result = check_expected( - output or "", test_case.expected + output or "", + test_case.expected, + None, + test_case, ) return TestResult( diff --git a/src/upskill/generate.py b/src/upskill/generate.py index 978fe96..4b6e054 100644 --- a/src/upskill/generate.py +++ b/src/upskill/generate.py @@ -22,19 +22,19 @@ "cases": [ { "input": "Write a commit message for adding a new login feature", - "expected": {"contains": "feat"} + "expected": {"contains": ["feat", "login"]} }, { "input": "Write a commit message for fixing a null pointer bug in the user service", - "expected": {"contains": "fix"} + "expected": {"contains": ["fix", "bug"]} }, { "input": "Write a commit message for updating the README documentation", - "expected": {"contains": "docs"} + "expected": {"contains": ["docs", "readme"]} }, { "input": "Write a commit message for a breaking API change", - "expected": {"contains": "BREAKING"} + "expected": {"contains": ["BREAKING", "api"]} } ] } @@ -48,15 +48,15 @@ "cases": [ { "input": "Write code to fetch data from an API with retry logic", - "expected": {"contains": "retry"} + "expected": {"contains": ["retry", "error"]} }, { "input": "How should I handle a 500 error from an API?", - "expected": {"contains": 
"backoff"} + "expected": {"contains": ["backoff", "500"]} }, { "input": "Write error handling for a requests.get call", - "expected": {"contains": "except"} + "expected": {"contains": ["except", "requests"]} } ] } @@ -72,13 +72,8 @@ "## Your Task\n\n" f"Task: {TASK_PLACEHOLDER}\n\n" "Generate test cases that verify the agent can apply the skill correctly.\n\n" - "Output ONLY a valid JSON object (no markdown code blocks):\n" - "{\n" - ' "cases": [\n' - ' {"input": "prompt/question for the agent",\n' - ' "expected": {"contains": "substring that should appear in good response"}}\n' - " ]\n" - "}\n\n" + + "Each TestCase MUST include at least a list of expected strings in the expected field.\n" "Focus on practical scenarios that test understanding of the core concepts." ) @@ -113,9 +108,6 @@ async def generate_skill( model: str | None = None, ) -> Skill: """Generate a skill from a task description using FastAgent.""" - # config = config or Config.load() - # model = model or config.model - # config_path = config.effective_fastagent_config prompt = f"Create a skill document that teaches an AI agent how to: {task}" if examples: @@ -142,19 +134,31 @@ async def generate_tests( model: str | None = None, ) -> list[TestCase]: """Generate synthetic test cases from a task description using FastAgent.""" - # config = config or Config.load() - # model = model or config.model - # config_path = config.effective_fastagent_config prompt = TEST_GENERATION_PROMPT.replace(TASK_PLACEHOLDER, task) - result, _ = await generator.structured(prompt, TestCaseSuite) - if result is None: raise ValueError("Test generator did not return structured test cases.") - return result.cases + cases = result.cases + invalid_expected = 0 + for tc in cases: + expected_values = [value.strip() for value in tc.expected.contains if value.strip()] + if len(expected_values) < 2: + invalid_expected += 1 + + print( + "Generated test cases:", + f"total={len(cases)}", + f"invalid_expected={invalid_expected}", + ) + if 
invalid_expected: + print( + "Warning: some test cases are missing at least two expected strings; " + "review generated tests." + ) + return cases async def refine_skill( @@ -259,4 +263,3 @@ async def improve_skill( source_task=f"Improved from {skill.name}: {instructions}", base_skill=skill, ) - diff --git a/src/upskill/models.py b/src/upskill/models.py index 6f2fb0d..cf0a6ea 100644 --- a/src/upskill/models.py +++ b/src/upskill/models.py @@ -7,7 +7,7 @@ from datetime import datetime from pathlib import Path -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator class SkillMetadata(BaseModel): @@ -33,17 +33,42 @@ class ValidationResult(BaseModel): error_message: str | None = None +class ExpectedSpec(BaseModel): + """Expected output checks for a test case.""" + + model_config = ConfigDict(extra="forbid") + + contains: list[str] + + @field_validator("contains", mode="before") + @classmethod + def coerce_contains(cls, value: str | list[str]) -> list[str]: + if isinstance(value, str): + return [value] + return value + + +class TestCaseContext(BaseModel): + """Context payloads provided to the evaluator.""" + + model_config = ConfigDict(extra="forbid") + + files: dict[str, str] | None = None + + class TestCase(BaseModel): """A test case for skill evaluation.""" + model_config = ConfigDict(extra="forbid") + input: str # Task/prompt to give the agent - context: dict | None = None # Files, env vars, etc. - expected: dict | None = None # Expected output checks + context: TestCaseContext | None = None # Files, env vars, etc. 
+ expected: ExpectedSpec # Expected output checks # Custom validator support output_file: str | None = None # File to validate instead of agent output validator: str | None = None # Validator name (e.g., "hf_eval_yaml") - validator_config: dict | None = None # Config passed to validator + validator_config: dict[str, str | int | float | bool] | None = None @@ -51,6 +76,8 @@ class TestCase(BaseModel): class TestCaseSuite(BaseModel): """Structured container for a list of test cases.""" + model_config = ConfigDict(extra="forbid") + cases: list[TestCase] = Field(default_factory=list)