Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ sonnet

```json
[
{"input": "Write a commit for adding login", "expected": {"contains": "feat"}},
{"input": "Fix the null pointer bug", "expected": {"contains": "fix"}}
{"input": "Write a commit for adding login", "expected": {"contains": ["feat", "login"]}},
{"input": "Fix the null pointer bug", "expected": {"contains": ["fix", "bug"]}}
]
```

Expand Down
1 change: 1 addition & 0 deletions src/upskill/agent_cards/test_gen.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
---
type: agent
# Note: this setting takes precedence over the CLI switch; you can set the model string directly here.
#model: opus?structured=tool_use
model: opus?reasoning=1024
description: Generate test cases for evaluating skills.
---
Expand Down
28 changes: 16 additions & 12 deletions src/upskill/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from upskill.models import (
ConversationStats,
EvalResults,
ExpectedSpec,
Skill,
TestCase,
TestResult,
Expand Down Expand Up @@ -60,7 +61,7 @@ def isolated_workspace(base_dir: Path | None = None, cleanup: bool = True) -> Ge

def check_expected(
output: str,
expected: dict | None,
expected: ExpectedSpec,
workspace: Path | None = None,
test_case: TestCase | None = None,
) -> tuple[bool, ValidationResult | None]:
Expand All @@ -87,13 +88,10 @@ def check_expected(
)
return result.passed, result

# Legacy: simple contains check
if not expected:
return True, None

if "contains" in expected:
if expected["contains"].lower() not in output.lower():
return False, None
required = expected.contains
output_lower = output.lower()
if any(item.lower() not in output_lower for item in required):
return False, None

return True, None

Expand All @@ -108,8 +106,8 @@ async def _run_test_with_evaluator(
) -> TestResult:
"""Run a single test case using a provided evaluator agent."""
user_content = test_case.input
if test_case.context and "files" in test_case.context:
for filename, content in test_case.context["files"].items():
if test_case.context and test_case.context.files:
for filename, content in test_case.context.files.items():
user_content += f"\n\n```{filename}\n{content}\n```"

# Determine if we need workspace isolation
Expand Down Expand Up @@ -143,11 +141,17 @@ async def _run_in_workspace(workspace: Path | None) -> TestResult:
# Check expected with custom validator support
if workspace and test_case.validator:
success, validation_result = check_expected(
output or "", test_case.expected, workspace, test_case
output or "",
test_case.expected,
workspace,
test_case,
)
else:
success, validation_result = check_expected(
output or "", test_case.expected
output or "",
test_case.expected,
None,
test_case,
)

return TestResult(
Expand Down
51 changes: 27 additions & 24 deletions src/upskill/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@
"cases": [
{
"input": "Write a commit message for adding a new login feature",
"expected": {"contains": "feat"}
"expected": {"contains": ["feat", "login"]}
},
{
"input": "Write a commit message for fixing a null pointer bug in the user service",
"expected": {"contains": "fix"}
"expected": {"contains": ["fix", "bug"]}
},
{
"input": "Write a commit message for updating the README documentation",
"expected": {"contains": "docs"}
"expected": {"contains": ["docs", "readme"]}
},
{
"input": "Write a commit message for a breaking API change",
"expected": {"contains": "BREAKING"}
"expected": {"contains": ["BREAKING", "api"]}
}
]
}
Expand All @@ -48,15 +48,15 @@
"cases": [
{
"input": "Write code to fetch data from an API with retry logic",
"expected": {"contains": "retry"}
"expected": {"contains": ["retry", "error"]}
},
{
"input": "How should I handle a 500 error from an API?",
"expected": {"contains": "backoff"}
"expected": {"contains": ["backoff", "500"]}
},
{
"input": "Write error handling for a requests.get call",
"expected": {"contains": "except"}
"expected": {"contains": ["except", "requests"]}
}
]
}
Expand All @@ -72,13 +72,8 @@
"## Your Task\n\n"
f"Task: {TASK_PLACEHOLDER}\n\n"
"Generate test cases that verify the agent can apply the skill correctly.\n\n"
"Output ONLY a valid JSON object (no markdown code blocks):\n"
"{\n"
' "cases": [\n'
' {"input": "prompt/question for the agent",\n'
' "expected": {"contains": "substring that should appear in good response"}}\n'
" ]\n"
"}\n\n"

"Each TestCase MUST include at least a list of expected strings in the expected field.\n"
"Focus on practical scenarios that test understanding of the core concepts."
)

Expand Down Expand Up @@ -113,9 +108,6 @@ async def generate_skill(
model: str | None = None,
) -> Skill:
"""Generate a skill from a task description using FastAgent."""
# config = config or Config.load()
# model = model or config.model
# config_path = config.effective_fastagent_config

prompt = f"Create a skill document that teaches an AI agent how to: {task}"
if examples:
Expand All @@ -142,19 +134,31 @@ async def generate_tests(
model: str | None = None,
) -> list[TestCase]:
"""Generate synthetic test cases from a task description using FastAgent."""
# config = config or Config.load()
# model = model or config.model
# config_path = config.effective_fastagent_config

prompt = TEST_GENERATION_PROMPT.replace(TASK_PLACEHOLDER, task)


result, _ = await generator.structured(prompt, TestCaseSuite)

if result is None:
raise ValueError("Test generator did not return structured test cases.")

return result.cases
cases = result.cases
invalid_expected = 0
for tc in cases:
expected_values = [value.strip() for value in tc.expected.contains if value.strip()]
if len(expected_values) < 2:
invalid_expected += 1

print(
"Generated test cases:",
f"total={len(cases)}",
f"invalid_expected={invalid_expected}",
)
if invalid_expected:
print(
"Warning: some test cases are missing at least two expected strings; "
"review generated tests."
)
return cases


async def refine_skill(
Expand Down Expand Up @@ -259,4 +263,3 @@ async def improve_skill(
source_task=f"Improved from {skill.name}: {instructions}",
base_skill=skill,
)

35 changes: 31 additions & 4 deletions src/upskill/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from datetime import datetime
from pathlib import Path

from pydantic import BaseModel, Field, field_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator


class SkillMetadata(BaseModel):
Expand All @@ -33,24 +33,51 @@ class ValidationResult(BaseModel):
error_message: str | None = None


class ExpectedSpec(BaseModel):
    """Output expectations attached to a single test case.

    ``contains`` lists the strings expected in the output. A bare string is
    accepted on input and wrapped into a one-element list before validation.
    """

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    contains: list[str]

    @field_validator("contains", mode="before")
    @classmethod
    def coerce_contains(cls, value: str | list[str]) -> list[str]:
        """Wrap a lone string in a list; pass any other value through unchanged."""
        return [value] if isinstance(value, str) else value


class TestCaseContext(BaseModel):
    """Optional context payload supplied to the evaluator alongside a test case."""

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Maps a filename to that file's content; None when no files are supplied.
    files: dict[str, str] | None = Field(default=None)


class TestCase(BaseModel):
"""A test case for skill evaluation."""

model_config = ConfigDict(extra="forbid")

input: str # Task/prompt to give the agent
context: dict | None = None # Files, env vars, etc.
expected: dict | None = None # Expected output checks
context: TestCaseContext | None = None # Files, env vars, etc.
expected: ExpectedSpec # Expected output checks

# Custom validator support
output_file: str | None = None # File to validate instead of agent output
validator: str | None = None # Validator name (e.g., "hf_eval_yaml")
validator_config: dict | None = None # Config passed to validator
validator_config: dict[str, str | int | float | bool] | None = None




class TestCaseSuite(BaseModel):
    """Wrapper model that carries a list of test cases as one structured object."""

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Each instance gets its own empty list when no cases are given.
    cases: list[TestCase] = Field(default_factory=list)


Expand Down