From 93c5e7b5b972007242626a57ce03fb76fd5ddad0 Mon Sep 17 00:00:00 2001
From: Claude Code
Date: Mon, 30 Mar 2026 02:31:59 +0000
Subject: [PATCH 1/2] refactor: migrate test infrastructure from shell scripts to Rust-based skill-bench

- Remove shell script-based test infrastructure (agents/skill-bench/)
- Add TOML-based test cases in tests/ directory
- Convert test checks from shell scripts to skill-bench format
- Update mise.toml with test task for skill-bench
- Add logs/ directory with .gitkeep for test execution logs
- Update .gitignore to exclude log files but keep .gitkeep

Co-Authored-By: Claude Opus 4.6
---
 .gitignore | 7 ++
 logs/.gitkeep | 0
 mise.toml | 6 +-
 .../functional-no-spec.toml | 59 ++++++++++++
 .../functional-with-spec.toml | 53 +++++++++++
 tests/concept-interviewing/triggering.toml | 25 +++++
 tests/constitution-reminding/functional.toml | 29 ++++++
 tests/constitution-reminding/triggering.toml | 21 ++++
 .../functional-get-patent-id.toml | 55 +++++++++++
 .../functional-get-statistics.toml | 95 +++++++++++++++++++
 .../functional-import-csv.toml | 29 ++++++
 .../functional-import-multiple-csvs.toml | 43 +++++++++
 .../functional-init-db.toml | 21 ++++
 .../functional-integration.toml | 51 ++++++++++
 .../functional-record-screening.toml | 67 +++++++++++++
 tests/investigating-database/triggering.toml | 21 ++++
 .../functional-file-review.toml | 49 ++++++++++
 tests/legal-checking/functional.toml | 31 ++++++
 tests/legal-checking/triggering.toml | 21 ++++
 tests/targeting/functional-no-spec.toml | 52 ++++++++++
 tests/targeting/functional-with-data.toml | 66 +++++++++++++
 tests/targeting/functional-with-spec.toml | 78 +++++++++++++++
 tests/targeting/triggering.toml | 17 ++++
 23 files changed, 895 insertions(+), 1 deletion(-)
 create mode 100644 logs/.gitkeep
 create mode 100644 tests/concept-interviewing/functional-no-spec.toml
 create mode 100644 tests/concept-interviewing/functional-with-spec.toml
 create mode 100644 tests/concept-interviewing/triggering.toml
 create mode 100644 tests/constitution-reminding/functional.toml
 create mode 100644 tests/constitution-reminding/triggering.toml
 create mode 100644 tests/investigating-database/functional-get-patent-id.toml
 create mode 100644 tests/investigating-database/functional-get-statistics.toml
 create mode 100644 tests/investigating-database/functional-import-csv.toml
 create mode 100644 tests/investigating-database/functional-import-multiple-csvs.toml
 create mode 100644 tests/investigating-database/functional-init-db.toml
 create mode 100644 tests/investigating-database/functional-integration.toml
 create mode 100644 tests/investigating-database/functional-record-screening.toml
 create mode 100644 tests/investigating-database/triggering.toml
 create mode 100644 tests/legal-checking/functional-file-review.toml
 create mode 100644 tests/legal-checking/functional.toml
 create mode 100644 tests/legal-checking/triggering.toml
 create mode 100644 tests/targeting/functional-no-spec.toml
 create mode 100644 tests/targeting/functional-with-data.toml
 create mode 100644 tests/targeting/functional-with-spec.toml
 create mode 100644 tests/targeting/triggering.toml

diff --git a/.gitignore b/.gitignore
index bd07edd..722fe4f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,10 @@ Cargo.lock
 .claude/worktrees/
 .claude/settings.local.json
 node_modules
+
+# Test logs
+/logs/*
+!/logs/.gitkeep
+
+# skill-bench test history
+.skill-bench/
diff --git a/logs/.gitkeep b/logs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/mise.toml b/mise.toml
index 8c7aa8a..b5b5210 100644
--- a/mise.toml
+++ b/mise.toml
@@ -1,8 +1,12 @@
 [tools]
-node = "20"
 
 [tasks]
 fmt = { description = "Format files with prettier", run = "npx prettier --write ." }
 pre-commit = { description = "Pre-commit hook to format files", depends = [
   "fmt",
 ] }
+
+[tasks.test]
+description = "Run skill-bench tests"
+run = "skill-bench run 'tests' --plugin-dir ./plugin --threads 4 --log ./logs"
+depends = ["fmt"]
diff --git a/tests/concept-interviewing/functional-no-spec.toml b/tests/concept-interviewing/functional-no-spec.toml
new file mode 100644
index 0000000..1ce96ba
--- /dev/null
+++ b/tests/concept-interviewing/functional-no-spec.toml
@@ -0,0 +1,59 @@
+# Test Case: Concept Interview Functional (no existing specification)
+
+name = "functional-no-spec"
+description = "Verify concept-interview uses question-responder when information is missing"
+timeout = 180 # seconds
+
+test_prompt = """
+I want to start a patent search for a new voice recognition system for smart home devices with real-time transcription and noise-resistant recognition.
+
+Before asking me any questions, please use the question-responder skill to check if the required information is already available. Then proceed with the concept interview and assignee verification.
+"""
+
+[answers] # NOTE(review): presumably canned replies matched against the questions the skill asks - confirm against skill-bench docs
+"competitors" = ["Google", "Amazon"]
+"target country" = "US"
+"country" = "US"
+"release date" = "2025-06-01"
+"date" = "2025-06-01"
+"target release date" = "2025-06-01"
+
+[[checks]]
+name = "mcp_server_loaded"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
+
+[[checks]]
+name = "skill_loaded"
+command = { command = "skill-loaded", skill = "concept-interviewing" }
+
+[[checks]]
+name = "question_responder_invoked"
+command = { command = "skill-invoked", skill = "skill-bench-harness:question-responder" }
+
+[[checks]]
+name = "concept_interview_invoked"
+command = { command = "skill-invoked", skill = "concept-interviewing" }
+
+[[checks]]
+name = "patent_assignee_check_invoked"
+command = { command = "skill-invoked", skill = "patent-assignee-check" }
+
+[[checks]]
+name = "constitution_reminding_invoked"
+command = { command = "skill-invoked", skill = "constitution-reminding" }
+
+[[checks]]
+name = "references_instructions_read"
+command = { command = "tool-use", tool = "Read", param = "file_path", value = "concept-interview.*references/instructions.md" }
+
+[[checks]]
+name = "specification_template_read"
+command = { command = "tool-use", tool = "Read", param = "file_path", value = "specification-template.md" }
+
+[[checks]]
+name = "specification_md_created"
+command = { command = "workspace-file", path = "0-specifications/specification.md" }
+
+[[checks]]
+name = "google_patent_mcp_succeeded"
+command = { command = "mcp-success", tool = "search_patents" }
diff --git a/tests/concept-interviewing/functional-with-spec.toml b/tests/concept-interviewing/functional-with-spec.toml
new file mode 100644
index 0000000..60d7504
--- /dev/null
+++ b/tests/concept-interviewing/functional-with-spec.toml
@@ -0,0 +1,53 @@
+# Test Case: Concept Interview Functional (with existing specification)
+
+name = "functional-with-spec"
+description = "Verify concept-interview verifies existing specification without re-interviewing"
+timeout = 120 # seconds
+
+test_prompt = """
+Use concept-interview to verify our existing product specification is complete and ready for the targeting phase. Do not perform any additional assignee checks - just verify the existing specification.
+"""
+
+[[setup]]
+path = "0-specifications/specification.md"
+content = """
+# Product Specification
+
+## 1. Product Concept
+
+Voice recognition system for smart home devices
+
+## 2. Target Market
+
+- **Country**: US
+- **Release Date**: 2025-06-01
+- **Cutoff Date**: 2005-06-01
+
+## 3. Competitors
+
+- **Google LLC**
+- **Amazon.com Inc.**
+
+## 4. Verified Assignee Names (Canonicalized)
+
+| Original Name | Verified Assignee Names | Status | Notes |
+| ------------- | ------------------------------------------ | -------- | ------------------------ |
+| Google | Google LLC, Google Inc., GOOGLE LLC | Verified | Multiple name variations |
+| Amazon | Amazon.com Inc., Amazon Technologies, Inc. | Verified | Multiple name variations |
+"""
+
+[[checks]]
+name = "mcp_server_loaded"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
+
+[[checks]]
+name = "skill_loaded"
+command = { command = "skill-loaded", skill = "concept-interviewing" }
+
+[[checks]]
+name = "concept_interview_invoked"
+command = { command = "skill-invoked", skill = "concept-interviewing" }
+
+[[checks]]
+name = "patent_assignee_check_not_invoked"
+command = { command = "skill-invoked", skill = "patent-assignee-check", deny = true }
diff --git a/tests/concept-interviewing/triggering.toml b/tests/concept-interviewing/triggering.toml
new file mode 100644
index 0000000..8b42650
--- /dev/null
+++ b/tests/concept-interviewing/triggering.toml
@@ -0,0 +1,25 @@
+# Test Case: Concept Interview - Triggering
+
+name = "triggering"
+description = "Verify concept-interviewing skill is triggered when discussing patent search"
+timeout = 60 # seconds
+
+test_prompt = """
+I want to start a patent search for a new voice recognition system in the US, releasing in 2025. Competitors are Google and Amazon. The system is for smart home devices with real-time transcription and noise-resistant recognition. Please proceed with the assignee verification and create the specification file.
+"""
+
+[[checks]]
+name = "mcp_server_loaded"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
+
+[[checks]]
+name = "skill_loaded"
+command = { command = "skill-loaded", skill = "concept-interviewing" }
+
+[[checks]]
+name = "concept_interviewing_invoked"
+command = { command = "skill-invoked", skill = "concept-interviewing" }
+
+[[checks]]
+name = "patent_assignee_check_invoked"
+command = { command = "skill-invoked", skill = "patent-assignee-check" }
diff --git a/tests/constitution-reminding/functional.toml b/tests/constitution-reminding/functional.toml
new file mode 100644
index 0000000..050013f
--- /dev/null
+++ b/tests/constitution-reminding/functional.toml
@@ -0,0 +1,29 @@
+# Test Case: Constitution Reminding Functional
+
+name = "functional"
+description = "Verify constitution-reminding loads and displays core principles"
+timeout = 60 # seconds
+
+test_prompt = """
+Load the constitution skill to understand the core principles.
+"""
+
+[[checks]]
+name = "mcp_server_loaded"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
+
+[[checks]]
+name = "skill_loaded"
+command = { command = "skill-loaded", skill = "constitution-reminding" }
+
+[[checks]]
+name = "constitution_reminding_invoked"
+command = { command = "skill-invoked", skill = "constitution-reminding" }
+
+[[checks]]
+name = "references_instructions_read"
+command = { command = "tool-use", tool = "Read", param = "file_path", value = "constitution-reminding.*references/instructions.md" }
+
+[[checks]]
+name = "constitution_text_displayed"
+command = { command = "message-contains", text = "I." } # NOTE(review): "I." is a very loose marker (presumably the first article numeral) - consider a more specific phrase
diff --git a/tests/constitution-reminding/triggering.toml b/tests/constitution-reminding/triggering.toml
new file mode 100644
index 0000000..fd2a607
--- /dev/null
+++ b/tests/constitution-reminding/triggering.toml
@@ -0,0 +1,21 @@
+# Test Case: Constitution Reminding - Triggering
+
+name = "triggering"
+description = "Verify constitution-reminding skill is triggered when asked about core principles"
+timeout = 60 # seconds
+
+test_prompt = """
+Load the constitution skill to understand the core principles.
+"""
+
+[[checks]]
+name = "mcp_server_loaded"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
+
+[[checks]]
+name = "skill_loaded"
+command = { command = "skill-loaded", skill = "constitution-reminding" }
+
+[[checks]]
+name = "constitution_reminding_invoked"
+command = { command = "skill-invoked", skill = "constitution-reminding" }
diff --git a/tests/investigating-database/functional-get-patent-id.toml b/tests/investigating-database/functional-get-patent-id.toml
new file mode 100644
index 0000000..32a45c4
--- /dev/null
+++ b/tests/investigating-database/functional-get-patent-id.toml
@@ -0,0 +1,55 @@
+# Test Case: Investigating Database - Get Patent ID
+
+name = "functional-get-patent-id"
+description = "Verify investigating-database can retrieve patent ID by row number"
+timeout = 120 # seconds
+
+test_prompt = """
+Get the patent ID at row 2 from the database.
+""" + +[[setup]] +path = "patents.db" +content = """ +#!/usr/bin/env bash +# Setup script to create test database +sqlite3 patents.db < Date: Mon, 30 Mar 2026 02:33:55 +0000 Subject: [PATCH 2/2] chore: update model versions in devcontainer config - Update ANTHROPIC_DEFAULT_OPUS_MODEL to glm-5.1 - Update ANTHROPIC_DEFAULT_SONNET_MODEL to glm-5-turbo Co-Authored-By: Claude Opus 4.6 --- .devcontainer/post-create.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh index 5ccae81..5cfd78a 100755 --- a/.devcontainer/post-create.sh +++ b/.devcontainer/post-create.sh @@ -80,8 +80,8 @@ EOF "ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic", "API_TIMEOUT_MS": "3000000", "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", - "ANTHROPIC_DEFAULT_OPUS_MODEL": "glm-5", - "ANTHROPIC_DEFAULT_SONNET_MODEL": "glm-4.7", + "ANTHROPIC_DEFAULT_OPUS_MODEL": "glm-5.1", + "ANTHROPIC_DEFAULT_SONNET_MODEL": "glm-5-turbo", "ANTHROPIC_DEFAULT_HAIKU_MODEL": "glm-4.5-air" } }