Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .devcontainer/post-create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ EOF
"ANTHROPIC_BASE_URL": "https://api.z.ai/api/anthropic",
"API_TIMEOUT_MS": "3000000",
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "glm-5",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "glm-4.7",
"ANTHROPIC_DEFAULT_OPUS_MODEL": "glm-5.1",
"ANTHROPIC_DEFAULT_SONNET_MODEL": "glm-5-turbo",
"ANTHROPIC_DEFAULT_HAIKU_MODEL": "glm-4.5-air"
}
}
Expand Down
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,10 @@ Cargo.lock
.claude/worktrees/
.claude/settings.local.json
node_modules

# Test logs
/logs/*
!/logs/.gitkeep

# skill-bench test history
.skill-bench/
Empty file added logs/.gitkeep
Empty file.
6 changes: 5 additions & 1 deletion mise.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
# Toolchain versions for this repository.
[tools]
node = "20"

# Rewrites files in place using prettier.
[tasks.fmt]
description = "Format files with prettier"
run = "npx prettier --write ."

# Hook entry point; it has no command of its own and only chains to `fmt`.
[tasks.pre-commit]
description = "Pre-commit hook to format files"
depends = ["fmt"]

# Runs the skill-bench suite under ./tests against ./plugin, after `fmt`.
# Logs go to ./logs.
[tasks.test]
description = "Run skill-bench tests"
depends = ["fmt"]
run = "skill-bench run 'tests' --plugin-dir ./plugin --threads 4 --log ./logs"
59 changes: 59 additions & 0 deletions tests/concept-interviewing/functional-no-spec.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Test Case: Concept Interview Functional (no existing specification)
#
# The prompt gives only the product idea; competitors, target country and
# release date are left out. The checks then require that the
# question-responder skill was invoked and that
# 0-specifications/specification.md exists at the end of the run.

name = "functional-no-spec"
description = "Verify concept-interviewing uses question-responder when information is missing"
timeout = 180 # seconds

test_prompt = """
I want to start a patent search for a new voice recognition system for smart home devices with real-time transcription and noise-resistant recognition.

Before asking me any questions, please use the question-responder skill to check if the required information is already available. Then proceed with the concept interview and assignee verification.
"""

# Canned replies for the details the prompt leaves out.
# NOTE(review): the overlapping keys ("country" / "target country", "date" /
# "release date" / "target release date") look like alternative phrasings of
# the same question — confirm how skill-bench matches a question to a key
# before pruning any of them.
[answers]
"competitors" = ["Google", "Amazon"]
"target country" = "US"
"country" = "US"
"release date" = "2025-06-01"
"date" = "2025-06-01"
"target release date" = "2025-06-01"

# --- Environment: MCP server and skill are available ---

[[checks]]
name = "mcp_server_loaded"
command = { command = "mcp-loaded", server = "google-patent-cli" }

[[checks]]
name = "skill_loaded"
command = { command = "skill-loaded", skill = "concept-interviewing" }

# --- Skills that must be invoked during the run ---

[[checks]]
name = "question_responder_invoked"
command = { command = "skill-invoked", skill = "skill-bench-harness:question-responder" }

[[checks]]
name = "concept_interview_invoked"
command = { command = "skill-invoked", skill = "concept-interviewing" }

[[checks]]
name = "patent_assignee_check_invoked"
command = { command = "skill-invoked", skill = "patent-assignee-check" }

# Named `*_invoked` because the command is `skill-invoked`; the `*_loaded`
# names in this suite are reserved for `skill-loaded` checks.
[[checks]]
name = "constitution_reminding_invoked"
command = { command = "skill-invoked", skill = "constitution-reminding" }

# --- Reference files that must be read ---
# NOTE(review): `value` contains ".*", so it is presumably treated as a
# pattern rather than an exact path — confirm against the skill-bench docs.

[[checks]]
name = "references_instructions_read"
command = { command = "tool-use", tool = "Read", param = "file_path", value = "concept-interview.*references/instructions.md" }

[[checks]]
name = "specification_template_read"
command = { command = "tool-use", tool = "Read", param = "file_path", value = "specification-template.md" }

# --- Outcome: specification written and patent search succeeded ---

[[checks]]
name = "specification_md_created"
command = { command = "workspace-file", path = "0-specifications/specification.md" }

[[checks]]
name = "google_patent_mcp_succeeded"
command = { command = "mcp-success", tool = "search_patents" }
53 changes: 53 additions & 0 deletions tests/concept-interviewing/functional-with-spec.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Test Case: Concept Interview Functional (with existing specification)
#
# The workspace is seeded with a specification that already lists the target
# market, competitors and verified assignee names. The prompt asks only for
# verification, and the last check requires that patent-assignee-check was
# not invoked.

name = "functional-with-spec"
description = "Verify concept-interview verifies existing specification without re-interviewing"
timeout = 120 # seconds

test_prompt = """
Use concept-interview to verify our existing product specification is complete and ready for the targeting phase. Do not perform any additional assignee checks - just verify the existing specification.
"""

# Fixture: the pre-existing specification file. Everything between the triple
# quotes is file content, including the lines that start with "#".
[[setup]]
path = "0-specifications/specification.md"
content = """
# Product Specification

## 1. Product Concept

Voice recognition system for smart home devices

## 2. Target Market

- **Country**: US
- **Release Date**: 2025-06-01
- **Cutoff Date**: 2005-06-01

## 3. Competitors

- **Google LLC**
- **Amazon.com Inc.**

## 4. Verified Assignee Names (Canonicalized)

| Original Name | Verified Assignee Names | Status | Notes |
| ------------- | ------------------------------------------ | -------- | ------------------------ |
| Google | Google LLC, Google Inc., GOOGLE LLC | Verified | Multiple name variations |
| Amazon | Amazon.com Inc., Amazon Technologies, Inc. | Verified | Multiple name variations |
"""

[[checks]]
name = "mcp_server_loaded"
command = { command = "mcp-loaded", server = "google-patent-cli" }

[[checks]]
name = "skill_loaded"
command = { command = "skill-loaded", skill = "concept-interviewing" }

[[checks]]
name = "concept_interview_invoked"
command = { command = "skill-invoked", skill = "concept-interviewing" }

# Negative assertion.
# NOTE(review): `deny = true` presumably inverts the check so it passes only
# when the skill was never invoked — confirm against the skill-bench docs.
[[checks]]
name = "patent_assignee_check_not_invoked"
command = { command = "skill-invoked", skill = "patent-assignee-check", deny = true }
25 changes: 25 additions & 0 deletions tests/concept-interviewing/triggering.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Test Case: Concept Interview - Triggering
#
# The prompt describes a patent-search task without naming any skill. The
# checks require that concept-interviewing and patent-assignee-check were
# both invoked.

name = "triggering"
description = "Verify concept-interviewing skill is triggered when discussing patent search"
timeout = 60 # seconds

test_prompt = """
I want to start a patent search for a new voice recognition system in the US, releasing in 2025. Competitors are Google and Amazon. The system is for smart home devices with real-time transcription and noise-resistant recognition. Please proceed with the assignee verification and create the specification file.
"""

[[checks]]
name = "mcp_server_loaded"

[checks.command]
command = "mcp-loaded"
server = "google-patent-cli"

[[checks]]
name = "skill_loaded"

[checks.command]
command = "skill-loaded"
skill = "concept-interviewing"

[[checks]]
name = "concept_interviewing_invoked"

[checks.command]
command = "skill-invoked"
skill = "concept-interviewing"

[[checks]]
name = "patent_assignee_check_invoked"

[checks.command]
command = "skill-invoked"
skill = "patent-assignee-check"
29 changes: 29 additions & 0 deletions tests/constitution-reminding/functional.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Test Case: Constitution Reminding Functional
#
# Beyond invocation, this case requires that the skill's
# references/instructions.md was read and that the reply contains "I.".

name = "functional"
description = "Verify constitution-reminding loads and displays core principles"
timeout = 60 # seconds

test_prompt = """
Load the constitution skill to understand the core principles.
"""

[[checks]]
name = "mcp_server_loaded"

[checks.command]
command = "mcp-loaded"
server = "google-patent-cli"

[[checks]]
name = "skill_loaded"

[checks.command]
command = "skill-loaded"
skill = "constitution-reminding"

[[checks]]
name = "constitution_reminding_invoked"

[checks.command]
command = "skill-invoked"
skill = "constitution-reminding"

[[checks]]
name = "references_instructions_read"

[checks.command]
command = "tool-use"
tool = "Read"
param = "file_path"
value = "constitution-reminding.*references/instructions.md"

# NOTE(review): "I." presumably targets the first numbered principle, but a
# two-character needle also matches unrelated text (e.g. a sentence ending in
# "AI."). Consider a longer phrase from the constitution once its wording is
# confirmed.
[[checks]]
name = "constitution_text_displayed"

[checks.command]
command = "message-contains"
text = "I."
21 changes: 21 additions & 0 deletions tests/constitution-reminding/triggering.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Test Case: Constitution Reminding - Triggering
#
# Passes when the constitution-reminding skill is both loaded and invoked.
# NOTE(review): the prompt asks for "the constitution skill" by name, so this
# exercises an explicit request rather than an implicit trigger — confirm that
# is the intended coverage.

name = "triggering"
description = "Verify constitution-reminding skill is triggered when asked about core principles"
timeout = 60 # seconds

test_prompt = """
Load the constitution skill to understand the core principles.
"""

[[checks]]
name = "mcp_server_loaded"

[checks.command]
command = "mcp-loaded"
server = "google-patent-cli"

[[checks]]
name = "skill_loaded"

[checks.command]
command = "skill-loaded"
skill = "constitution-reminding"

[[checks]]
name = "constitution_reminding_invoked"

[checks.command]
command = "skill-invoked"
skill = "constitution-reminding"
55 changes: 55 additions & 0 deletions tests/investigating-database/functional-get-patent-id.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Test Case: Investigating Database - Get Patent ID
#
# Asks for the patent ID at row 2 and expects the reply to contain
# "US7654321A", which is the second row inserted by the fixture below.

name = "functional-get-patent-id"
description = "Verify investigating-database can retrieve patent ID by row number"
timeout = 120 # seconds

test_prompt = """
Get the patent ID at row 2 from the database.
"""

# Fixture for the target_patents table (three rows).
# NOTE(review): `content` is a bash script that calls sqlite3, but `path` is
# "patents.db". If skill-bench writes `content` verbatim to `path`, the file
# will hold script text rather than a SQLite database — confirm how setup
# entries are materialized.
[[setup]]
path = "patents.db"
content = """
#!/usr/bin/env bash
# Setup script to create test database
sqlite3 patents.db <<EOF
PRAGMA foreign_keys = ON;
CREATE TABLE target_patents (
id TEXT PRIMARY KEY NOT NULL,
family_id TEXT,
title TEXT,
abstract_text TEXT,
publication_date TEXT,
country TEXT,
assignee TEXT,
filing_date TEXT,
grant_date TEXT,
citation_count INTEGER,
claim_count INTEGER,
extra_fields TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
INSERT INTO target_patents (id, family_id, title, abstract_text, publication_date, country, assignee) VALUES
('US1234567A', 'US1234567', 'First Patent', 'Abstract 1', '2023-01-15', 'US', 'Assignee1'),
('US7654321A', 'US7654321', 'Second Patent', 'Abstract 2', '2023-03-20', 'US', 'Assignee2'),
('US9999999A', 'US9999999', 'Third Patent', 'Abstract 3', '2023-06-10', 'US', 'Assignee3');
EOF
"""

[[checks]]
name = "skill_loaded"
command = { command = "skill-loaded", skill = "investigating-database" }

[[checks]]
name = "investigating_database_invoked"
command = { command = "skill-invoked", skill = "investigating-database" }

# The skill must go through its helper script rather than ad-hoc SQL.
[[checks]]
name = "get_patent_id_script_executed"
command = { command = "tool-use", tool = "Bash", param = "command", value = "get-patent-id.sh" }

# ID of the second inserted row in the fixture.
[[checks]]
name = "correct_patent_id_retrieved"
command = { command = "message-contains", text = "US7654321A" }
95 changes: 95 additions & 0 deletions tests/investigating-database/functional-get-statistics.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Test Case: Investigating Database - Get Statistics
#
# Asks for screening progress and checks the reply against the counts that
# the v_screening_progress view yields for the fixture below.

name = "functional-get-statistics"
description = "Verify investigating-database can retrieve screening progress statistics"
timeout = 120 # seconds

test_prompt = """
Get the screening progress statistics from the database.
"""

# Fixture: 5 target patents and 4 screened patents (2 relevant, 1 irrelevant,
# 1 expired), plus the v_screening_progress view that counts them.
# NOTE(review): `content` is a bash script that calls sqlite3, but `path` is
# "patents.db". If skill-bench writes `content` verbatim to `path`, the file
# will hold script text rather than a SQLite database — confirm how setup
# entries are materialized.
[[setup]]
path = "patents.db"
content = """
#!/usr/bin/env bash
# Setup script to create test database with sample data
sqlite3 patents.db <<EOF
PRAGMA foreign_keys = ON;
CREATE TABLE target_patents (
id TEXT PRIMARY KEY NOT NULL,
family_id TEXT,
title TEXT,
abstract_text TEXT,
publication_date TEXT,
country TEXT,
assignee TEXT,
filing_date TEXT,
grant_date TEXT,
citation_count INTEGER,
claim_count INTEGER,
extra_fields TEXT,
created_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE TABLE screened_patents (
id TEXT PRIMARY KEY NOT NULL,
title TEXT,
legal_status TEXT,
judgment TEXT NOT NULL,
reason TEXT,
abstract_text TEXT,
screened_at TEXT DEFAULT (datetime('now')),
updated_at TEXT DEFAULT (datetime('now'))
);
CREATE VIEW v_screening_progress AS
SELECT
(SELECT COUNT(*) FROM target_patents) as total_targets,
(SELECT COUNT(*) FROM screened_patents) as total_screened,
(SELECT COUNT(*) FROM screened_patents WHERE judgment = 'relevant') as relevant,
(SELECT COUNT(*) FROM screened_patents WHERE judgment = 'irrelevant') as irrelevant,
(SELECT COUNT(*) FROM screened_patents WHERE judgment = 'expired') as expired;
INSERT INTO target_patents (id, family_id, title, abstract_text, publication_date, country, assignee) VALUES
('US1234567A', 'US1234567', 'Patent 1', 'Abstract 1', '2023-01-15', 'US', 'Assignee1'),
('US7654321A', 'US7654321', 'Patent 2', 'Abstract 2', '2023-03-20', 'US', 'Assignee2'),
('US9999999A', 'US9999999', 'Patent 3', 'Abstract 3', '2023-06-10', 'US', 'Assignee3'),
('US1111111A', 'US1111111', 'Patent 4', 'Abstract 4', '2023-08-20', 'US', 'Assignee4'),
('US2222222A', 'US2222222', 'Patent 5', 'Abstract 5', '2023-10-10', 'US', 'Assignee5');
INSERT INTO screened_patents (id, title, judgment, reason) VALUES
('US1234567A', 'Patent 1', 'relevant', 'Core technology'),
('US7654321A', 'Patent 2', 'relevant', 'Related technology'),
('US9999999A', 'Patent 3', 'irrelevant', 'Different domain'),
('US1111111A', 'Patent 4', 'expired', 'Status expired');
EOF
"""

[[checks]]
name = "skill_loaded"
command = { command = "skill-loaded", skill = "investigating-database" }

[[checks]]
name = "investigating_database_invoked"
command = { command = "skill-invoked", skill = "investigating-database" }

# The skill must go through its helper script rather than ad-hoc SQL.
[[checks]]
name = "get_statistics_script_executed"
command = { command = "tool-use", tool = "Bash", param = "command", value = "get-statistics.sh" }

# Requires the reply to echo the view's column name verbatim.
[[checks]]
name = "statistics_contain_total_targets"
command = { command = "message-contains", text = "total_targets" }

# Counts expected from the fixture: 5 targets, 4 screened, 2 relevant,
# 1 irrelevant and 1 expired.
# NOTE(review): a single-digit needle matches any occurrence of that digit in
# the reply (IDs, dates, list numbering), so these four checks can pass on an
# incorrect answer. Consider matching a label plus its value once the script's
# output format is confirmed.

[[checks]]
name = "statistics_contain_count_5"
command = { command = "message-contains", text = "5" }

[[checks]]
name = "statistics_contain_count_4"
command = { command = "message-contains", text = "4" }

[[checks]]
name = "statistics_contain_count_2"
command = { command = "message-contains", text = "2" }

[[checks]]
name = "statistics_contain_count_1"
command = { command = "message-contains", text = "1" }
Loading
Loading