From 6762a45f3fc53119b71ebebc1c2f2b316a850ed6 Mon Sep 17 00:00:00 2001 From: Claude Code Date: Mon, 30 Mar 2026 02:49:19 +0000 Subject: [PATCH 1/2] chore: remove obsolete test infrastructure and unused dependencies - Remove agents/skill-bench/ directory (shell script-based tests) - Remove tests/constitution-reminding/ (skill no longer exists) - Remove tests/investigating-database/ (split into multiple skills) - Remove yq installation from devcontainer config (not needed) Co-Authored-By: Claude Opus 4.6 --- .devcontainer/post-create.sh | 19 -- agents/skill-bench/.gitignore | 1 - .../functional-analyze-patent.toml | 94 ------ .../functional-parallel-analysis.toml | 98 ------ .../cases/claim-analyzing/triggering.toml | 20 -- .../functional-no-spec.toml | 19 -- .../functional-with-spec.toml | 57 ---- .../concept-interviewing/triggering.toml | 29 -- .../evaluating/functional-analyze-patent.toml | 67 ---- .../functional-parallel-analysis.toml | 68 ----- .../cases/evaluating/triggering.toml | 19 -- .../functional-import-csv.toml | 32 -- .../functional-import-multiple-csvs.toml | 47 --- .../functional-init-db.toml | 24 -- .../functional-various-patent-id.toml | 88 ------ .../investigation-preparing/triggering.toml | 24 -- .../functional-record-claims.toml | 62 ---- .../functional-record-elements.toml | 66 ---- .../functional-record-screening.toml | 44 --- .../investigation-recording/triggering.toml | 51 ---- .../functional-complete-workflow.toml | 286 ------------------ .../functional-generate-report.toml | 165 ---------- .../investigation-reporting/triggering.toml | 19 -- .../functional-file-review.toml | 53 ---- .../cases/legal-checking/functional.toml | 31 -- .../cases/legal-checking/triggering.toml | 24 -- .../functional-parallel-search.toml | 103 ------- .../functional-search-prior-art.toml | 84 ----- .../prior-art-researching/triggering.toml | 20 -- .../functional-parallel-screening.toml | 84 ----- .../functional-resume-screening.toml | 112 ------- .../cases/screening/triggering.toml | 19 -- .../cases/targeting/functional-with-data.toml | 71 ----- .../cases/targeting/functional-with-spec.toml | 86 ------ .../cases/targeting/triggering.toml | 19 -- .../harness-plugin/.claude-plugin/plugin.json | 9 - .../harness-plugin/cases/basic.toml | 37 --- .../skills/question-responder/SKILL.md | 91 ------ .../skills/question-responder/assets/.gitkeep | 1 - agents/skill-bench/runner.sh | 220 -------------- agents/skill-bench/tools/check-db-query.sh | 34 --- .../skill-bench/tools/check-file-content.sh | 30 -- .../tools/check-file-not-contains.sh | 30 -- .../skill-bench/tools/check-log-contains.sh | 17 -- agents/skill-bench/tools/check-mcp-loaded.sh | 38 --- agents/skill-bench/tools/check-mcp-success.sh | 69 ----- .../tools/check-mcp-tool-invoked.sh | 23 -- .../tools/check-output-contains.sh | 20 -- agents/skill-bench/tools/check-output-file.sh | 15 - agents/skill-bench/tools/check-param.sh | 22 -- .../skill-bench/tools/check-skill-invoked.sh | 36 --- .../skill-bench/tools/check-skill-loaded.sh | 30 -- .../tools/check-skill-not-invoked.sh | 22 -- .../skill-bench/tools/check-text-contains.sh | 31 -- agents/skill-bench/tools/check-tool-use.sh | 30 -- .../skill-bench/tools/check-workspace-dir.sh | 25 -- .../skill-bench/tools/check-workspace-file.sh | 17 -- agents/skill-bench/tools/setup-db.sh | 71 ----- tests/constitution-reminding/functional.toml | 29 -- tests/constitution-reminding/triggering.toml | 21 -- .../functional-get-patent-id.toml | 55 ---- .../functional-get-statistics.toml | 95 ------ .../functional-import-csv.toml | 29 -- .../functional-import-multiple-csvs.toml | 43 --- .../functional-init-db.toml | 21 -- .../functional-integration.toml | 51 ---- .../functional-record-screening.toml | 67 ---- tests/investigating-database/triggering.toml | 21 -- 68 files changed, 3455 deletions(-) delete mode 100644 agents/skill-bench/.gitignore delete mode 100644 agents/skill-bench/cases/claim-analyzing/functional-analyze-patent.toml delete mode 100644 agents/skill-bench/cases/claim-analyzing/functional-parallel-analysis.toml delete mode 100644 agents/skill-bench/cases/claim-analyzing/triggering.toml delete mode 100644 agents/skill-bench/cases/concept-interviewing/functional-no-spec.toml delete mode 100644 agents/skill-bench/cases/concept-interviewing/functional-with-spec.toml delete mode 100644 agents/skill-bench/cases/concept-interviewing/triggering.toml delete mode 100644 agents/skill-bench/cases/evaluating/functional-analyze-patent.toml delete mode 100644 agents/skill-bench/cases/evaluating/functional-parallel-analysis.toml delete mode 100644 agents/skill-bench/cases/evaluating/triggering.toml delete mode 100644 agents/skill-bench/cases/investigation-preparing/functional-import-csv.toml delete mode 100644 agents/skill-bench/cases/investigation-preparing/functional-import-multiple-csvs.toml delete mode 100644 agents/skill-bench/cases/investigation-preparing/functional-init-db.toml delete mode 100644 agents/skill-bench/cases/investigation-preparing/functional-various-patent-id.toml delete mode 100644 agents/skill-bench/cases/investigation-preparing/triggering.toml delete mode 100644 agents/skill-bench/cases/investigation-recording/functional-record-claims.toml delete mode 100644 agents/skill-bench/cases/investigation-recording/functional-record-elements.toml delete mode 100644 agents/skill-bench/cases/investigation-recording/functional-record-screening.toml delete mode 100644 agents/skill-bench/cases/investigation-recording/triggering.toml delete mode 100644 agents/skill-bench/cases/investigation-reporting/functional-complete-workflow.toml delete mode 100644 agents/skill-bench/cases/investigation-reporting/functional-generate-report.toml delete mode 100644 agents/skill-bench/cases/investigation-reporting/triggering.toml delete mode 100644 agents/skill-bench/cases/legal-checking/functional-file-review.toml delete mode 100644 agents/skill-bench/cases/legal-checking/functional.toml delete mode 100644 agents/skill-bench/cases/legal-checking/triggering.toml delete mode 100644 agents/skill-bench/cases/prior-art-researching/functional-parallel-search.toml delete mode 100644 agents/skill-bench/cases/prior-art-researching/functional-search-prior-art.toml delete mode 100644 agents/skill-bench/cases/prior-art-researching/triggering.toml delete mode 100644 agents/skill-bench/cases/screening/functional-parallel-screening.toml delete mode 100644 agents/skill-bench/cases/screening/functional-resume-screening.toml delete mode 100644 agents/skill-bench/cases/screening/triggering.toml delete mode 100644 agents/skill-bench/cases/targeting/functional-with-data.toml delete mode 100644 agents/skill-bench/cases/targeting/functional-with-spec.toml delete mode 100644 agents/skill-bench/cases/targeting/triggering.toml delete mode 100644 agents/skill-bench/harness-plugin/.claude-plugin/plugin.json delete mode 100644 agents/skill-bench/harness-plugin/cases/basic.toml delete mode 100644 agents/skill-bench/harness-plugin/skills/question-responder/SKILL.md delete mode 100644 agents/skill-bench/harness-plugin/skills/question-responder/assets/.gitkeep delete mode 100755 agents/skill-bench/runner.sh delete mode 100755 agents/skill-bench/tools/check-db-query.sh delete mode 100755 agents/skill-bench/tools/check-file-content.sh delete mode 100755 agents/skill-bench/tools/check-file-not-contains.sh delete mode 100755 agents/skill-bench/tools/check-log-contains.sh delete mode 100755 agents/skill-bench/tools/check-mcp-loaded.sh delete mode 100755 agents/skill-bench/tools/check-mcp-success.sh delete mode 100755 agents/skill-bench/tools/check-mcp-tool-invoked.sh delete mode 100755 agents/skill-bench/tools/check-output-contains.sh delete mode 100755 agents/skill-bench/tools/check-output-file.sh delete mode 100755 agents/skill-bench/tools/check-param.sh delete mode 100755 agents/skill-bench/tools/check-skill-invoked.sh delete mode 100755 agents/skill-bench/tools/check-skill-loaded.sh delete mode 100755 agents/skill-bench/tools/check-skill-not-invoked.sh delete mode 100755 agents/skill-bench/tools/check-text-contains.sh delete mode 100755 agents/skill-bench/tools/check-tool-use.sh delete mode 100755 agents/skill-bench/tools/check-workspace-dir.sh delete mode 100755 agents/skill-bench/tools/check-workspace-file.sh delete mode 100755 agents/skill-bench/tools/setup-db.sh delete mode 100644 tests/constitution-reminding/functional.toml delete mode 100644 tests/constitution-reminding/triggering.toml delete mode 100644 tests/investigating-database/functional-get-patent-id.toml delete mode 100644 tests/investigating-database/functional-get-statistics.toml delete mode 100644 tests/investigating-database/functional-import-csv.toml delete mode 100644 tests/investigating-database/functional-import-multiple-csvs.toml delete mode 100644 tests/investigating-database/functional-init-db.toml delete mode 100644 tests/investigating-database/functional-integration.toml delete mode 100644 tests/investigating-database/functional-record-screening.toml delete mode 100644 tests/investigating-database/triggering.toml diff --git a/.devcontainer/post-create.sh b/.devcontainer/post-create.sh index 5cfd78a..5e79aa3 100755 --- a/.devcontainer/post-create.sh +++ b/.devcontainer/post-create.sh @@ -17,25 +17,6 @@ if [ -z "$CI" ] && [ -z "$GITHUB_ACTIONS" ]; then echo "[Devcontainer Setup] Claude CLI already installed: $(claude --version)" fi - # Install yq (YAML/TOML processor) for skill-bench - if ! command -v yq >/dev/null 2>&1; then - echo "[Devcontainer Setup] Installing yq..." - YQ_VERSION=v4.52.4 - wget https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -O /tmp/yq - chmod +x /tmp/yq - sudo mv /tmp/yq /usr/local/bin/yq - echo "[Devcontainer Setup] yq installed: $(yq --version)" - else - echo "[Devcontainer Setup] yq already installed: $(yq --version)" - fi - - echo "[Devcontainer Setup] Configuring tmux..." - cat > $HOME/.tmux.conf << 'EOF' -# Display pane number -bind-key p display-panes -set display-panes-time 10000 -EOF - echo "[Devcontainer Setup] Configuring claude alias..." echo 'alias claude="claude --allow-dangerously-skip-permissions"' >> $HOME/.bashrc echo 'alias claude="claude --allow-dangerously-skip-permissions"' >> $HOME/.zshrc diff --git a/agents/skill-bench/.gitignore b/agents/skill-bench/.gitignore deleted file mode 100644 index 333c1e9..0000000 --- a/agents/skill-bench/.gitignore +++ /dev/null @@ -1 +0,0 @@ -logs/ diff --git a/agents/skill-bench/cases/claim-analyzing/functional-analyze-patent.toml b/agents/skill-bench/cases/claim-analyzing/functional-analyze-patent.toml deleted file mode 100644 index 4a112f7..0000000 --- a/agents/skill-bench/cases/claim-analyzing/functional-analyze-patent.toml +++ /dev/null @@ -1,94 +0,0 @@ -# Test Case: Claim Analyzing - Analyze Patent with User Questions - -name = "functional-analyze-patent" -description = "Verify claim-analyzing skill uses Agent tool to analyze a patent, asks user about missing features, and stores similarities in database" -timeout = 120 - -test_prompt = """ -Perform claim analysis for patent US20240292070A1. -- Compare product features against patent elements -- Store similarity results in the database -""" - -[answers] -"API Gateway" = "yes" -"Secure Key Management" = "no" - -[[setup]] -path = "features.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert product features (only some features, not all) -INSERT OR IGNORE INTO features (feature_name, description, category, presence, created_at, updated_at) -VALUES - ('User Authentication', 'System authenticates users via email/password', 'Security', 'present', datetime('now'), datetime('now')), - ('Data Encryption', 'Data is encrypted at rest using AES-256', 'Security', 'present', datetime('now'), datetime('now')); --- Note: 'API Gateway' and 'Secure Key Management' are missing intentionally -""" - -[[setup]] -path = "patent-data.sql" -content = """ --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Method for Secure Data Processing', 'TechCorp Inc', 'US', '2024-09-26', '2024-03-15', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to secure data processing', 'A method for secure data processing comprising authentication, encryption, API gateway, and secure key management', datetime('now'), datetime('now')); - --- Insert claims -INSERT OR IGNORE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'independent', '1. A method for secure data processing, comprising: authenticating a user; encrypting data; routing through API gateway; and managing secure keys.', datetime('now'), datetime('now')); - --- Insert elements (including ones with missing features) -INSERT OR IGNORE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'authenticating a user', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'routing through API gateway', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_features" -type = "script" -command = "setup-db.sh execute features.sql" - -[[setup]] -name = "load_patent_data" -type = "script" -command = "setup-db.sh execute patent-data.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh claim-analyzing" - -[[checks]] -name = "claim-analyzing_invoked" -type = "script" -command = "check-skill-invoked.sh claim-analyzing" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "similarities_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM similarities;'" - -[[checks]] -name = "new_feature_recorded" -type = "script" -command = "check-db-query.sh {} {} '>0' \"SELECT COUNT(*) FROM features WHERE feature_name LIKE 'API Gate%';\"" diff --git a/agents/skill-bench/cases/claim-analyzing/functional-parallel-analysis.toml b/agents/skill-bench/cases/claim-analyzing/functional-parallel-analysis.toml deleted file mode 100644 index bfbccc9..0000000 --- a/agents/skill-bench/cases/claim-analyzing/functional-parallel-analysis.toml +++ /dev/null @@ -1,98 +0,0 @@ -# Test Case: Claim Analyzing - Parallel Analysis - -name = "functional-parallel-analysis" -description = "Verify claim-analyzing skill can analyze multiple patents in parallel using teams" -timeout = 600 - -test_prompt = """ -I have 2 screened patents with claims and elements analyzed in the database. - -Please analyze claim similarities for both patents: -1. Compare product features against patent elements for each patent -2. Analyze each element for similarity level -3. Store similarity results in the database -4. Report the analysis results including patent IDs, similarity counts, and overall assessments -""" - -[[setup]] -path = "patents-with-elements.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert product features -INSERT OR REPLACE INTO features (feature_name, description, category, presence, created_at, updated_at) -VALUES - ('AI prompt generation', 'System generates initial prompts for AI models', 'Core', 'present', datetime('now'), datetime('now')), - ('AI output processing', 'System processes and analyzes AI model output', 'Core', 'present', datetime('now'), datetime('now')), - ('Quality metrics analysis', 'System analyzes quality metrics of AI output', 'Core', 'present', datetime('now'), datetime('now')), - ('Iterative refinement', 'System iteratively refines prompts based on feedback', 'Core', 'present', datetime('now'), datetime('now')), - ('Conversation context manager', 'Maintains context across multiple conversation turns', 'Core', 'present', datetime('now'), datetime('now')), - ('Response generator', 'Generates responses based on conversation context', 'Core', 'present', datetime('now'), datetime('now')); - --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'), - ('US20240320708A1', 'Multi-turn conversation system', 'Tech Corp', 'US', '2024-09-12', '2024-03-15', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to AI prompt optimization', 'Systems and methods for optimizing AI prompts through iterative refinement', datetime('now'), datetime('now')), - ('US20240320708A1', 'relevant', 'Relates to multi-turn conversations', 'A system for managing multi-turn conversations with context tracking', datetime('now'), datetime('now')); - --- Insert claims for patent 1 -INSERT OR REPLACE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'independent', 'A method for optimizing AI prompts, comprising: generating an initial prompt; receiving output from an AI model; analyzing the output for quality metrics; and iteratively refining the prompt based on the quality metrics.', datetime('now'), datetime('now')); - --- Insert elements for patent 1 -INSERT OR REPLACE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'generating an initial prompt', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'receiving output from an AI model', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'C', 'analyzing the output for quality metrics', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'D', 'iteratively refining the prompt based on the quality metrics', datetime('now'), datetime('now')); - --- Insert claims for patent 2 -INSERT OR REPLACE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240320708A1', 1, 'independent', 'A system for multi-turn conversations, comprising: a conversation manager configured to maintain context across multiple turns; and a response generator configured to generate responses based on the maintained context.', datetime('now'), datetime('now')); - --- Insert elements for patent 2 -INSERT OR REPLACE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240320708A1', 1, 'A', 'a conversation manager configured to maintain context across multiple turns', datetime('now'), datetime('now')), - ('US20240320708A1', 1, 'B', 'a response generator configured to generate responses based on the maintained context', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_patent_data" -type = "script" -command = "setup-db.sh execute patents-with-elements.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh claim-analyzing" - -[[checks]] -name = "claim_analyzing_invoked" -type = "script" -command = "check-skill-invoked.sh claim-analyzing" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "similarities_stored" -type = "script" -command = "check-db-query.sh {} {} '>=2' 'SELECT COUNT(DISTINCT patent_id) FROM similarities;'" diff --git a/agents/skill-bench/cases/claim-analyzing/triggering.toml b/agents/skill-bench/cases/claim-analyzing/triggering.toml deleted file mode 100644 index ecdfa47..0000000 --- a/agents/skill-bench/cases/claim-analyzing/triggering.toml +++ /dev/null @@ -1,20 +0,0 @@ -# Test Case: Claim Analyzing - Triggering - -name = "triggering" -description = "Verify claim-analyzing skill is triggered when asked to perform claim analysis" -timeout = 60 - -test_prompt = """ -I have a database of patents with claims and elements analyzed. -Now I need to analyze claim similarities by comparing product features against patent elements. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh claim-analyzing" - -[[checks]] -name = "claim_analyzing_invoked" -type = "script" -command = "check-skill-invoked.sh claim-analyzing" diff --git a/agents/skill-bench/cases/concept-interviewing/functional-no-spec.toml b/agents/skill-bench/cases/concept-interviewing/functional-no-spec.toml deleted file mode 100644 index 13f034b..0000000 --- a/agents/skill-bench/cases/concept-interviewing/functional-no-spec.toml +++ /dev/null @@ -1,19 +0,0 @@ -# Test Case: Concept Interview Functional (no existing specification) - -name = "functional-no-spec" -description = "Verify concept-interview uses question-responder when information is missing" -timeout = 180 # seconds - -test_prompt = """ -I want to start a patent search for a new voice recognition system for smart home devices with real-time transcription and noise-resistant recognition. - -Before asking me any questions, please use the question-responder skill to check if the required information is already available. Then proceed with the concept interview and assignee verification. -""" - -[answers] -"competitors" = ["Google", "Amazon"] -"target country" = "US" -"country" = "US" -"release date" = "2025-06-01" -"date" = "2025-06-01" -"target release date" = "2025-06-01" diff --git a/agents/skill-bench/cases/concept-interviewing/functional-with-spec.toml b/agents/skill-bench/cases/concept-interviewing/functional-with-spec.toml deleted file mode 100644 index 7af4684..0000000 --- a/agents/skill-bench/cases/concept-interviewing/functional-with-spec.toml +++ /dev/null @@ -1,57 +0,0 @@ -# Test Case: Concept Interview Functional (with existing specification) - -name = "functional-with-spec" -description = "Verify concept-interview verifies existing specification without re-interviewing" -timeout = 120 # seconds - -test_prompt = """ -Use concept-interview to verify our existing product specification is complete and ready for the targeting phase. Do not perform any additional assignee checks - just verify the existing specification. -""" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -## 1. Product Concept - -Voice recognition system for smart home devices - -## 2. Target Market - -- **Country**: US -- **Release Date**: 2025-06-01 -- **Cutoff Date**: 2005-06-01 - -## 3. Competitors - -- **Google LLC** -- **Amazon.com Inc.** - -## 4. Verified Assignee Names (Canonicalized) - -| Original Name | Verified Assignee Names | Status | Notes | -| ------------- | ------------------------------------------ | -------- | ------------------------ | -| Google | Google LLC, Google Inc., GOOGLE LLC | Verified | Multiple name variations | -| Amazon | Amazon.com Inc., Amazon Technologies, Inc. | Verified | Multiple name variations | -""" - -[[checks]] -name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh concept-interviewing" - -[[checks]] -name = "concept_interview_invoked" -type = "script" -command = "check-skill-invoked.sh concept-interviewing" - -[[checks]] -name = "patent_assignee_check_not_invoked" -type = "script" -command = "check-skill-invoked.sh patent-assignee-check --not" diff --git a/agents/skill-bench/cases/concept-interviewing/triggering.toml b/agents/skill-bench/cases/concept-interviewing/triggering.toml deleted file mode 100644 index 23d66d5..0000000 --- a/agents/skill-bench/cases/concept-interviewing/triggering.toml +++ /dev/null @@ -1,29 +0,0 @@ -# Test Case: Concept Interview - Triggering - -name = "triggering" -description = "Verify concept-interviewing skill is triggered when discussing patent search" -timeout = 60 - -test_prompt = """ -I want to start a patent search for a new voice recognition system in the US, releasing in 2025. Competitors are Google and Amazon. The system is for smart home devices with real-time transcription and noise-resistant recognition. Please proceed with the assignee verification and create the specification file. -""" - -[[checks]] -name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh concept-interviewing" - -[[checks]] -name = "concept_interviewing_invoked" -type = "script" -command = "check-skill-invoked.sh concept-interviewing" - -[[checks]] -name = "patent_assignee_check_invoked" -type = "script" -command = "check-skill-invoked.sh patent-assignee-check" diff --git a/agents/skill-bench/cases/evaluating/functional-analyze-patent.toml b/agents/skill-bench/cases/evaluating/functional-analyze-patent.toml deleted file mode 100644 index 92af628..0000000 --- a/agents/skill-bench/cases/evaluating/functional-analyze-patent.toml +++ /dev/null @@ -1,67 +0,0 @@ -# Test Case: Evaluating - Analyze Patent (Single) - -name = "functional-analyze-patent" -description = "Verify evaluating skill uses Agent tool to analyze a patent and store claims/elements in database" -timeout = 300 # seconds - -test_prompt = """ -I have a screened patent marked as relevant in the database. - -Please evaluate the patent: -1. Get the next relevant patent that has not been evaluated -2. Analyze the patent by decomposing claims into constituent elements -3. Store claims and elements in the database -4. Report the analysis results including patent ID, claim count, and element count -""" - -[[setup]] -path = "screened-patents.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to AI prompt optimization', 'Systems and methods for optimizing AI prompts through iterative refinement', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_screened_data" -type = "script" -command = "setup-db.sh execute screened-patents.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh evaluating" - -[[checks]] -name = "evaluating_invoked" -type = "script" -command = "check-skill-invoked.sh evaluating" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "claims_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM claims;'" - -[[checks]] -name = "elements_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM elements;'" diff --git a/agents/skill-bench/cases/evaluating/functional-parallel-analysis.toml b/agents/skill-bench/cases/evaluating/functional-parallel-analysis.toml deleted file mode 100644 index 840809c..0000000 --- a/agents/skill-bench/cases/evaluating/functional-parallel-analysis.toml +++ /dev/null @@ -1,68 +0,0 @@ -# Test Case: Evaluating - Parallel Analysis - -name = "functional-parallel-analysis" -description = "Verify evaluating skill can analyze multiple patents in parallel using teams" -timeout = 600 # seconds (10 minutes for parallel processing) - -test_prompt = """ -I have 2 screened patents marked as relevant in the database. - -Please evaluate both patents: -1. Analyze both patents by decomposing claims into constituent elements -2. Store claims and elements in the database -3. Report the analysis results including patent IDs, claim counts, and element counts -""" - -[[setup]] -path = "screened-patents.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'), - ('US20240346271A1', 'Conversational AI System', 'Tech Corp', 'US', '2024-10-03', '2024-05-01', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to AI prompt optimization', 'Systems and methods for optimizing AI prompts through iterative refinement', datetime('now'), datetime('now')), - ('US20240346271A1', 'relevant', 'Relates to conversational AI', 'A conversational AI system with multi-turn capabilities', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_screened_data" -type = "script" -command = "setup-db.sh execute screened-patents.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh evaluating" - -[[checks]] -name = "evaluating_invoked" -type = "script" -command = "check-skill-invoked.sh evaluating" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "claims_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM claims;'" - -[[checks]] -name = "elements_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM elements;'" diff --git a/agents/skill-bench/cases/evaluating/triggering.toml b/agents/skill-bench/cases/evaluating/triggering.toml deleted file mode 100644 index fdee671..0000000 --- a/agents/skill-bench/cases/evaluating/triggering.toml +++ /dev/null @@ -1,19 +0,0 @@ -# Test Case: Evaluating - Triggering - -name = "triggering" -description = "Verify evaluating skill is triggered when asked to evaluate patents" -timeout = 60 - -test_prompt = """ -I have a database of screened patents marked as relevant. Now I need to evaluate them by analyzing claims and elements. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh evaluating" - -[[checks]] -name = "evaluating_invoked" -type = "script" -command = "check-skill-invoked.sh evaluating" diff --git a/agents/skill-bench/cases/investigation-preparing/functional-import-csv.toml b/agents/skill-bench/cases/investigation-preparing/functional-import-csv.toml deleted file mode 100644 index adac3f6..0000000 --- a/agents/skill-bench/cases/investigation-preparing/functional-import-csv.toml +++ /dev/null @@ -1,32 +0,0 @@ -# Test Case: Investigation Preparing - Import CSV - -name = "functional-import-csv" -description = "Verify investigation-preparing can import CSV data into target_patents" -timeout = 120 # seconds - -test_prompt = "Import test-patents.csv" - -[[setup]] -path = "test-patents.csv" -content = """ -search URL:,https://patents.google.com/?q=llm -id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link -KR-102637029-B1,Device for Generating Multi-turn Chat Bot Data Using LLM,주식회사 마인즈앤컴퍼니,"고석태, 백영상",2023-10-11,2023-10-11,2024-02-15,2024-02-15,https://patents.google.com/patent/KR102637029B1/en, -US-2024292070-A1,Iterative ai prompt optimization,"Loop Now Technologies, Inc.","Wu-Hsi Li, Edwin Chiu",2023-02-24,2024-04-10,2024-08-29,,https://patents.google.com/patent/US20240292070A1/en,https://example.com/figure.png -US-2025200489-A1,Automatic quality assurance,"Forethought Technologies, Inc.","Sami Ghoche, Deon Nicholas",2022-02-28,2024-10-31,,,https://patents.google.com/patent/US20250200489A1/en, -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-preparing" - -[[checks]] -name = "investigating_database_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "csv_imported" -type = "script" -command = "check-db-query.sh {} {} '3' 'SELECT COUNT(*) FROM target_patents;'" diff --git a/agents/skill-bench/cases/investigation-preparing/functional-import-multiple-csvs.toml b/agents/skill-bench/cases/investigation-preparing/functional-import-multiple-csvs.toml deleted file mode 100644 index a5e059d..0000000 --- a/agents/skill-bench/cases/investigation-preparing/functional-import-multiple-csvs.toml +++ /dev/null @@ -1,47 +0,0 @@ -# Test Case: Investigation Preparing - Import Multiple CSV Files - -name = "functional-import-multiple-csvs" -description = "Verify investigation-preparing can import multiple CSV files with different formats" -timeout = 180 # seconds - -test_prompt = """ -Import all CSV files from the current directory into the patent database. -""" - -[[setup]] -path = "patents-simple.csv" -content = """ -id,family_id,title,abstract_text,publication_date,country -US-1234567-A,US-1234567,Example Patent 1,Example abstract text for patent 1,2023-01-15,US -US-7654321-A,US-7654321,Example Patent 2,Example abstract text for patent 2,2023-03-20,US -US-9999999-A,US-9999999,Example Patent 3,Example abstract text for patent 3,2023-06-10,US -""" - -[[setup]] -path = "patents-google-format.csv" -content = """ -search URL:,https://patents.google.com/?q=rag+systems -id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link -KR-102030405-B1,RAG System for Document Analysis,"삼성전자 주식회사","홍길동",2022-05-10,2022-05-10,2023-08-20,2023-08-20,https://patents.google.com/patent/KR102030405B1/en, -US-20240101234-A1,Information Retrieval Using Neural Networks,"Tech Corp Inc.","Jane Doe, John Smith",2022-03-15,2023-08-01,2024-01-15,,https://patents.google.com/patent/US20240101234A1/en, -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-preparing" - -[[checks]] -name = "investigating_database_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "database_created" -type = "script" -command = "check-workspace-file.sh patents.db" - -[[checks]] -name = "all_csvs_imported" -type = "script" -command = "check-db-query.sh {} {} '5' 'SELECT COUNT(*) FROM target_patents;'" diff --git a/agents/skill-bench/cases/investigation-preparing/functional-init-db.toml b/agents/skill-bench/cases/investigation-preparing/functional-init-db.toml deleted file mode 100644 index 7d7a2f7..0000000 --- a/agents/skill-bench/cases/investigation-preparing/functional-init-db.toml +++ /dev/null @@ -1,24 +0,0 @@ -# Test Case: Investigation Preparing - Initialize Database - -name = "functional-init-db" -description = "Verify investigation-preparing can initialize the patent database" -timeout = 60 # seconds - -test_prompt = """ -Initialize the patent investigation database. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-preparing" - -[[checks]] -name = "investigating_database_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "database_created" -type = "script" -command = "check-workspace-file.sh patents.db" diff --git a/agents/skill-bench/cases/investigation-preparing/functional-various-patent-id.toml b/agents/skill-bench/cases/investigation-preparing/functional-various-patent-id.toml deleted file mode 100644 index ef663e6..0000000 --- a/agents/skill-bench/cases/investigation-preparing/functional-various-patent-id.toml +++ /dev/null @@ -1,88 +0,0 @@ -# Test Case: Investigation Preparing - Various Patent ID Formats - -name = "functional-various-patent-id" -description = "Verify ETL correctly transforms various patent ID formats from real Google Patents data" -timeout = 60 - -test_prompt = """ -I have collected patents with various ID formats from Google Patents. -Please initialize the patent database and import test-patents.csv. -Verify that all patent IDs are correctly transformed following Google Patents format. -""" - -[[setup]] -path = "test-patents.csv" -content = """ -search URL:,https://patents.google.com/?q=test -id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link -US-2024292070-A1,Test Patent 1 (US 9 digits with omitted zero),"Test Corp","Inventor A",2023-02-24,2024-04-10,2024-08-29,,https://patents.google.com/patent/US20240292070A1/en, -US-2025200489-A1,Test Patent 2 (US 9 digits with omitted zero),"Test Corp","Inventor B",2022-02-28,2024-10-31,2025-06-19,,https://patents.google.com/patent/US20250200489A1/en, -KR-102637029-B1,Test Patent 3 (KR 9 digits),"Test Corp","Inventor C",2023-10-11,2023-10-11,2024-02-15,2024-02-15,https://patents.google.com/patent/KR102637029B1/en, -KR-102681147-B1,Test Patent 4 (KR 9 digits),"Test Corp","Inventor D",2023-10-31,2023-10-31,2024-07-04,2024-07-04,https://patents.google.com/patent/KR102681147B1/en, -WO-2025073197-A1,Test Patent 5 (WO 10 digits),"Test Corp","Inventor E",2023-10-04,2024-07-22,2025-04-10,,https://patents.google.com/patent/WO2025073197A1/en, -CA-3234744-A1,Test Patent 6 (CA 8 digits),"Test Corp","Inventor F",2023-04-10,2024-04-10,2025-05-01,,https://patents.google.com/patent/CA3234744A1/en, -JP-7753310-B2,Test Patent 7 (JP 7 digits),"Test Corp","Inventor G",2023-09-28,2023-09-28,2025-10-14,2025-10-14,https://patents.google.com/patent/JP7753310B2/en, -US-2025307897-A1,Test Patent 8 (US 9 digits with omitted zero),"Test Corp","Inventor H",2024-03-28,2024-03-28,2025-10-02,,https://patents.google.com/patent/US20250307897A1/en, -HK-40120585-A,Test Patent 9 (HK 7 digits, 1 char kind),"Test Corp","Inventor I",2023-10-04,2025-07-10,2025-08-22,,https://patents.google.com/patent/HK40120585A/en, -US20240292070A1,Test Patent 10 (US no hyphens correct),"Test Corp","Inventor J",2023-02-24,2024-04-10,2024-08-29,,https://patents.google.com/patent/US20240292070A1/en, -""" - -[[checks]] -name = "database_created" -type = "script" -command = "check-workspace-file.sh patents.db" - -[[checks]] -name = "patents_imported" -type = "script" -command = "check-db-query.sh {} {} '9' 'SELECT COUNT(*) FROM target_patents;'" - -[[checks]] -name = "patent_1_correct" -type = "script" -command = "check-db-query.sh {} {} 'US20240292070A1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 1%\";'" - -[[checks]] -name = "patent_2_correct" -type = "script" -command = "check-db-query.sh {} {} 'US20250200489A1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 2%\";'" - -[[checks]] -name = "patent_3_correct" -type = "script" -command = "check-db-query.sh {} {} 'KR102637029B1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 3%\";'" - -[[checks]] -name = "patent_4_correct" -type = "script" -command = "check-db-query.sh {} {} 'KR102681147B1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 4%\";'" - -[[checks]] -name = "patent_5_correct" -type = "script" -command = "check-db-query.sh {} {} 'WO2025073197A1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 5%\";'" - -[[checks]] -name = "patent_6_correct" -type = "script" -command = "check-db-query.sh {} {} 'CA3234744A1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 6%\";'" - -[[checks]] -name = "patent_7_correct" -type = "script" -command = "check-db-query.sh {} {} 'JP7753310B2' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 7%\";'" - -[[checks]] -name = "patent_8_correct" -type = "script" -command = "check-db-query.sh {} {} 'US20250307897A1' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 8%\";'" - -[[checks]] -name = "patent_9_correct" -type = "script" -command = "check-db-query.sh {} {} 'HK40120585A' 'SELECT patent_id FROM target_patents WHERE title LIKE \"Test Patent 9%\";'" - -[[checks]] -name = "all_ids_valid_length" -type = "script" -command = "check-db-query.sh {} {} '0' 'SELECT COUNT(*) FROM target_patents WHERE length(patent_id) < 7 OR length(patent_id) > 15;'" diff --git a/agents/skill-bench/cases/investigation-preparing/triggering.toml b/agents/skill-bench/cases/investigation-preparing/triggering.toml deleted file mode 100644 index 15b7dc7..0000000 --- a/agents/skill-bench/cases/investigation-preparing/triggering.toml +++ /dev/null @@ -1,24 +0,0 @@ -# Test Case: Investigation Preparing - Triggering - -name = "triggering" -description = "Verify investigation-preparing skill is triggered when appropriate" -timeout = 60 # seconds - -test_prompt = """ -I need to check the screening progress statistics. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-preparing" - -[[checks]] -name = "investigating_database_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "statistics_requested" -type = "script" -command = "check-text-contains.sh \"statistics\" \"progress\"" diff --git a/agents/skill-bench/cases/investigation-recording/functional-record-claims.toml b/agents/skill-bench/cases/investigation-recording/functional-record-claims.toml deleted file mode 100644 index 6644c8e..0000000 --- a/agents/skill-bench/cases/investigation-recording/functional-record-claims.toml +++ /dev/null @@ -1,62 +0,0 @@ -# Test Case: Investigation Recording - Record Claims - -name = "functional-record-claims" -description = "Verify investigation-recording can record patent claims" -timeout = 120 # seconds - -test_prompt = """ -Record claims for patent US1234567A: -- Claim 1 (independent): A method for processing natural language input using a transformer-based neural network. -- Claim 2 (dependent): The method of claim 1, wherein the transformer network comprises a BERT architecture. -- Claim 3 (dependent): The method of claim 1, further comprising a pre-processing step for token normalization. -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_test_data" -type = "script" -command = """ -sqlite3 patents.db < 0 THEN '1' ELSE '0' END FROM claims WHERE patent_id = 'US1234567A' AND claim_text LIKE '%transformer%';" """ - -[[checks]] -name = "batch_insert_used" -type = "script" -command = """check-log-contains.sh {} "INSERT INTO claims.*VALUES" """ diff --git a/agents/skill-bench/cases/investigation-recording/functional-record-elements.toml b/agents/skill-bench/cases/investigation-recording/functional-record-elements.toml deleted file mode 100644 index 0d73cac..0000000 --- a/agents/skill-bench/cases/investigation-recording/functional-record-elements.toml +++ /dev/null @@ -1,66 +0,0 @@ -# Test Case: Investigation Recording - Record Elements - -name = "functional-record-elements" -description = "Verify investigation-recording can record claim elements" -timeout = 120 # seconds - -test_prompt = """ -Record elements for Claim 1 of patent US1234567A: -- Element A: receiving natural language input from a user interface -- Element B: processing the input using a transformer-based neural network -- Element C: generating a contextualized representation of the input -- Element D: extracting semantic features from the contextualized representation -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_test_data" -type = "script" -command = """ -sqlite3 patents.db < 0 THEN '1' ELSE '0' END FROM elements WHERE patent_id = 'US1234567A' AND element_label = 'A';" """ - -[[checks]] -name = "batch_insert_used" -type = "script" -command = """check-log-contains.sh {} "INSERT INTO elements.*VALUES" """ diff --git a/agents/skill-bench/cases/investigation-recording/functional-record-screening.toml b/agents/skill-bench/cases/investigation-recording/functional-record-screening.toml deleted file mode 100644 index 2a0fbb1..0000000 --- a/agents/skill-bench/cases/investigation-recording/functional-record-screening.toml +++ /dev/null @@ -1,44 +0,0 @@ -# Test Case: Investigation Recording - Record Screening - -name = "functional-record-screening" -description = "Verify investigation-recording can record screening results" -timeout = 120 # seconds - -test_prompt = """ -Record screening result: patent US1234567A is relevant because it's core technology for LLM systems. -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_test_data" -type = "script" -command = """ -sqlite3 patents.db <= 1 THEN '1' ELSE '0' END FROM screened_patents WHERE patent_id = 'US20240292070A1';" """ - -[[checks]] -name = "correct_judgment" -type = "script" -command = """check-db-query.sh {} {} 'relevant' "SELECT judgment FROM screened_patents WHERE patent_id = 'US20240292070A1';" """ - -[[checks]] -name = "reason_provided" -type = "script" -command = """check-db-query.sh {} {} '1' "SELECT CASE WHEN reason IS NOT NULL AND reason != '' THEN '1' ELSE '0' END FROM screened_patents WHERE patent_id = 'US20240292070A1';" """ diff --git a/agents/skill-bench/cases/investigation-reporting/functional-complete-workflow.toml b/agents/skill-bench/cases/investigation-reporting/functional-complete-workflow.toml deleted file mode 100644 index e701858..0000000 --- a/agents/skill-bench/cases/investigation-reporting/functional-complete-workflow.toml +++ /dev/null @@ -1,286 +0,0 @@ -# Test Case: Investigation Reporting - Complete Workflow - -name = "functional-complete-workflow" -description = "Verify investigation-reporting skill handles complete workflow with claim analysis and prior art" -timeout = 120 - -test_prompt = """ -What is the current progress of the patent investigation? -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -**Product/Technology**: -Solar-powered auto-cleaning cat litter box with IoT notifications. - -**Key Technical Features**: - -1. A solar panel integrated into the top hood that charges an internal battery. -2. A rotating internal drum that separates solid waste into a sealed compartment. -3. An IoT module (Wi-Fi) that sends push notifications to a smartphone when the waste compartment is full. - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2015-01-01 - -**Competitors**: - -- Litter-Robot (AutoPets, LLC) -- CatGenie (PetNovations, Ltd.) -""" - -[[setup]] -path = "targeting.md" -content = """ -# Targeting Summary - -## Competitor Patent Research - -| Query | Hit Count | Keywords | -|-------|-----------|----------| -| assignee: ["AutoPets, LLC"] country: "US" | 156 | Litter Robot search | -| assignee: ["PetNovations, Ltd."] country: "US" | 89 | CatGenie search | - -## Market Patent Research - -| Query | Hit Count | Keywords | -|-------|-----------|----------| -| query: "\"cat litter\" AND \"automatic\" AND \"cleaning\"" | 234 | Initial search | -| query: "\"cat litter\" AND \"automatic\" AND \"cleaning\" AND \"solar\"" | 12 | Added solar keyword | - -## Final Search Commands - -Total patents collected: 3 -""" - -[[setup]] -path = "keywords.md" -content = """ -# Golden Keywords Registry - -## Product Concept Keywords - -| Keyword | Source | Verified | -|---------|--------|----------| -| "cat litter" | Manual | Yes | -| "automatic" | Manual | Yes | -| "cleaning" | Manual | Yes | -| "solar" | Manual | Yes | -| "rotating drum" | Manual | Yes | -| "IoT" | Manual | Yes | -""" - -[[setup]] -path = "populate-complete-data.sql" -content = """ --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US9876543B2', 'Automatic Pet Litter Box with Rotating Drum', 'AutoPets, LLC', 'US', '2018-05-15', '2016-11-10', '2018-05-15', '{"source": "test"}'), - ('US20230123456A1', 'Solar-Powered Pet Waste Disposal System', 'EcoPet Solutions', 'US', '2023-08-20', '2022-02-14', NULL, '{"source": "test"}'), - ('US20240056789A1', 'IoT-Connected Animal Hygiene Device', 'SmartPet Inc.', 'US', '2024-03-01', '2023-09-01', NULL, '{"source": "test"}'); - --- Insert screened patents (all 3) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text) -VALUES - ('US9876543B2', 'relevant', 'Rotating drum mechanism with IoT notifications present', 'Automatic pet litter box with rotating drum for waste separation and Wi-Fi notifications.'), - ('US20230123456A1', 'relevant', 'Solar charging and IoT notifications present', 'Solar-powered pet waste disposal system with mobile app notifications.'), - ('US20240056789A1', 'relevant', 'IoT connectivity present but other key features missing', 'Animal hygiene device with IoT connectivity for remote monitoring.'); -""" - -[[setup]] -name = "import_complete_data" -type = "script" -command = "setup-db.sh execute populate-complete-data.sql" - -[[setup]] -path = "investigation/US9876543B2/evaluation.md" -content = """ -# Evaluation: US9876543B2 - -## Similarity Assessment - -**Overall Similarity**: Significant - -## Elements - -| Element | Disclosure | Verdict | -|---------|------------|---------| -| A. Solar panel charging | Not disclosed | Absent | -| B. Rotating drum separation | Claims 1-5 disclose rotating drum with waste separation | Present | -| C. IoT notifications | Claims 6-8 describe Wi-Fi notifications | Present | - -## Conclusion - -Significant overlap on key features (B, C). Missing solar (A). -""" - -[[setup]] -path = "investigation/US9876543B2/claim-analysis.md" -content = """ -# Claim Analysis: US9876543B2 - -## Claim 1 (Independent) - -**Elements**: -1. A pet litter box apparatus -2. A rotating drum mechanism -3. A waste collection compartment -4. A motor configured to rotate the drum - -**Analysis**: -- Element 1: Present - Apparatus for pet waste management -- Element 2: Present - Rotating drum with sifting slots -- Element 3: Present - Sealed waste compartment below drum -- Element 4: Present - Electric motor with timer control - -**Verdict**: All elements present. High risk of anticipation. -""" - -[[setup]] -path = "investigation/US9876543B2/prior-art.md" -content = """ -# Prior Art Research: US9876543B2 - -## Query Strategy - -Search terms: "rotating drum" AND "cat litter" AND "automatic" - -## Results - -| Patent ID | Title | Relevance | Notes | -|-----------|-------|-----------|-------| -| US20150012345A1 | Self-cleaning litter box | Alternative | Different drum design | -| EP2345678A1 | Pet waste collection device | Aligned | Similar mechanism | - -## Conclusion - -Prior art found confirms drum separation mechanism is well-known. Consider design-around options. -""" - -[[setup]] -path = "investigation/US20230123456A1/evaluation.md" -content = """ -# Evaluation: US20230123456A1 - -## Similarity Assessment - -**Overall Similarity**: Significant - -## Elements - -| Element | Disclosure | Verdict | -|---------|------------|---------| -| A. Solar panel charging | Claims 1-3 disclose solar hood with battery | Present | -| B. Rotating drum separation | Not disclosed | Absent | -| C. IoT notifications | Claims 4-5 describe mobile app notifications | Present | - -## Conclusion - -Significant overlap on solar (A) and IoT (C). Different waste handling mechanism. -""" - -[[setup]] -path = "investigation/US20230123456A1/claim-analysis.md" -content = """ -# Claim Analysis: US20230123456A1 - -## Claim 1 (Independent) - -**Elements**: -1. A solar-powered pet waste disposal system -2. A solar panel mounted on a housing -3. A rechargeable battery connected to the solar panel -4. A waste collection mechanism -5. A wireless communication module - -**Analysis**: -- Element 1: Present - Solar-powered system for pet waste -- Element 2: Present - Top-mounted solar panel hood -- Element 3: Present - Li-ion battery with charge controller -- Element 4: Present - Raking mechanism (not rotating drum) -- Element 5: Present - Wi-Fi module for smartphone connection - -**Verdict**: Solar and IoT elements present. Different waste collection mechanism. -""" - -[[setup]] -path = "investigation/US20230123456A1/prior-art.md" -content = """ -# Prior Art Research: US20230123456A1 - -## Query Strategy - -Search terms: "solar" AND "pet litter" AND "automatic" - -## Results - -| Patent ID | Title | Relevance | Notes | -|-----------|-------|-----------|-------| -| US20190054321A1 | Solar animal waste device | Relevant | Earlier filing date | -| CN108765432A | Pet box with power generation | Alternative | Different power source | - -## Conclusion - -Some prior art on solar-powered pet devices found. No exact combination found. -""" - -[[setup]] -path = "investigation/US20240056789A1/evaluation.md" -content = """ -# Evaluation: US20240056789A1 - -## Similarity Assessment - -**Overall Similarity**: Limited - -## Elements - -| Element | Disclosure | Verdict | -|---------|------------|---------| -| A. Solar panel charging | Not disclosed | Absent | -| B. Rotating drum separation | Not disclosed | Absent | -| C. IoT notifications | Claims 1-3 disclose IoT connectivity | Present | - -## Conclusion - -Limited similarity. Only IoT notification feature overlaps. Low risk. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-reporting" - -[[checks]] -name = "investigation_reporting_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-reporting" - -[[checks]] -name = "investigation_preparing_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "progress_md_created" -type = "script" -command = "check-workspace-file.sh PROGRESS.md" - -[[checks]] -name = "report_contains_investigation_table" -type = "script" -command = "check-file-content.sh PROGRESS.md Investigation Progress" - -[[checks]] -name = "report_excludes_limited_similarity" -type = "script" -command = "check-file-not-contains.sh PROGRESS.md US20240056789A1" diff --git a/agents/skill-bench/cases/investigation-reporting/functional-generate-report.toml b/agents/skill-bench/cases/investigation-reporting/functional-generate-report.toml deleted file mode 100644 index e8f53f9..0000000 --- a/agents/skill-bench/cases/investigation-reporting/functional-generate-report.toml +++ /dev/null @@ -1,165 +0,0 @@ -# Test Case: Investigation Reporting - Generate Progress Report - -name = "functional-generate-report" -description = "Verify investigation-reporting skill generates PROGRESS.md with correct statistics" -timeout = 120 - -test_prompt = """ -Give me a summary of the current patent investigation progress. -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -**Product/Technology**: -LLM-based multi-turn chatbot system with RAG (Retrieval-Augmented Generation) capabilities. - -**Key Technical Features**: - -1. LLM-driven multi-turn conversation management -2. Vector database integration for retrieval-augmented generation -3. Automatic quality assurance for information retrieval and intent detection - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2020-01-01 -""" - -[[setup]] -path = "targeting.md" -content = """ -# Targeting Summary - -## Competitor Patent Research - -| Query | Hit Count | Keywords | -|-------|-----------|----------| -| assignee: ["Google LLC"] country: "US" | 2450 | Broad search | -| assignee: ["Google LLC"] country: "US" query: "\"chatbot\" AND \"RAG\"" | 45 | Added keywords | - -## Market Patent Research - -| Query | Hit Count | Keywords | -|-------|-----------|----------| -| query: "\"chatbot\" AND \"RAG\" AND \"LLM\"" | 128 | Initial search | - -## Final Search Commands - -Total patents collected: 5 -""" - -[[setup]] -path = "keywords.md" -content = """ -# Golden Keywords Registry - -## Product Concept Keywords - -| Keyword | Source | Verified | -|---------|--------|----------| -| "chatbot" | Manual | Yes | -| "RAG" | Manual | Yes | -| "LLM" | Manual | Yes | -| "multi-turn" | Manual | Yes | -""" - -[[setup]] -path = "populate-test-data.sql" -content = """ --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'), - ('KR102637029B1', 'Device for Generating Multi-turn Chat Bot Data Using LLM', '주식회사 마인즈앤컴퍼니', 'KR', '2024-02-15', '2023-10-11', '2024-02-15', '{"source": "test"}'), - ('US20240123456A1', 'RAG-based Question Answering System', 'Tech Corp Inc.', 'US', '2024-05-20', '2023-11-15', NULL, '{"source": "test"}'), - ('JP2024000555A', 'Expired Patent about Old Chatbot', 'Old Company', 'JP', '2020-01-01', '2019-01-01', NULL, '{"source": "test"}'), - ('CN123456789A', 'Unrelated Patent About Refrigerator', 'Appliance Co', 'CN', '2024-03-10', '2023-09-05', NULL, '{"source": "test"}'); - --- Insert screened patents (3 out of 5) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text) -VALUES - ('US20240292070A1', 'relevant', 'Multi-turn LLM chatbot system with iterative prompt optimization', 'Iterative AI prompt optimization system for multi-turn conversations using large language models.'), - ('KR102637029B1', 'relevant', 'Multi-turn chatbot data generation using LLM', 'Device for generating training data for multi-turn chatbot systems using large language models.'), - ('JP2024000555A', 'expired', 'Priority date before cutoff (2019-01-01 < 2020-01-01)', 'Expired patent about old chatbot technology with priority date before cutoff.'); -""" - -[[setup]] -name = "import_test_data" -type = "script" -command = "setup-db.sh execute populate-test-data.sql" - -[[setup]] -path = "investigation/US20240292070A1/evaluation.md" -content = """ -# Evaluation: US20240292070A1 - -## Similarity Assessment - -**Overall Similarity**: Significant - -## Elements - -| Element | Disclosure | Verdict | -|---------|------------|---------| -| A. LLM-driven conversation | Claims 1-3 disclose multi-turn LLM processing | Present | -| B. Vector database integration | Claims 4-5 describe retrieval system | Present | -| C. Quality assurance | Claim 6 describes automatic validation | Present | - -## Conclusion - -Significant overlap with target invention. Recommend claim analysis. -""" - -[[setup]] -path = "investigation/KR102637029B1/evaluation.md" -content = """ -# Evaluation: KR102637029B1 - -## Similarity Assessment - -**Overall Similarity**: Moderate - -## Elements - -| Element | Disclosure | Verdict | -|---------|------------|---------| -| A. LLM-driven conversation | Description mentions chatbot but limited detail | Partial | -| B. Vector database integration | Not disclosed | Absent | -| C. Quality assurance | Claims mention filtering | Partial | - -## Conclusion - -Moderate similarity. Missing key elements. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-reporting" - -[[checks]] -name = "investigation_reporting_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-reporting" - -[[checks]] -name = "investigation_preparing_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-preparing" - -[[checks]] -name = "progress_md_created" -type = "script" -command = "check-workspace-file.sh PROGRESS.md" - -[[checks]] -name = "screening_stats_retrieved" -type = "script" -command = "check-db-query.sh {} {} '>=3' 'SELECT COUNT(*) FROM screened_patents;'" diff --git a/agents/skill-bench/cases/investigation-reporting/triggering.toml b/agents/skill-bench/cases/investigation-reporting/triggering.toml deleted file mode 100644 index 9ed4391..0000000 --- a/agents/skill-bench/cases/investigation-reporting/triggering.toml +++ /dev/null @@ -1,19 +0,0 @@ -# Test Case: Investigation Reporting - Triggering - -name = "triggering" -description = "Verify investigation-reporting skill is triggered when asked for progress" -timeout = 60 - -test_prompt = """ -What is the current progress of the patent investigation? -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh investigation-reporting" - -[[checks]] -name = "investigation_reporting_invoked" -type = "script" -command = "check-skill-invoked.sh investigation-reporting" diff --git a/agents/skill-bench/cases/legal-checking/functional-file-review.toml b/agents/skill-bench/cases/legal-checking/functional-file-review.toml deleted file mode 100644 index f0b4936..0000000 --- a/agents/skill-bench/cases/legal-checking/functional-file-review.toml +++ /dev/null @@ -1,53 +0,0 @@ -# Test Case: Legal Checking - File Review - -name = "functional-file-review" -description = "Verify legal-checking reviews a file and identifies violations" -timeout = 90 # seconds - -test_prompt = """ -Review the following file for legal compliance violations: - -test-claim-analysis.md -""" - -[[setup]] -path = "test-claim-analysis.md" -content = """ -# Claim Analysis: US9876543B2 - -## Element A: Wireless Communication Module - -The reference **clearly discloses** a wireless communication module in Column 3. This element is **satisfied** by the reference. - -## Element B: Neural Network Layers - -The reference **does not satisfy** this requirement because it only has 2 layers. Therefore, Claim 1 **is not anticipated** by the reference. - -## Element C: Data Transmission - -The alternative implementation using optical fibers **is equivalent** to the copper wires in the reference and **would be obvious** to one skilled in the art. - -## Conclusion - -The product **does not infringe** Claim 1 because it uses a different algorithm. There is **no risk** of infringement. -""" - -[[checks]] -name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh legal-checking" - -[[checks]] -name = "legal_checking_invoked" -type = "script" -command = "check-skill-invoked.sh legal-checking" - -[[checks]] -name = "test_file_read" -type = "script" -command = "check-tool-use.sh Read file_path \"test-claim-analysis.md\"" diff --git a/agents/skill-bench/cases/legal-checking/functional.toml b/agents/skill-bench/cases/legal-checking/functional.toml deleted file mode 100644 index aa4ec3d..0000000 --- a/agents/skill-bench/cases/legal-checking/functional.toml +++ /dev/null @@ -1,31 +0,0 @@ -# Test Case: Legal Checking Functional - -name = "functional" -description = "Verify legal-checking automatically triggers on legal compliance keywords" -timeout = 60 # seconds - -test_prompt = """ -Review this patent analysis for legal compliance violations: - -The claim **does not infringe** the reference because it **clearly discloses** all elements. -""" - -[[checks]] -name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh legal-checking" - -[[checks]] -name = "legal_checking_invoked" -type = "script" -command = "check-skill-invoked.sh legal-checking" - -[[checks]] -name = "violations_detected" -type = "script" -command = "check-text-contains.sh \"does not infringe\" \"clearly discloses\"" diff --git a/agents/skill-bench/cases/legal-checking/triggering.toml b/agents/skill-bench/cases/legal-checking/triggering.toml deleted file mode 100644 index 0d05540..0000000 --- a/agents/skill-bench/cases/legal-checking/triggering.toml +++ /dev/null @@ -1,24 +0,0 @@ -# Test Case: Legal Checking - Triggering - -name = "triggering" -description = "Verify legal-checking skill is triggered when asked about legal compliance" -timeout = 60 - -test_prompt = """ -Load the legal-checking skill to understand the legal compliance guidelines. -""" - -[[checks]] -name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh legal-checking" - -[[checks]] -name = "legal_checking_invoked" -type = "script" -command = "check-skill-invoked.sh legal-checking" diff --git a/agents/skill-bench/cases/prior-art-researching/functional-parallel-search.toml b/agents/skill-bench/cases/prior-art-researching/functional-parallel-search.toml deleted file mode 100644 index 0ffa048..0000000 --- a/agents/skill-bench/cases/prior-art-researching/functional-parallel-search.toml +++ /dev/null @@ -1,103 +0,0 @@ -# Test Case: Prior Art Researching - Parallel Search - -name = "functional-parallel-search" -description = "Verify prior-art-researching skill can process multiple patents in parallel using teams" -timeout = 600 - -test_prompt = """ -I have 2 patents with similarities analyzed in the database. - -Please search for prior art: -1. Get the patents with Moderate/Significant similarities but no prior art results -2. Search for prior art references (both patent and non-patent literature) for each patent -3. Analyze relevance of each reference -4. Store prior art results in the database -5. Report the search results including patent IDs, reference counts, and relevance levels -""" - -[[setup]] -path = "patents-with-similarities.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'), - ('US20240320708A1', 'Multi-turn conversation system', 'Tech Corp', 'US', '2024-09-12', '2024-03-15', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to AI prompt optimization', 'Systems and methods for optimizing AI prompts through iterative refinement', datetime('now'), datetime('now')), - ('US20240320708A1', 'relevant', 'Relates to multi-turn conversations', 'A system for managing multi-turn conversations with context tracking', datetime('now'), datetime('now')); - --- Insert claims for patent 1 -INSERT OR REPLACE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'independent', 'A method for optimizing AI prompts, comprising: generating an initial prompt; receiving output from an AI model; analyzing the output for quality metrics; and iteratively refining the prompt based on the quality metrics.', datetime('now'), datetime('now')); - --- Insert elements for patent 1 -INSERT OR REPLACE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'generating an initial prompt', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'receiving output from an AI model', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'C', 'analyzing the output for quality metrics', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'D', 'iteratively refining the prompt based on the quality metrics', datetime('now'), datetime('now')); - --- Insert similarities for patent 1 (all Moderate/Significant) -INSERT OR REPLACE INTO similarities (patent_id, claim_number, element_label, similarity_level, analysis_notes, analyzed_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'Significant', 'Product has similar prompt generation feature', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'Moderate', 'Product has AI output processing but different implementation', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'C', 'Significant', 'Product has quality metrics analysis', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'D', 'Moderate', 'Product has iterative refinement but different approach', datetime('now'), datetime('now')); - --- Insert claims for patent 2 -INSERT OR REPLACE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240320708A1', 1, 'independent', 'A system for multi-turn conversations, comprising: a conversation manager configured to maintain context across multiple turns; and a response generator configured to generate responses based on the maintained context.', datetime('now'), datetime('now')); - --- Insert elements for patent 2 -INSERT OR REPLACE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240320708A1', 1, 'A', 'a conversation manager configured to maintain context across multiple turns', datetime('now'), datetime('now')), - ('US20240320708A1', 1, 'B', 'a response generator configured to generate responses based on the maintained context', datetime('now'), datetime('now')); - --- Insert similarities for patent 2 (all Moderate/Significant) -INSERT OR REPLACE INTO similarities (patent_id, claim_number, element_label, similarity_level, analysis_notes, analyzed_at, updated_at) -VALUES - ('US20240320708A1', 1, 'A', 'Significant', 'Product has similar conversation manager', datetime('now'), datetime('now')), - ('US20240320708A1', 1, 'B', 'Significant', 'Product has response generator with context tracking', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_patent_data" -type = "script" -command = "setup-db.sh execute patents-with-similarities.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh prior-art-researching" - -[[checks]] -name = "prior_art_researching_invoked" -type = "script" -command = "check-skill-invoked.sh prior-art-researching" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "prior_arts_stored" -type = "script" -command = "check-db-query.sh {} {} '>=2' 'SELECT COUNT(DISTINCT patent_id) FROM prior_art_elements;'" diff --git a/agents/skill-bench/cases/prior-art-researching/functional-search-prior-art.toml b/agents/skill-bench/cases/prior-art-researching/functional-search-prior-art.toml deleted file mode 100644 index 26c3c60..0000000 --- a/agents/skill-bench/cases/prior-art-researching/functional-search-prior-art.toml +++ /dev/null @@ -1,84 +0,0 @@ -# Test Case: Prior Art Researching - Search Prior Art (Single) - -name = "functional-search-prior-art" -description = "Verify prior-art-researching skill uses Agent tool to search prior art and store results in database" -timeout = 600 - -test_prompt = """ -I have a patent with similarities analyzed in the database. - -Please search for prior art: -1. Get the next patent that has Moderate/Significant similarities but no prior art results -2. Search for prior art references (both patent and non-patent literature) -3. Analyze relevance of each reference -4. Store prior art results in the database -5. Report the search results including patent ID, reference count, and relevance levels -""" - -[[setup]] -path = "patent-with-similarities.sql" -content = """ --- Initialize database -PRAGMA foreign_keys = ON; - --- Insert target patents -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'); - --- Insert screened patents (marked as relevant) -INSERT OR IGNORE INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at, updated_at) -VALUES - ('US20240292070A1', 'relevant', 'Relates to AI prompt optimization', 'Systems and methods for optimizing AI prompts through iterative refinement', datetime('now'), datetime('now')); - --- Insert claims -INSERT OR REPLACE INTO claims (patent_id, claim_number, claim_type, claim_text, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'independent', 'A method for optimizing AI prompts, comprising: generating an initial prompt; receiving output from an AI model; analyzing the output for quality metrics; and iteratively refining the prompt based on the quality metrics.', datetime('now'), datetime('now')); - --- Insert elements -INSERT OR REPLACE INTO elements (patent_id, claim_number, element_label, element_description, created_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'generating an initial prompt', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'receiving output from an AI model', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'C', 'analyzing the output for quality metrics', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'D', 'iteratively refining the prompt based on the quality metrics', datetime('now'), datetime('now')); - --- Insert similarities (all Moderate/Significant - no Limited) -INSERT OR REPLACE INTO similarities (patent_id, claim_number, element_label, similarity_level, analysis_notes, analyzed_at, updated_at) -VALUES - ('US20240292070A1', 1, 'A', 'Significant', 'Product has similar prompt generation feature', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'B', 'Moderate', 'Product has AI output processing but different implementation', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'C', 'Significant', 'Product has quality metrics analysis', datetime('now'), datetime('now')), - ('US20240292070A1', 1, 'D', 'Moderate', 'Product has iterative refinement but different approach', datetime('now'), datetime('now')); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "load_patent_data" -type = "script" -command = "setup-db.sh execute patent-with-similarities.sql" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh prior-art-researching" - -[[checks]] -name = "prior_art_researching_invoked" -type = "script" -command = "check-skill-invoked.sh prior-art-researching" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "prior_arts_stored" -type = "script" -command = "check-db-query.sh {} {} '>0' 'SELECT COUNT(*) FROM prior_arts;'" diff --git a/agents/skill-bench/cases/prior-art-researching/triggering.toml b/agents/skill-bench/cases/prior-art-researching/triggering.toml deleted file mode 100644 index c1996a8..0000000 --- a/agents/skill-bench/cases/prior-art-researching/triggering.toml +++ /dev/null @@ -1,20 +0,0 @@ -# Test Case: Prior Art Researching - Triggering - -name = "triggering" -description = "Verify prior-art-researching skill is triggered when asked to perform prior art search" -timeout = 60 - -test_prompt = """ -I have collected patents with similarities analyzed in the database. -Now I need to search for prior art for patents with Moderate/Significant similarities. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh prior-art-researching" - -[[checks]] -name = "prior_art_researching_invoked" -type = "script" -command = "check-skill-invoked.sh prior-art-researching" diff --git a/agents/skill-bench/cases/screening/functional-parallel-screening.toml b/agents/skill-bench/cases/screening/functional-parallel-screening.toml deleted file mode 100644 index fabb765..0000000 --- a/agents/skill-bench/cases/screening/functional-parallel-screening.toml +++ /dev/null @@ -1,84 +0,0 @@ -# Test Case: Screening - Parallel Processing - -name = "functional-parallel-screening" -description = "Verify screening skill can process patents in parallel using teams" -timeout = 300 # seconds - -test_prompt = """ -I have collected 2 patents from Google Patents and saved them to test-patents.csv. - -The patent database has already been initialized and the CSV has been imported. - -Please: -1. Screen all patents in the database to determine relevance for LLM-based multi-turn chatbot systems with RAG capabilities -2. Report the screening results including patent IDs, titles, and relevance judgments - -The product specification is in specification.md. -""" - -[[setup]] -path = "import-csv.sql" -content = """ -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('KR102637029B1', 'Device for Generating Multi-turn Chat Bot Data Using LLM', '주식회사 마인즈앤컴퍼니', 'KR', '2024-02-15', '2023-10-11', '2024-02-15', '{"source": "test"}'), - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "import_csv_data" -type = "script" -command = "setup-db.sh execute import-csv.sql" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -**Product/Technology**: -LLM-based multi-turn chatbot system with RAG (Retrieval-Augmented Generation) capabilities. - -**Key Technical Features**: - -1. LLM-driven multi-turn conversation management -2. Vector database integration for retrieval-augmented generation -3. Automatic quality assurance for information retrieval and intent detection - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2020-01-01 -""" - -[[setup]] -path = "test-patents.csv" -content = """ -search URL:,https://patents.google.com/?q=llm -id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link -KR-102637029-B1,Device for Generating Multi-turn Chat Bot Data Using LLM,주식회사 마인즈앤컴퍼니,"고석태, 백영상",2023-10-11,2023-10-11,2024-02-15,2024-02-15,https://patents.google.com/patent/KR102637029B1/en, -US-2024292070-A1,Iterative ai prompt optimization,"Loop Now Technologies, Inc.","Wu-Hsi Li, Edwin Chiu",2023-02-24,2024-04-10,2024-08-29,,https://patents.google.com/patent/US20240292070A1/en, -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh screening" - -[[checks]] -name = "screening_invoked" -type = "script" -command = "check-skill-invoked.sh screening" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "screened_stored" -type = "script" -command = "check-db-query.sh {} {} '>=2' 'SELECT COUNT(*) FROM screened_patents;'" diff --git a/agents/skill-bench/cases/screening/functional-resume-screening.toml b/agents/skill-bench/cases/screening/functional-resume-screening.toml deleted file mode 100644 index a7dd3b9..0000000 --- a/agents/skill-bench/cases/screening/functional-resume-screening.toml +++ /dev/null @@ -1,112 +0,0 @@ -# Test Case: Screening - Resume Capability - -name = "functional-resume-screening" -description = "Verify screening skill can resume from partial screening without duplicates" -timeout = 300 # seconds - -test_prompt = """ -I have already imported 3 patents into the database and partially screened them. - -Patent KR-102637029-B1 has already been screened and marked as 'relevant'. - -Please: -1. Resume screening for the remaining unscreened patents -2. Use parallel processing to speed up the work -3. Ensure no duplicate screening entries are created -4. Report the final screening status - -The product specification is in specification.md. -Target domain: LLM-based multi-turn chatbot systems with RAG capabilities. -""" - -[[setup]] -path = "import-csv.sql" -content = """ -INSERT OR IGNORE INTO target_patents (patent_id, title, assignee, country, publication_date, filing_date, grant_date, extra_fields) -VALUES - ('KR102637029B1', 'Device for Generating Multi-turn Chat Bot Data Using LLM', '주식회사 마인즈앤컴퍼니', 'KR', '2024-02-15', '2023-10-11', '2024-02-15', '{"source": "test"}'), - ('US20240292070A1', 'Iterative AI prompt optimization', 'Loop Now Technologies, Inc.', 'US', '2024-08-29', '2024-04-10', NULL, '{"source": "test"}'), - ('CN123456789A1', 'Medical Image Analysis System', 'Beijing Medical Tech', 'CN', '2024-01-20', '2023-05-10', NULL, '{"source": "test"}'); -""" - -[[setup]] -path = "pre-screened-data.sql" -content = """ -INSERT INTO screened_patents (patent_id, judgment, reason, abstract_text, screened_at) -VALUES ( - 'KR102637029B1', - 'relevant', - 'Directly relates to LLM-based multi-turn chatbot data generation', - 'Device for Generating Multi-turn Chat Bot Data Using LLM', - datetime('now') -); -""" - -[[setup]] -name = "init_db" -type = "script" -command = "setup-db.sh init" - -[[setup]] -name = "import_csv_data" -type = "script" -command = "setup-db.sh execute import-csv.sql" - -[[setup]] -name = "load_pre_screened_data" -type = "script" -command = "setup-db.sh execute pre-screened-data.sql" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -**Product/Technology**: -LLM-based multi-turn chatbot system with RAG (Retrieval-Augmented Generation) capabilities. - -**Key Technical Features**: - -1. LLM-driven multi-turn conversation management -2. Vector database integration for retrieval-augmented generation -3. Automatic quality assurance for information retrieval and intent detection - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2020-01-01 -""" - -[[setup]] -path = "test-patents.csv" -content = """ -search URL:,https://patents.google.com/?q=llm -id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link -KR-102637029-B1,Device for Generating Multi-turn Chat Bot Data Using LLM,주식회사 마인즈앤컴퍼니,"고석태, 백영상",2023-10-11,2023-10-11,2024-02-15,2024-02-15,https://patents.google.com/patent/KR102637029B1/en, -US-2024292070-A1,Iterative ai prompt optimization,"Loop Now Technologies, Inc.","Wu-Hsi Li, Edwin Chiu",2023-02-24,2024-04-10,2024-08-29,,https://patents.google.com/patent/US20240292070A1/en, -CN-123456789-A1,Medical Image Analysis System,"Beijing Medical Tech","Zhang San, Li Si",2023-05-10,2023-05-10,2024-01-20,,https://patents.google.com/patent/CN123456789A1/en, -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh screening" - -[[checks]] -name = "screening_invoked" -type = "script" -command = "check-skill-invoked.sh screening" - -[[checks]] -name = "agent_used" -type = "script" -command = "check-tool-use.sh Agent" - -[[checks]] -name = "screened_stored" -type = "script" -command = "check-db-query.sh {} {} '>=3' 'SELECT COUNT(*) FROM screened_patents;'" - -[[checks]] -name = "no_duplicates" -type = "script" -command = "check-db-query.sh {} {} '=1' 'SELECT COUNT(*) FROM screened_patents WHERE patent_id = \"KR102637029B1\";'" diff --git a/agents/skill-bench/cases/screening/triggering.toml b/agents/skill-bench/cases/screening/triggering.toml deleted file mode 100644 index 553f74f..0000000 --- a/agents/skill-bench/cases/screening/triggering.toml +++ /dev/null @@ -1,19 +0,0 @@ -# Test Case: Screening - Triggering - -name = "triggering" -description = "Verify screening skill is triggered when asked to screen patents" -timeout = 60 - -test_prompt = """ -I have a database of collected patents. Now I need to screen them to remove noise and filter by relevance. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh screening" - -[[checks]] -name = "screening_invoked" -type = "script" -command = "check-skill-invoked.sh screening" diff --git a/agents/skill-bench/cases/targeting/functional-with-data.toml b/agents/skill-bench/cases/targeting/functional-with-data.toml deleted file mode 100644 index ffd1fce..0000000 --- a/agents/skill-bench/cases/targeting/functional-with-data.toml +++ /dev/null @@ -1,71 +0,0 @@ -# Test Case: Targeting Functional (with CSV data) - -name = "functional-with-data" -description = "Verify targeting process with pre-downloaded CSV data" -timeout = 600 # seconds - -test_prompt = """ -I have placed downloaded CSV files in `patents.csv`. -Please create a target population based on the specification.md. -""" - -[[setup]] -path = "specification.md" -content = """ -# Product Specification - -**Product/Technology**: -LLM-based multi-turn chatbot system with RAG (Retrieval-Augmented Generation) capabilities. - -**Background**: -Current chatbots struggle with context awareness and factual accuracy in multi-turn conversations. This system combines LLM with vector database retrieval to provide accurate, context-aware responses. - -**Key Technical Features**: - -1. LLM-driven multi-turn conversation management -2. Vector database integration for retrieval-augmented generation -3. Automatic quality assurance for information retrieval and intent detection -4. Iterative AI prompt optimization for various applications (video generation, etc.) - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2020-01-01 - -**Competitors**: - -- Google -- Microsoft -- OpenAI - -**Target Market**: -US and Korea markets, focusing on enterprise customer service and conversational AI applications. -""" - -[[setup]] -path = "patents.csv" -content = """ -id,family_id,title,abstract_text,publication_date,country -US-1234567-A,US-1234567,Example Patent 1,Example abstract text for patent 1,2023-01-15,US -US-7654321-A,US-7654321,Example Patent 2,Example abstract text for patent 2,2023-03-20,US -US-9999999-A,US-9999999,Example Patent 3,Example abstract text for patent 3,2023-06-10,US -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh targeting" - -[[checks]] -name = "targeting_invoked" -type = "script" -command = "check-skill-invoked.sh targeting" - -[[checks]] -name = "database_created" -type = "script" -command = "check-workspace-file.sh patents.db" - -[[checks]] -name = "csv_imported" -type = "script" -command = "check-db-query.sh {} {} '3' 'SELECT COUNT(*) FROM target_patents;'" diff --git a/agents/skill-bench/cases/targeting/functional-with-spec.toml b/agents/skill-bench/cases/targeting/functional-with-spec.toml deleted file mode 100644 index 60ffa9b..0000000 --- a/agents/skill-bench/cases/targeting/functional-with-spec.toml +++ /dev/null @@ -1,86 +0,0 @@ -# Test Case: Targeting Functional (with existing specification) - -name = "functional-with-spec" -description = "Verify targeting process with existing specification" -timeout = 600 # seconds - -test_prompt = """ -I have placed an invention specification in `specification.md`. Please create a target population and run the patent search for a 2025 product release. - -Before asking me any questions, please use the question-responder skill to check if the required information is already available. Then proceed with the targeting process. -""" - -[answers] -"modifying keywords" = "Looks good, proceed to search." -"synonyms" = "Looks good, proceed to search." -"adjust query" = "Looks good, proceed to search." -"hit counts" = "The count is acceptable, proceed to merge." -"acceptable" = "The count is acceptable, proceed to merge." -"~1000 hits" = "The count is acceptable, proceed to merge." - -[[setup]] -path = "specification.md" -content = """ -# Specification Dummy - -**Product/Technology**: -Solar-powered auto-cleaning cat litter box with IoT notifications. - -**Background**: -Current cat litter boxes require manual scooping and frequent bag changes, which leads to odor and hygiene issues. - -**Key Technical Features**: - -1. A solar panel integrated into the top hood that charges an internal battery. -2. A rotating internal drum that separates solid waste into a sealed compartment. -3. An IoT module (Wi-Fi) that sends push notifications to a smartphone when the waste compartment is full. - -**Target Release Date**: 2025-12-31 - -**Priority Date Cutoff**: 2015-01-01 - -**Competitors**: - -- Litter-Robot (AutoPets, LLC) -- CatGenie (PetNovations, Ltd.) -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh targeting" - -[[checks]] -name = "targeting_invoked" -type = "script" -command = "check-skill-invoked.sh targeting" - -[[checks]] -name = "concept_interview_not_invoked" -type = "script" -command = "check-skill-not-invoked.sh concept-interviewing" - -[[checks]] -name = "targeting_template_read" -type = "script" -command = "check-tool-use.sh Read file_path \"targeting-template.md\"" - -[[checks]] -name = "keywords_template_read" -type = "script" -command = "check-tool-use.sh Read file_path \"keywords-template.md\"" - -[[checks]] -name = "targeting_md_created" -type = "script" -command = "check-workspace-file.sh targeting.md" - -[[checks]] -name = "keywords_md_created" -type = "script" -command = "check-workspace-file.sh keywords.md" - -[[checks]] -name = "google_patent_search_invoked" -type = "script" -command = "check-skill-invoked.sh google-patent-cli:patent-search" diff --git a/agents/skill-bench/cases/targeting/triggering.toml b/agents/skill-bench/cases/targeting/triggering.toml deleted file mode 100644 index 57cf46d..0000000 --- a/agents/skill-bench/cases/targeting/triggering.toml +++ /dev/null @@ -1,19 +0,0 @@ -# Test Case: Targeting - Triggering - -name = "triggering" -description = "Verify targeting skill is triggered when asked to execute targeting" -timeout = 60 - -test_prompt = """ -I have a product concept. Now I need to create a target population for patent searching. -""" - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh targeting" - -[[checks]] -name = "targeting_invoked" -type = "script" -command = "check-skill-invoked.sh targeting" diff --git a/agents/skill-bench/harness-plugin/.claude-plugin/plugin.json b/agents/skill-bench/harness-plugin/.claude-plugin/plugin.json deleted file mode 100644 index 97b0c7b..0000000 --- a/agents/skill-bench/harness-plugin/.claude-plugin/plugin.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "name": "skill-bench-harness", - "version": "0.1.0", - "description": "Test harness utilities for skill-bench testing framework", - "author": { - "name": "sonesuke" - }, - "mcpServers": {} -} diff --git a/agents/skill-bench/harness-plugin/cases/basic.toml b/agents/skill-bench/harness-plugin/cases/basic.toml deleted file mode 100644 index 571c480..0000000 --- a/agents/skill-bench/harness-plugin/cases/basic.toml +++ /dev/null @@ -1,37 +0,0 @@ -# Test Case: Question Responder - Basic Functionality - -name = "basic" -description = "Verify question-responder finds answers from test case" -timeout = 60 - -test_prompt = """ -I need to ask: What are the product concept, target country, release date, and competitors for the patent search? Please use the question-responder skill to find the answer. -""" - -[answers] -"product concept" = "Voice recognition system for smart home devices with real-time transcription and noise-resistant recognition" -"target country" = "US" -"country" = "US" -"release date" = "2025-06-01" -"date" = "2025-06-01" -"competitors" = ["Google", "Amazon"] - -[[checks]] -name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh skill-bench-harness:question-responder" - -[[checks]] -name = "question_responder_invoked" -type = "script" -command = "check-skill-invoked.sh question-responder" - -[[checks]] -name = "answer_contains_us" -type = "script" -command = "check-output-contains.sh US" - -[[checks]] -name = "answer_contains_google" -type = "script" -command = "check-output-contains.sh Google" diff --git a/agents/skill-bench/harness-plugin/skills/question-responder/SKILL.md b/agents/skill-bench/harness-plugin/skills/question-responder/SKILL.md deleted file mode 100644 index 13ee4e0..0000000 --- a/agents/skill-bench/harness-plugin/skills/question-responder/SKILL.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -name: question-responder -description: "Find answers to common questions from project context before asking the user. Always use this skill when you need information like competitors, target country, or release date." -context: fork -metadata: - author: sonesuke - version: 0.1.0 ---- - -# Question Responder - -## Purpose - -Before asking the user questions, check if answers can be found in project context. - -## When to Use - -Use this skill whenever you need to ask the user a question. First check if the answer exists in: - -- Project documentation -- Test data files -- Configuration files - -## Process - -1. **Get Test Case Path**: Read the `SKILL_BENCH_TEST_CASE` environment variable to get the current test case file path - -2. **Read Test Case**: Use the Read tool to read the test case TOML file from the path - -3. **Find Answers**: Look for `[answers]` section in the test case file - - This section contains keyword-answer pairs - -4. **Match Question**: Check if the input question contains keywords from the `[answers]` keys - - For example, if the question is "Who are the competitors?", look for keys like "competitors" - - Use substring or keyword matching - -5. **Return Result**: - - **If answer found**: Return the answer value from the matched key - - **If not found**: Return "ANSWER_NOT_FOUND" to indicate user input is needed - -## Implementation Notes - -- The test case path is provided via `SKILL_BENCH_TEST_CASE` environment variable -- Access environment variables using the Bash tool: `echo $SKILL_BENCH_TEST_CASE` -- Match questions using flexible keyword matching (e.g., "competitors" matches "Who are the competitors?") - -## Test Case Format - -Answers are embedded directly in the test case TOML file using the `[answers]` section: - -```toml -# Test Case: Concept Interview - Uses Question Responder -name = "uses-question-responder" -description = "Verify concept-interview uses question-responder when information is missing" -timeout = 180 - -test_prompt = """ -I want to start a patent search for a new voice recognition system... -""" - -[answers] -"competitors" = ["Google", "Amazon"] -"target country" = "US" -"release date" = "2025-06-01" -"country" = "US" -"date" = "2025-06-01" -``` - -The `[answers]` section contains keyword-value pairs that match common questions. - -## Context Isolation - -This skill uses `context: fork` to run in an isolated sub-agent context. This ensures: - -- The main AI agent doesn't see the answer files -- Test integrity is maintained -- Answers are only revealed when explicitly requested - -## Usage Example - -```yaml -# Instead of: -AskUserQuestion: - questions: - - question: "What is the target country?" - -# Use: -Skill: - skill: question-responder - args: "What is the target country for patent search?" -``` diff --git a/agents/skill-bench/harness-plugin/skills/question-responder/assets/.gitkeep b/agents/skill-bench/harness-plugin/skills/question-responder/assets/.gitkeep deleted file mode 100644 index 557f538..0000000 --- a/agents/skill-bench/harness-plugin/skills/question-responder/assets/.gitkeep +++ /dev/null @@ -1 +0,0 @@ -# Assets directory for question-responder skill diff --git a/agents/skill-bench/runner.sh b/agents/skill-bench/runner.sh deleted file mode 100755 index ad874d5..0000000 --- a/agents/skill-bench/runner.sh +++ /dev/null @@ -1,220 +0,0 @@ -#!/bin/bash -# agents/skill-bench/runner.sh -# Skill test runner for patent-kit. -# All execution happens inside the container. -# -# Usage: ./runner.sh [pattern] -# pattern: Glob pattern to match test files (default: "cases/*/*.toml") - -set -o pipefail - -# Determine workspace root -WORKSPACE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -# Determine skill-bench root -SKILL_BENCH_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Determine tools directory -TOOLS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/tools" -# Resolve pattern relative to skill-bench root -PATTERN="${1:-cases/*/*.toml}" -# Convert to absolute path -if [[ "$PATTERN" != /* ]]; then - TARGET_PATTERN="$SKILL_BENCH_ROOT/$PATTERN" -else - TARGET_PATTERN="$PATTERN" -fi - -echo "==================================================" -echo "[SkillBench] Starting Skill Test Runner" -echo "[SkillBench] Workspace: $WORKSPACE_ROOT" -echo "[SkillBench] Pattern: $TARGET_PATTERN" -echo "==================================================" - -TOTAL_CASES=0 -TOTAL_PASS=0 -TOTAL_FAIL=0 - -# --- Collect test files matching pattern --- -TEST_FILES=() -TEST_SKILLS=() -TEST_NAMES=() - -for TEST_FILE in $TARGET_PATTERN; do - [ -f "$TEST_FILE" ] || continue - - TEST_FILE_REL="${TEST_FILE#$WORKSPACE_ROOT/}" - SKILL_NAME=$(basename "$(dirname "$TEST_FILE_REL")") - TEST_NAME=$(basename "$TEST_FILE" .toml) - - TEST_FILES+=("$TEST_FILE") - TEST_SKILLS+=("$SKILL_NAME") - TEST_NAMES+=("$TEST_NAME") -done - -if [ ${#TEST_FILES[@]} -eq 0 ]; then - echo "[SkillBench] No test files found matching pattern: $TARGET_PATTERN" - exit 1 -fi - -# --- Process each test file --- -for IDX in "${!TEST_FILES[@]}"; do - TEST_FILE="${TEST_FILES[$IDX]}" - SKILL_NAME="${TEST_SKILLS[$IDX]}" - TEST_NAME="${TEST_NAMES[$IDX]}" - TEST_CASE_NAME="${SKILL_NAME}/${TEST_NAME}" - TOTAL_CASES=$((TOTAL_CASES + 1)) - - # Read test configuration - TEST_PROMPT=$(yq eval '.test_prompt' "$TEST_FILE") - TEST_TIMEOUT=$(yq eval '.timeout // 300' "$TEST_FILE") - - echo "" - echo "──────────────────────────────────────────────────" - echo "[SkillBench] Test Case: $TEST_CASE_NAME" - echo "──────────────────────────────────────────────────" - - # --- Phase 1: Setup and Execute trial --- - # Create skill-specific log directory - LOG_DIR="$SKILL_BENCH_ROOT/logs/${SKILL_NAME}" - mkdir -p "$LOG_DIR" - - TIMESTAMP=$(date +%Y%m%d_%H%M%S) - LOG_FILE="$LOG_DIR/${TIMESTAMP}_${TEST_NAME}.log" - WORK_DIR="/tmp/skill-bench-${TIMESTAMP}_${SKILL_NAME}-${TEST_NAME}" - - # Setup workspace - echo "[SkillBench] 📦 Setting up workspace: $WORK_DIR" - rm -rf "${WORK_DIR}" - mkdir -p "${WORK_DIR}" - - # Copy plugin directory as claude-plugin (required for skill testing) - # patent-kit uses 'plugin/' while google-patent-cli uses 'claude-plugin/' - cp -r "$WORKSPACE_ROOT/plugin" "$WORK_DIR/claude-plugin" 2>/dev/null || true - - # Read setup files from test.toml [[setup]] array - NUM_SETUP=$(yq eval '.setup | length // 0' "$TEST_FILE") - if [ "$NUM_SETUP" -gt 0 ]; then - for SETUP_IDX in $(seq 0 $((NUM_SETUP - 1))); do - SETUP_TYPE=$(yq eval ".setup[$SETUP_IDX].type // \"\"" "$TEST_FILE") - SETUP_NAME=$(yq eval ".setup[$SETUP_IDX].name // \"\"" "$TEST_FILE") - - if [ "$SETUP_TYPE" = "script" ]; then - # Execute script setup - SETUP_COMMAND=$(yq eval ".setup[$SETUP_IDX].command" "$TEST_FILE") - if [ -n "$SETUP_NAME" ]; then - echo "[SkillBench] → Setup: $SETUP_NAME" - else - echo "[SkillBench] → Setup: $SETUP_COMMAND" - fi - - # Execute in WORK_DIR with tools in PATH - (cd "$WORK_DIR" && PATH="$TOOLS_DIR:$PATH" bash -c "$SETUP_COMMAND") - else - # File content setup (default behavior) - SETUP_PATH=$(yq eval ".setup[$SETUP_IDX].path" "$TEST_FILE") - if [ -z "$SETUP_PATH" ]; then - echo "[SkillBench] ⚠️ Skipping setup with no path (index $SETUP_IDX)" - continue - fi - - SETUP_DIR=$(dirname "$WORK_DIR/$SETUP_PATH") - mkdir -p "$SETUP_DIR" - yq eval ".setup[$SETUP_IDX].content" "$TEST_FILE" > "$WORK_DIR/${SETUP_PATH}" - - if [ -n "$SETUP_NAME" ]; then - echo "[SkillBench] → Setup: $SETUP_NAME ($SETUP_PATH)" - fi - fi - done - fi - - # Execute trial - echo "[SkillBench] Running trial → $LOG_FILE" - START_TIME=$(date +%s) - - # Unset CLAUDECODE to avoid nested session error - (cd "$WORK_DIR" && unset CLAUDECODE && SKILL_BENCH_TEST_CASE="$TEST_FILE" claude -p \ - --dangerously-skip-permissions \ - --verbose \ - --output-format stream-json \ - --plugin-dir ./claude-plugin \ - --plugin-dir "$WORKSPACE_ROOT/agents/skill-bench/harness-plugin" \ - -- "$TEST_PROMPT" < /dev/null | jq -c '(. + {timestamp: now})') > "$LOG_FILE" 2>&1 - - EXIT_CODE=$? - END_TIME=$(date +%s) - DURATION=$(( END_TIME - START_TIME )) - - if [ $EXIT_CODE -eq 0 ]; then - echo "[SkillBench] ✅ Trial finished (took ${DURATION}s)" - else - echo "[SkillBench] ⚠️ Trial exited with code $EXIT_CODE (took ${DURATION}s)" - fi - - # --- Phase 2: Evaluate trial --- - echo "[SkillBench] Running evaluation..." - - CASE_PASS=true - - # Run checks from test.toml - NUM_CHECKS=$(yq eval '.checks | length' "$TEST_FILE") - for CHECK_IDX in $(seq 0 $((NUM_CHECKS - 1))); do - CHECK_NAME=$(yq eval ".checks[$CHECK_IDX].name" "$TEST_FILE") - CHECK_CMD=$(yq eval ".checks[$CHECK_IDX].command" "$TEST_FILE") - - # Parse check command into script and args - # Use eval to properly handle quoted arguments - CHECK_SCRIPT=$(echo "$CHECK_CMD" | awk '{print $1}') - CHECK_ARGS=$(echo "$CHECK_CMD" | cut -d' ' -f2-) - - if [ -n "$CHECK_ARGS" ]; then - # Command has arguments: script.sh arg1 arg2... - # Use eval to properly expand quoted arguments - # Replace {} placeholders with LOG_FILE and WORK_DIR (if present) - if echo "$CHECK_ARGS" | grep -q '{}'; then - CHECK_ARGS=$(echo "$CHECK_ARGS" | sed 's|{}|"'$LOG_FILE'"|1' | sed 's|{}|"'$WORK_DIR'"|1') - # {} placeholders were replaced, don't pass LOG_FILE and WORK_DIR again - if eval "\"$TOOLS_DIR/$CHECK_SCRIPT\" $CHECK_ARGS" >/dev/null 2>&1; then - echo "[SkillBench] ✅ $CHECK_NAME" - else - echo "[SkillBench] ❌ $CHECK_NAME" - CASE_PASS=false - fi - else - # No {} placeholders, pass LOG_FILE and WORK_DIR as first two args - if eval "\"$TOOLS_DIR/$CHECK_SCRIPT\" \"$LOG_FILE\" \"$WORK_DIR\" $CHECK_ARGS" >/dev/null 2>&1; then - echo "[SkillBench] ✅ $CHECK_NAME" - else - echo "[SkillBench] ❌ $CHECK_NAME" - CASE_PASS=false - fi - fi - else - # Command has no arguments: script.sh - # Still pass LOG_FILE and WORK_DIR - if $TOOLS_DIR/$CHECK_SCRIPT "$LOG_FILE" "$WORK_DIR" >/dev/null 2>&1; then - echo "[SkillBench] ✅ $CHECK_NAME" - else - echo "[SkillBench] ❌ $CHECK_NAME" - CASE_PASS=false - fi - fi - done - - # Display case result - if [ "$CASE_PASS" = true ]; then - echo "[SkillBench] ✅ $TEST_CASE_NAME: PASS" - TOTAL_PASS=$((TOTAL_PASS + 1)) - else - echo "[SkillBench] ❌ $TEST_CASE_NAME: FAIL" - TOTAL_FAIL=$((TOTAL_FAIL + 1)) - fi -done - -# --- Summary --- -echo "" -echo "==================================================" -echo "[SkillBench] Test Summary" -echo "[SkillBench] Total: $TOTAL_CASES | Pass: $TOTAL_PASS | Fail: $TOTAL_FAIL" -echo "==================================================" - -exit "$TOTAL_FAIL" diff --git a/agents/skill-bench/tools/check-db-query.sh b/agents/skill-bench/tools/check-db-query.sh deleted file mode 100755 index dde6114..0000000 --- a/agents/skill-bench/tools/check-db-query.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash -# Check database query result -# Usage: check-db-query.sh -# Note: query comes last to avoid issues with special characters - -LOG_FILE="$1" -WORK_DIR="$2" -EXPECTED="$3" -QUERY="$4" - -cd "$WORK_DIR" || exit 1 - -if [ -f "patents.db" ]; then - RESULT=$(sqlite3 patents.db "$QUERY" 2>/dev/null | tr -d '\n') - # Handle numeric comparisons like '>0', '<5', '=10' - if [[ "$EXPECTED" =~ ^([<>]=?|=)([0-9]+)$ ]]; then - OP="${BASH_REMATCH[1]}" - NUM="${BASH_REMATCH[2]}" - if [ "$OP" = ">" ] && [ "$RESULT" -gt "$NUM" ]; then - exit 0 - elif [ "$OP" = ">=" ] && [ "$RESULT" -ge "$NUM" ]; then - exit 0 - elif [ "$OP" = "<" ] && [ "$RESULT" -lt "$NUM" ]; then - exit 0 - elif [ "$OP" = "<=" ] && [ "$RESULT" -le "$NUM" ]; then - exit 0 - elif [ "$OP" = "=" ] && [ "$RESULT" -eq "$NUM" ]; then - exit 0 - fi - elif [ "$RESULT" = "$EXPECTED" ]; then - exit 0 - fi -fi -exit 1 diff --git a/agents/skill-bench/tools/check-file-content.sh b/agents/skill-bench/tools/check-file-content.sh deleted file mode 100755 index cf862d0..0000000 --- a/agents/skill-bench/tools/check-file-content.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# check-file-content.sh - Check if a workspace file contains specific content -# Usage: check-file-content.sh - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -FILENAME="${3:-}" -SEARCH_STRING="${4:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$WORK_DIR" ] || [ -z "$FILENAME" ] || [ -z "$SEARCH_STRING" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the file exists in the workspace -FILE_PATH="$WORK_DIR/$FILENAME" - -if [ ! -f "$FILE_PATH" ]; then - # File doesn't exist - failure - exit 1 -fi - -# Check if the file contains the search string -if grep -q "$SEARCH_STRING" "$FILE_PATH"; then - # File contains the string - success - exit 0 -else - # File doesn't contain the string - failure - exit 1 -fi diff --git a/agents/skill-bench/tools/check-file-not-contains.sh b/agents/skill-bench/tools/check-file-not-contains.sh deleted file mode 100755 index 67ac7a3..0000000 --- a/agents/skill-bench/tools/check-file-not-contains.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# check-file-not-contains.sh - Check if a workspace file does NOT contain specific content -# Usage: check-file-not-contains.sh - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -FILENAME="${3:-}" -SEARCH_STRING="${4:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$WORK_DIR" ] || [ -z "$FILENAME" ] || [ -z "$SEARCH_STRING" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the file exists in the workspace -FILE_PATH="$WORK_DIR/$FILENAME" - -if [ ! -f "$FILE_PATH" ]; then - # File doesn't exist - treat as not containing (success for this check) - exit 0 -fi - -# Check if the file does NOT contain the search string -if grep -q "$SEARCH_STRING" "$FILE_PATH"; then - # File contains the string - failure (we expect it NOT to) - exit 1 -else - # File doesn't contain the string - success - exit 0 -fi diff --git a/agents/skill-bench/tools/check-log-contains.sh b/agents/skill-bench/tools/check-log-contains.sh deleted file mode 100755 index bbc58a3..0000000 --- a/agents/skill-bench/tools/check-log-contains.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Check if log file contains a specific pattern -# Usage: check-log-contains.sh - -LOG_FILE="$1" -PATTERN="$2" - -if [ ! -f "$LOG_FILE" ]; then - echo "Error: Log file not found: $LOG_FILE" - exit 1 -fi - -if grep -q "$PATTERN" "$LOG_FILE"; then - exit 0 -else - exit 1 -fi diff --git a/agents/skill-bench/tools/check-mcp-loaded.sh b/agents/skill-bench/tools/check-mcp-loaded.sh deleted file mode 100755 index 31f6257..0000000 --- a/agents/skill-bench/tools/check-mcp-loaded.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash -# Check if MCP server loaded successfully in a log file -# Usage: check-mcp-loaded.sh -# Returns: 0 if MCP server loaded successfully, 1 if failed or not found - -LOG_FILE="$1" -WORK_DIR="$2" -MCP_SERVER_NAME="$3" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_SERVER_NAME" ]]; then - echo "Usage: $0 " >&2 - exit 2 -fi - -if [[ ! -f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Check MCP server status in init message (first line is init message) -# The mcp_servers array contains objects with name and status fields -STATUS=$(head -1 "$LOG_FILE" | jq -r ' - .mcp_servers? // [] - | .[] | select(.name? | test("'"$MCP_SERVER_NAME"'")) - | .status // "not_found" -') - -if [[ "$STATUS" == "not_found" ]]; then - echo "MCP server $MCP_SERVER_NAME not found in log" >&2 - exit 1 -fi - -if [[ "$STATUS" == "failed" ]]; then - echo "MCP server $MCP_SERVER_NAME failed to load (status: failed)" >&2 - exit 1 -fi - -exit 0 diff --git a/agents/skill-bench/tools/check-mcp-success.sh b/agents/skill-bench/tools/check-mcp-success.sh deleted file mode 100755 index f6b9db4..0000000 --- a/agents/skill-bench/tools/check-mcp-success.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Check if MCP tool calls succeeded in a log file -# Usage: check-mcp-success.sh [--optional] -# --optional: If no MCP calls are made, return success (default: fail) -# Returns: 0 if all MCP calls succeeded (or none made with --optional), 1 if any failed - -LOG_FILE="$1" -WORK_DIR="$2" -MCP_TOOL_NAME="$3" -OPTIONAL_FLAG="${4:-}" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_TOOL_NAME" ]]; then - echo "Usage: $0 [--optional]" >&2 - exit 2 -fi - -if [[ ! -f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Extract tool_use IDs for the specified MCP tool from assistant messages -TOOL_USE_IDS=$(jq -r ' - .[] - | select(.type? == "assistant") - | (.message.content? // []) - | select(type == "array") - | .[] - | select(type == "object" and .type? == "tool_use" and (.name? // "") | test("'"$MCP_TOOL_NAME"'")) - | .id -' "$LOG_FILE") - -# Count how many tool_use IDs we found -ID_COUNT=$(echo "$TOOL_USE_IDS" | grep -c '^\w*$' || true) - -# If no MCP calls were made -if [[ $ID_COUNT -eq 0 ]]; then - if [[ "$OPTIONAL_FLAG" == "--optional" ]]; then - # Optional check: return success if no calls were made - exit 0 - else - # Required check: return failure if no calls were made - echo "No $MCP_TOOL_NAME tool calls found in log" >&2 - exit 1 - fi -fi - -# Check if any of the corresponding tool_results have is_error: true -while IFS= read -r tool_id; do - if [[ -n "$tool_id" ]]; then - ERROR_CHECK=$(jq -r " - .[] - | select(.type? == \"user\") - | (.message.content? // []) - | select(type == \"array\") - | .[] - | select(type == \"object\" and .type? == \"tool_result\" and .tool_use_id? == \"$tool_id\") - | .is_error // false - " "$LOG_FILE") - - if [[ "$ERROR_CHECK" == "true" ]]; then - echo "MCP tool $MCP_TOOL_NAME (tool_use_id: $tool_id) returned an error" >&2 - exit 1 - fi - fi -done <<< "$TOOL_USE_IDS" - -# All MCP calls succeeded -exit 0 diff --git a/agents/skill-bench/tools/check-mcp-tool-invoked.sh b/agents/skill-bench/tools/check-mcp-tool-invoked.sh deleted file mode 100755 index 700b37b..0000000 --- a/agents/skill-bench/tools/check-mcp-tool-invoked.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# check-mcp-tool-invoked.sh - Check if a specific MCP tool was invoked -# Usage: check-mcp-tool-invoked.sh -# mcp_tool_name: e.g., "search_patents", "fetch_patent", etc. - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -TOOL_NAME="${3:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$TOOL_NAME" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the MCP tool was invoked in the log -# MCP tools appear as: "name":"mcp__plugin_xxx__tool_name" -if grep -q '"name":"mcp__'" "$LOG_FILE" && grep -q '"name":"mcp__.*__'"$TOOL_NAME"'"' "$LOG_FILE"; then - # Tool was invoked - success - exit 0 -else - # Tool was not invoked - failure - exit 1 -fi diff --git a/agents/skill-bench/tools/check-output-contains.sh b/agents/skill-bench/tools/check-output-contains.sh deleted file mode 100755 index 5b26e3d..0000000 --- a/agents/skill-bench/tools/check-output-contains.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# check-output-contains.sh - Check if the agent output contains a specific string -# Usage: check-output-contains.sh - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -SEARCH_STRING="${3:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$SEARCH_STRING" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the output contains the search string -# We look for assistant messages in the log -if grep -q '"type":"assistant"' "$LOG_FILE" && grep -q "content" "$LOG_FILE" && grep -qi "$SEARCH_STRING" "$LOG_FILE"; then - exit 0 -else - exit 1 -fi diff --git a/agents/skill-bench/tools/check-output-file.sh b/agents/skill-bench/tools/check-output-file.sh deleted file mode 100755 index d0a9ebd..0000000 --- a/agents/skill-bench/tools/check-output-file.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -# Check if output_file was created in log -# Usage: check-output-file.sh - -LOG_FILE="$1" -WORK_DIR="$2" - -if [ -z "$LOG_FILE" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if output_file exists in tool_result content -# Use try/catch to handle invalid JSON in content field -jq -s '[.[] | select(.type == "user") | .message.content[]? | select(type == "object" and .type == "tool_result" and .tool_use_id? and .content? != null and (.content | type) == "string") | .content | try fromjson catch null | select(. != null) | .output_file] | length > 0' "$LOG_FILE" diff --git a/agents/skill-bench/tools/check-param.sh b/agents/skill-bench/tools/check-param.sh deleted file mode 100755 index 27ff5b1..0000000 --- a/agents/skill-bench/tools/check-param.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Check if parameter was used in tool call -# Usage: check-param.sh - -LOG_FILE="$1" -WORK_DIR="$2" -TOOL_NAME="$3" -PARAM_NAME="$4" -EXPECTED_VALUE="$5" - -if [ -z "$LOG_FILE" ] || [ -z "$TOOL_NAME" ] || [ -z "$PARAM_NAME" ]; then - echo "[Error] Usage: $0 [expected_value]" >&2 - exit 1 -fi - -if [ -n "$EXPECTED_VALUE" ]; then - # Check if parameter equals expected value (handle both string and array) - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME | (if type == \"array\" then .[] == \"$EXPECTED_VALUE\" else . == \"$EXPECTED_VALUE\" end)] | any" "$LOG_FILE" -else - # Check if parameter exists - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME] | length > 0" "$LOG_FILE" -fi diff --git a/agents/skill-bench/tools/check-skill-invoked.sh b/agents/skill-bench/tools/check-skill-invoked.sh deleted file mode 100755 index 304ad3a..0000000 --- a/agents/skill-bench/tools/check-skill-invoked.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# check-skill-invoked.sh - Check if a specific skill was invoked -# Usage: check-skill-invoked.sh [--not] -# --not: Invert the check (verify skill was NOT invoked) - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -SKILL_NAME="${3:-}" -INVERT="${4:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then - echo "[Error] Usage: $0 [--not]" >&2 - exit 1 -fi - -# Check if the skill was invoked in the log -# Note: Log is JSONL format with "name":"Skill" and "skill":"patent-kit:" -if grep -q '"Skill"' "$LOG_FILE" && grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE"; then - # Skill was invoked - if [ "$INVERT" = "--not" ]; then - # We expected it NOT to be invoked, but it was - failure - exit 1 - else - # We expected it to be invoked, and it was - success - exit 0 - fi -else - # Skill was not invoked - if [ "$INVERT" = "--not" ]; then - # We expected it NOT to be invoked, and it wasn't - success - exit 0 - else - # We expected it to be invoked, but it wasn't - failure - exit 1 - fi -fi diff --git a/agents/skill-bench/tools/check-skill-loaded.sh b/agents/skill-bench/tools/check-skill-loaded.sh deleted file mode 100755 index 43f37b5..0000000 --- a/agents/skill-bench/tools/check-skill-loaded.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Check if a skill was loaded successfully in a log file -# Usage: check-skill-loaded.sh -# Returns: 0 if skill found in init skills array, 1 if not found - -LOG_FILE="$1" -WORK_DIR="$2" -SKILL_NAME="$3" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$SKILL_NAME" ]]; then - echo "Usage: $0 " >&2 - exit 2 -fi - -if [[ ! -f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Check if skill is in the init skills array (first line is init message) -FOUND=$(head -1 "$LOG_FILE" | jq -c ' - .skills | any(.[]; contains("'$SKILL_NAME'")) -') - -if [[ "$FOUND" != "true" ]]; then - echo "Skill $SKILL_NAME not found in init skills array" >&2 - exit 1 -fi - -exit 0 diff --git a/agents/skill-bench/tools/check-skill-not-invoked.sh b/agents/skill-bench/tools/check-skill-not-invoked.sh deleted file mode 100755 index 597bd38..0000000 --- a/agents/skill-bench/tools/check-skill-not-invoked.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# check-skill-not-invoked.sh - Check if a specific skill was NOT invoked -# Usage: check-skill-not-invoked.sh - -LOG_FILE="${1:-}" -WORK_DIR="${2:-}" -SKILL_NAME="${3:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the skill was NOT invoked in the log -# Note: Log is JSONL format with "name":"Skill" and "skill":"patent-kit:" -if grep -q '"Skill"' "$LOG_FILE" && grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE"; then - # Skill was invoked - we expected it NOT to be - failure - exit 1 -else - # Skill was not invoked - success - exit 0 -fi diff --git a/agents/skill-bench/tools/check-text-contains.sh b/agents/skill-bench/tools/check-text-contains.sh deleted file mode 100755 index f059f92..0000000 --- a/agents/skill-bench/tools/check-text-contains.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Check if text content contains specific patterns -# Usage: check-text-contains.sh [pattern2] ... -# log_file: Path to the log file -# work_dir: Path to the workspace directory -# pattern: Text pattern to search for (can specify multiple) - -LOG_FILE="$1" -WORK_DIR="$2" -shift 2 -PATTERNS=("$@") - -if [ -z "$LOG_FILE" ] || [ ${#PATTERNS[@]} -eq 0 ]; then - echo "[Error] Usage: $0 [pattern2] ..." >&2 - exit 1 -fi - -# Check if any pattern is found in assistant text content -FOUND=false -for PATTERN in "${PATTERNS[@]}"; do - if grep -q "\"text\":\"[^\"]*$PATTERN[^\"]*\"" "$LOG_FILE" 2>/dev/null; then - FOUND=true - break - fi -done - -if [ "$FOUND" = "true" ]; then - exit 0 -else - exit 1 -fi diff --git a/agents/skill-bench/tools/check-tool-use.sh b/agents/skill-bench/tools/check-tool-use.sh deleted file mode 100755 index 1b07873..0000000 --- a/agents/skill-bench/tools/check-tool-use.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash -# Check if a specific tool was called with specific parameters -# Usage: check-tool-use.sh -# log_file: Path to the log file -# work_dir: Path to the workspace directory -# tool_name: Name of the tool to check (e.g., "Read", "Write") -# param_name: Name of the parameter to check (e.g., "file_path") -# param_pattern: Pattern to match in the parameter value (regex) - -LOG_FILE="$1" -WORK_DIR="$2" -TOOL_NAME="$3" -PARAM_NAME="${4:-}" -PARAM_PATTERN="${5:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$TOOL_NAME" ]; then - echo "[Error] Usage: $0 [param_name] [param_pattern]" >&2 - exit 1 -fi - -if [ -n "$PARAM_NAME" ] && [ -n "$PARAM_PATTERN" ]; then - # Check if tool was called with specific parameter matching pattern - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\" and .name == \"$TOOL_NAME\") | select(.input.$PARAM_NAME | test(\"$PARAM_PATTERN\"))] | length > 0" "$LOG_FILE" -elif [ -n "$PARAM_NAME" ]; then - # Check if tool was called with specific parameter (any value) - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\" and .name == \"$TOOL_NAME\") | .input.$PARAM_NAME] | length > 0" "$LOG_FILE" -else - # Check if tool was called (any parameters) - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\" and .name == \"$TOOL_NAME\")] | length > 0" "$LOG_FILE" -fi diff --git a/agents/skill-bench/tools/check-workspace-dir.sh b/agents/skill-bench/tools/check-workspace-dir.sh deleted file mode 100755 index b5ac0d3..0000000 --- a/agents/skill-bench/tools/check-workspace-dir.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -# Check if directories exist in the workspace -# Usage: check-workspace-dir.sh [dir2] ... -# log_file: Path to the log file -# work_dir: Path to the workspace directory -# dir: Directory path to check (can specify multiple, all must exist) - -LOG_FILE="$1" -WORK_DIR="$2" -shift 2 -DIRS=("$@") - -if [ -z "$WORK_DIR" ] || [ ${#DIRS[@]} -eq 0 ]; then - echo "[Error] Usage: $0 [dir2] ..." >&2 - exit 1 -fi - -# Check if all directories exist -for DIR in "${DIRS[@]}"; do - if [ ! -d "$WORK_DIR/$DIR" ]; then - exit 1 - fi -done - -exit 0 diff --git a/agents/skill-bench/tools/check-workspace-file.sh b/agents/skill-bench/tools/check-workspace-file.sh deleted file mode 100755 index 674e745..0000000 --- a/agents/skill-bench/tools/check-workspace-file.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Check if a file exists in the workspace -# Usage: check-workspace-file.sh -# log_file: Path to the log file -# work_dir: Path to the workspace directory -# file_path: Relative path to the file to check - -LOG_FILE="$1" -WORK_DIR="$2" -FILE_PATH="$3" - -if [ -z "$WORK_DIR" ] || [ -z "$FILE_PATH" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -[ -f "$WORK_DIR/$FILE_PATH" ] diff --git a/agents/skill-bench/tools/setup-db.sh b/agents/skill-bench/tools/setup-db.sh deleted file mode 100755 index 5750d98..0000000 --- a/agents/skill-bench/tools/setup-db.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash -# Generic database setup script -# Usage: -# setup-db.sh init - Initialize database -# setup-db.sh execute - Execute SQL file - -set -e - -COMMAND="${1:-}" -shift || true - -case "$COMMAND" in - init) - # Check if database already exists - if [ -f patents.db ]; then - echo "Database already exists, skipping init" - exit 0 - fi - - # Find initialize-database.sql - # Try multiple possible locations since we might be run from different directories - SQL_FILE="" - - # Check if we're in workspace (has claude-plugin directory) - if [ -d "./claude-plugin" ]; then - SQL_FILE="./claude-plugin/skills/investigation-preparing/references/sql/initialize-database.sql" - fi - - # Fallback to script-relative path - if [ -z "$SQL_FILE" ] || [ ! -f "$SQL_FILE" ]; then - SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" - SQL_FILE="$SCRIPT_DIR/../../plugin/skills/investigation-preparing/references/sql/initialize-database.sql" - fi - - if [ -z "$SQL_FILE" ] || [ ! -f "$SQL_FILE" ]; then - echo "Error: initialize-database.sql not found" - exit 1 - fi - - # Initialize database - sqlite3 patents.db < "$SQL_FILE" - echo "Database initialized" - ;; - - execute) - SQL_FILE="$1" - - if [ -z "$SQL_FILE" ]; then - echo "Error: SQL file not specified" - echo "Usage: setup-db.sh execute " - exit 1 - fi - - if [ ! -f "$SQL_FILE" ]; then - echo "Error: SQL file not found: $SQL_FILE" - exit 1 - fi - - # Execute SQL file - sqlite3 patents.db < "$SQL_FILE" - echo "SQL file executed: $SQL_FILE" - ;; - - *) - echo "Error: Unknown command '$COMMAND'" - echo "Usage:" - echo " setup-db.sh init - Initialize database" - echo " setup-db.sh execute - Execute SQL file" - exit 1 - ;; -esac diff --git a/tests/constitution-reminding/functional.toml b/tests/constitution-reminding/functional.toml deleted file mode 100644 index 050013f..0000000 --- a/tests/constitution-reminding/functional.toml +++ /dev/null @@ -1,29 +0,0 @@ -# Test Case: Constitution Reminding Functional - -name = "functional" -description = "Verify constitution-reminding loads and displays core principles" -timeout = 60 # seconds - -test_prompt = """ -Load the constitution skill to understand the core principles. -""" - -[[checks]] -name = "mcp_server_loaded" -command = { command = "mcp-loaded", server = "google-patent-cli" } - -[[checks]] -name = "skill_loaded" -command = { command = "skill-loaded", skill = "constitution-reminding" } - -[[checks]] -name = "constitution_reminding_invoked" -command = { command = "skill-invoked", skill = "constitution-reminding" } - -[[checks]] -name = "references_instructions_read" -command = { command = "tool-use", tool = "Read", param = "file_path", value = "constitution-reminding.*references/instructions.md" } - -[[checks]] -name = "constitution_text_displayed" -command = { command = "message-contains", text = "I." } diff --git a/tests/constitution-reminding/triggering.toml b/tests/constitution-reminding/triggering.toml deleted file mode 100644 index fd2a607..0000000 --- a/tests/constitution-reminding/triggering.toml +++ /dev/null @@ -1,21 +0,0 @@ -# Test Case: Constitution Reminding - Triggering - -name = "triggering" -description = "Verify constitution-reminding skill is triggered when asked about core principles" -timeout = 60 - -test_prompt = """ -Load the constitution skill to understand the core principles. -""" - -[[checks]] -name = "mcp_server_loaded" -command = { command = "mcp-loaded", server = "google-patent-cli" } - -[[checks]] -name = "skill_loaded" -command = { command = "skill-loaded", skill = "constitution-reminding" } - -[[checks]] -name = "constitution_reminding_invoked" -command = { command = "skill-invoked", skill = "constitution-reminding" } diff --git a/tests/investigating-database/functional-get-patent-id.toml b/tests/investigating-database/functional-get-patent-id.toml deleted file mode 100644 index 32a45c4..0000000 --- a/tests/investigating-database/functional-get-patent-id.toml +++ /dev/null @@ -1,55 +0,0 @@ -# Test Case: Investigating Database - Get Patent ID - -name = "functional-get-patent-id" -description = "Verify investigating-database can retrieve patent ID by row number" -timeout = 120 # seconds - -test_prompt = """ -Get the patent ID at row 2 from the database. -""" - -[[setup]] -path = "patents.db" -content = """ -#!/usr/bin/env bash -# Setup script to create test database -sqlite3 patents.db < Date: Mon, 30 Mar 2026 02:50:36 +0000 Subject: [PATCH 2/2] fix: remove constitution-reminding checks from test cases - Remove constitution_loaded checks from concept-interviewing and targeting tests - Skill no longer exists in current codebase Co-Authored-By: Claude Opus 4.6 --- tests/concept-interviewing/functional-no-spec.toml | 4 ---- tests/targeting/functional-with-spec.toml | 4 ---- 2 files changed, 8 deletions(-) diff --git a/tests/concept-interviewing/functional-no-spec.toml b/tests/concept-interviewing/functional-no-spec.toml index 1ce96ba..3990b7f 100644 --- a/tests/concept-interviewing/functional-no-spec.toml +++ b/tests/concept-interviewing/functional-no-spec.toml @@ -38,10 +38,6 @@ command = { command = "skill-invoked", skill = "concept-interviewing" } name = "patent_assignee_check_invoked" command = { command = "skill-invoked", skill = "patent-assignee-check" } -[[checks]] -name = "constitution_loaded" -command = { command = "skill-invoked", skill = "constitution-reminding" } - [[checks]] name = "references_instructions_read" command = { command = "tool-use", tool = "Read", param = "file_path", value = "concept-interview.*references/instructions.md" } diff --git a/tests/targeting/functional-with-spec.toml b/tests/targeting/functional-with-spec.toml index 0845749..0346e4e 100644 --- a/tests/targeting/functional-with-spec.toml +++ b/tests/targeting/functional-with-spec.toml @@ -53,10 +53,6 @@ command = { command = "skill-loaded", skill = "targeting" } name = "targeting_invoked" command = { command = "skill-invoked", skill = "targeting" } -[[checks]] -name = "constitution_loaded" -command = { command = "skill-invoked", skill = "constitution-reminding" } - [[checks]] name = "specification_read" command = { command = "tool-use", tool = "Read", param = "file_path", value = "0-specifications/specification.md" }