From a7419dbaad220d9ea09736ea7744a56d6857a745 Mon Sep 17 00:00:00 2001 From: sonesuke Date: Mon, 30 Mar 2026 06:00:49 +0000 Subject: [PATCH 1/2] fix: document cypher patterns and forbid direct JSON file access Add explicit cypher query examples and a CRITICAL instruction to use execute_cypher for all result retrieval. Models were bypassing cypher and reading /tmp/patent-search-*.json with python3, causing redundant file reads and wasted time. Co-Authored-By: Claude Opus 4.6 --- .../skills/patent-assignee-check/SKILL.md | 18 +++++++ claude-plugin/skills/patent-search/SKILL.md | 47 +++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/claude-plugin/skills/patent-assignee-check/SKILL.md b/claude-plugin/skills/patent-assignee-check/SKILL.md index cb01f73..b363917 100644 --- a/claude-plugin/skills/patent-assignee-check/SKILL.md +++ b/claude-plugin/skills/patent-assignee-check/SKILL.md @@ -38,6 +38,24 @@ execute_cypher({ }) ``` +**CRITICAL**: After searching, always use `execute_cypher` to retrieve results. +Do NOT read the output JSON file directly. The JSON file is an internal +artifact — all data is available through cypher queries. + +### Result Retrieval Patterns + +**Assignee name variations with counts**: + +```cypher +MATCH (p:Patent) RETURN p.assignee, COUNT(*) AS count ORDER BY count DESC +``` + +**Assignee variations with sample titles**: + +```cypher +MATCH (p:Patent) RETURN p.assignee, p.title, p.snippet LIMIT 20 +``` + ## Parameters - `company_name` (string, required): Company name to check for variations diff --git a/claude-plugin/skills/patent-search/SKILL.md b/claude-plugin/skills/patent-search/SKILL.md index c7de781..06f7942 100644 --- a/claude-plugin/skills/patent-search/SKILL.md +++ b/claude-plugin/skills/patent-search/SKILL.md @@ -38,6 +38,53 @@ execute_cypher({ }) ``` +**CRITICAL**: After searching, always use `execute_cypher` to retrieve results. +Do NOT read the output JSON file directly. 
The JSON file is an internal +artifact — all data is available through cypher queries. + +### Result Retrieval Patterns + +Use these cypher patterns to retrieve search results: + +**Total count**: + +```cypher +MATCH (p:Patent) RETURN COUNT(*) AS count +``` + +**Top 20 snippets for noise analysis**: + +```cypher +MATCH (p:Patent) RETURN p.id, p.title, p.snippet, p.assignee LIMIT 20 +``` + +**Assignee breakdown**: + +```cypher +MATCH (p:Patent) RETURN p.assignee, COUNT(*) AS count ORDER BY count DESC +``` + +**Filing dates with titles (sample of 10)**: + +```cypher +MATCH (p:Patent) RETURN p.filing_date, p.title LIMIT 10 +``` + +### Available Patent Node Fields + +| Field | Description | +| ------------------ | ------------------------------- | +| `id` | Patent ID (e.g., "US9152718B2") | +| `title` | Patent title | +| `snippet` | Search result snippet | +| `abstract_text` | Full abstract | +| `assignee` | Assignee/applicant name | +| `filing_date` | Filing date | +| `publication_date` | Publication date | +| `url` | Google Patents URL | +| `legal_status` | Legal status | +| `family_id` | Patent family ID | + ### Date Filter Examples Search patents filed in 2023: From b188accf3429f5cf8f15a80dd986ce8bd3af1d79 Mon Sep 17 00:00:00 2001 From: sonesuke Date: Mon, 30 Mar 2026 07:33:21 +0000 Subject: [PATCH 2/2] refactor: migrate skill-bench from shell scripts to CLI-based tests Replace runner.sh and tools/*.sh with skill-bench CLI native TOML format. Move test cases from agents/skill-bench/cases/ to tests/.
Co-Authored-By: Claude Opus 4.6 --- AGENTS.md | 13 +- agents/skill-bench/.gitignore | 1 - agents/skill-bench/runner.sh | 177 ------------------ agents/skill-bench/tools/check-mcp-loaded.sh | 37 ---- agents/skill-bench/tools/check-mcp-success.sh | 62 ------ .../tools/check-output-contains.sh | 17 -- agents/skill-bench/tools/check-output-file.sh | 14 -- .../tools/check-output-not-contains.sh | 17 -- agents/skill-bench/tools/check-param.sh | 21 --- .../skill-bench/tools/check-skill-invoked.sh | 14 -- .../skill-bench/tools/check-skill-loaded.sh | 29 --- agents/skill-bench/tools/check-workspace.sh | 13 -- mise.toml | 6 + .../functional-with-country.toml | 15 +- .../patent-assignee-check/functional.toml | 15 +- .../patent-assignee-check/triggering.toml | 3 +- .../patent-fetch/functional-data-return.toml | 18 +- .../patent-fetch/functional.toml | 15 +- .../patent-fetch/triggering.toml | 3 +- .../patent-search/functional-filing.toml | 18 +- .../patent-search/functional-priority.toml | 18 +- .../patent-search/functional-publication.toml | 18 +- .../patent-search/functional.toml | 12 +- .../patent-search/triggering.toml | 9 +- 24 files changed, 59 insertions(+), 506 deletions(-) delete mode 100644 agents/skill-bench/.gitignore delete mode 100755 agents/skill-bench/runner.sh delete mode 100755 agents/skill-bench/tools/check-mcp-loaded.sh delete mode 100755 agents/skill-bench/tools/check-mcp-success.sh delete mode 100755 agents/skill-bench/tools/check-output-contains.sh delete mode 100755 agents/skill-bench/tools/check-output-file.sh delete mode 100755 agents/skill-bench/tools/check-output-not-contains.sh delete mode 100755 agents/skill-bench/tools/check-param.sh delete mode 100755 agents/skill-bench/tools/check-skill-invoked.sh delete mode 100755 agents/skill-bench/tools/check-skill-loaded.sh delete mode 100755 agents/skill-bench/tools/check-workspace.sh rename {agents/skill-bench/cases => tests}/patent-assignee-check/functional-with-country.toml (59%) rename 
{agents/skill-bench/cases => tests}/patent-assignee-check/functional.toml (58%) rename {agents/skill-bench/cases => tests}/patent-assignee-check/triggering.toml (82%) rename {agents/skill-bench/cases => tests}/patent-fetch/functional-data-return.toml (63%) rename {agents/skill-bench/cases => tests}/patent-fetch/functional.toml (53%) rename {agents/skill-bench/cases => tests}/patent-fetch/triggering.toml (81%) rename {agents/skill-bench/cases => tests}/patent-search/functional-filing.toml (53%) rename {agents/skill-bench/cases => tests}/patent-search/functional-priority.toml (53%) rename {agents/skill-bench/cases => tests}/patent-search/functional-publication.toml (54%) rename {agents/skill-bench/cases => tests}/patent-search/functional.toml (61%) rename {agents/skill-bench/cases => tests}/patent-search/triggering.toml (64%) diff --git a/AGENTS.md b/AGENTS.md index 664ac25..5f248db 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -37,11 +37,6 @@ agents/ tools/ # Agent tools load-progress.sh # Read past context (JSONL) record-progress.sh # Write progress logs (JSONL) - skill-bench/ # Claude Code Skill testing framework - runner.sh # Test runner - cases/ # Test case definitions (TOML, test prompts in English) - tools/ # Check scripts for validating test results - logs/ # Test execution logs claude-plugin/ # Claude Code Plugin structure skills/ # Individual skill definitions mise.toml # Task definitions (fmt, clippy, test, pre-commit) @@ -50,11 +45,13 @@ mise.toml # Task definitions (fmt, clippy, test, pre-commit) ## Skill-Bench Testing -`agents/skill-bench/` contains the testing framework for Claude Code skills: +`tests/` contains skill test cases using the `skill-bench` CLI: - **Test cases are in English** - All `test_prompt` values in TOML files must be English -- **Run tests**: `./agents/skill-bench/runner.sh` (executes inside container) -- **Test pattern**: `./agents/skill-bench/runner.sh "cases/*/*.toml"` +- **List tests**: `skill-bench list tests` +- **Run tests**: 
`skill-bench run tests` +- **Filter by skill**: `skill-bench run tests --skill patent-search` +- **Log directory**: `skill-bench run tests --log logs/` ## Tools diff --git a/agents/skill-bench/.gitignore b/agents/skill-bench/.gitignore deleted file mode 100644 index 333c1e9..0000000 --- a/agents/skill-bench/.gitignore +++ /dev/null @@ -1 +0,0 @@ -logs/ diff --git a/agents/skill-bench/runner.sh b/agents/skill-bench/runner.sh deleted file mode 100755 index c49c236..0000000 --- a/agents/skill-bench/runner.sh +++ /dev/null @@ -1,177 +0,0 @@ -#!/bin/bash -# agents/skill-bench/runner.sh -# Skill test runner for google-patent-cli. -# All execution happens inside the container. -# -# Usage: ./runner.sh [pattern] -# pattern: Glob pattern to match test files (default: "cases/*/*.toml") - -set -o pipefail - -# Determine workspace root -WORKSPACE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" -# Determine skill-bench root -SKILL_BENCH_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# Resolve pattern relative to skill-bench root -PATTERN="${1:-cases/*/*.toml}" -# Convert to absolute path -if [[ "$PATTERN" != /* ]]; then - TARGET_PATTERN="$SKILL_BENCH_ROOT/$PATTERN" -else - TARGET_PATTERN="$PATTERN" -fi - -echo "==================================================" -echo "[SkillBench] Starting Skill Test Runner" -echo "[SkillBench] Workspace: $WORKSPACE_ROOT" -echo "[SkillBench] Pattern: $TARGET_PATTERN" -echo "==================================================" - -TOTAL_CASES=0 -TOTAL_PASS=0 -TOTAL_FAIL=0 - -# --- Collect test files matching pattern --- -TEST_FILES=() -TEST_SKILLS=() -TEST_NAMES=() - -for TEST_FILE in $TARGET_PATTERN; do - [ -f "$TEST_FILE" ] || continue - - TEST_FILE_REL="${TEST_FILE#$WORKSPACE_ROOT/}" - SKILL_NAME=$(basename "$(dirname "$TEST_FILE_REL")") - TEST_NAME=$(basename "$TEST_FILE" .toml) - - TEST_FILES+=("$TEST_FILE") - TEST_SKILLS+=("$SKILL_NAME") - TEST_NAMES+=("$TEST_NAME") -done - -if [ ${#TEST_FILES[@]} -eq 0 ]; then - echo 
"[SkillBench] No test files found matching pattern: $TARGET_PATTERN" - exit 1 -fi - -# --- Process each test file --- -for IDX in "${!TEST_FILES[@]}"; do - TEST_FILE="${TEST_FILES[$IDX]}" - SKILL_NAME="${TEST_SKILLS[$IDX]}" - TEST_NAME="${TEST_NAMES[$IDX]}" - TEST_CASE_NAME="${SKILL_NAME}/${TEST_NAME}" - TOTAL_CASES=$((TOTAL_CASES + 1)) - - # Read test configuration - TEST_PROMPT=$(yq eval '.test_prompt' "$TEST_FILE") - TEST_TIMEOUT=$(yq eval '.timeout // 300' "$TEST_FILE") - - echo "" - echo "──────────────────────────────────────────────────" - echo "[SkillBench] Test Case: $TEST_CASE_NAME" - echo "──────────────────────────────────────────────────" - - # --- Phase 1: Setup and Execute trial --- - # Create skill-specific log directory - LOG_DIR="$SKILL_BENCH_ROOT/logs/${SKILL_NAME}" - mkdir -p "$LOG_DIR" - - TIMESTAMP=$(date +%Y%m%d_%H%M%S) - LOG_FILE="$LOG_DIR/${TIMESTAMP}_${TEST_NAME}.log" - WORK_DIR="/tmp/skill-bench-${TIMESTAMP}_${SKILL_NAME}-${TEST_NAME}" - - # Setup workspace - echo "[SkillBench] 📦 Setting up workspace: $WORK_DIR" - rm -rf "${WORK_DIR}" - mkdir -p "${WORK_DIR}" - - # Copy claude-plugin (required for skill testing) - cp -r "$WORKSPACE_ROOT/claude-plugin" "$WORK_DIR/" 2>/dev/null || true - - # Read setup files from test.toml [[setup]] array - NUM_SETUP=$(yq eval '.setup | length // 0' "$TEST_FILE") - if [ "$NUM_SETUP" -gt 0 ]; then - for SETUP_IDX in $(seq 0 $((NUM_SETUP - 1))); do - SETUP_PATH=$(yq eval ".setup[$SETUP_IDX].path" "$TEST_FILE") - SETUP_DIR=$(dirname "$WORK_DIR/$SETUP_PATH") - mkdir -p "$SETUP_DIR" - yq eval ".setup[$SETUP_IDX].content" "$TEST_FILE" > "$WORK_DIR/${SETUP_PATH}" - done - fi - - # Execute trial - echo "[SkillBench] Running trial → $LOG_FILE" - START_TIME=$(date +%s) - - # Unset CLAUDECODE to avoid nested session error - (cd "$WORK_DIR" && unset CLAUDECODE && claude -p \ - --dangerously-skip-permissions \ - --verbose \ - --output-format stream-json \ - --plugin-dir ./claude-plugin \ - -- "$TEST_PROMPT" < /dev/null 
| jq -c '(. + {timestamp: now})') > "$LOG_FILE" 2>&1 - - EXIT_CODE=$? - END_TIME=$(date +%s) - DURATION=$(( END_TIME - START_TIME )) - - if [ $EXIT_CODE -eq 0 ]; then - echo "[SkillBench] ✅ Trial finished (took ${DURATION}s)" - else - echo "[SkillBench] ⚠️ Trial exited with code $EXIT_CODE (took ${DURATION}s)" - fi - - # --- Phase 2: Evaluate trial --- - echo "[SkillBench] Running evaluation..." - - CASE_PASS=true - TOOLS_DIR="$(cd "$(dirname "$0")" && pwd)/tools" - - # Run checks from test.toml - NUM_CHECKS=$(yq eval '.checks | length' "$TEST_FILE") - for CHECK_IDX in $(seq 0 $((NUM_CHECKS - 1))); do - CHECK_NAME=$(yq eval ".checks[$CHECK_IDX].name" "$TEST_FILE") - CHECK_CMD=$(yq eval ".checks[$CHECK_IDX].command" "$TEST_FILE") - - # Parse check command into script and args - # Use eval to properly handle quoted arguments - CHECK_SCRIPT=$(echo "$CHECK_CMD" | awk '{print $1}') - CHECK_ARGS=$(echo "$CHECK_CMD" | cut -d' ' -f2-) - - if [ -n "$CHECK_ARGS" ]; then - # Command has arguments: script.sh arg1 arg2... 
- # Use eval to properly expand quoted arguments - if eval "$TOOLS_DIR/$CHECK_SCRIPT \"\$LOG_FILE\" $CHECK_ARGS" >/dev/null 2>&1; then - echo "[SkillBench] ✅ $CHECK_NAME" - else - echo "[SkillBench] ❌ $CHECK_NAME" - CASE_PASS=false - fi - else - # Command has no arguments: script.sh (takes only LOG_FILE) - if $TOOLS_DIR/$CHECK_SCRIPT "$LOG_FILE" >/dev/null 2>&1; then - echo "[SkillBench] ✅ $CHECK_NAME" - else - echo "[SkillBench] ❌ $CHECK_NAME" - CASE_PASS=false - fi - fi - done - - # Display case result - if [ "$CASE_PASS" = true ]; then - echo "[SkillBench] ✅ $TEST_CASE_NAME: PASS" - TOTAL_PASS=$((TOTAL_PASS + 1)) - else - echo "[SkillBench] ❌ $TEST_CASE_NAME: FAIL" - TOTAL_FAIL=$((TOTAL_FAIL + 1)) - fi -done - -# --- Summary --- -echo "" -echo "==================================================" -echo "[SkillBench] Test Summary" -echo "[SkillBench] Total: $TOTAL_CASES | Pass: $TOTAL_PASS | Fail: $TOTAL_FAIL" -echo "==================================================" - -exit "$TOTAL_FAIL" diff --git a/agents/skill-bench/tools/check-mcp-loaded.sh b/agents/skill-bench/tools/check-mcp-loaded.sh deleted file mode 100755 index d7c0118..0000000 --- a/agents/skill-bench/tools/check-mcp-loaded.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# Check if MCP server loaded successfully in a log file -# Usage: check-mcp-loaded.sh -# Returns: 0 if MCP server loaded successfully, 1 if failed or not found - -LOG_FILE="$1" -MCP_SERVER_NAME="$2" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_SERVER_NAME" ]]; then - echo "Usage: $0 " >&2 - exit 2 -fi - -if [[ ! -f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Check MCP server status in init message (first line is init message) -# The mcp_servers array contains objects with name and status fields -STATUS=$(head -1 "$LOG_FILE" | jq -r ' - .mcp_servers? // [] - | .[] | select(.name? 
| test("'"$MCP_SERVER_NAME"'")) - | .status // "not_found" -') - -if [[ "$STATUS" == "not_found" ]]; then - echo "MCP server $MCP_SERVER_NAME not found in log" >&2 - exit 1 -fi - -if [[ "$STATUS" == "failed" ]]; then - echo "MCP server $MCP_SERVER_NAME failed to load (status: failed)" >&2 - exit 1 -fi - -exit 0 diff --git a/agents/skill-bench/tools/check-mcp-success.sh b/agents/skill-bench/tools/check-mcp-success.sh deleted file mode 100755 index b3ad0ec..0000000 --- a/agents/skill-bench/tools/check-mcp-success.sh +++ /dev/null @@ -1,62 +0,0 @@ -#!/bin/bash -# Check if MCP tool calls succeeded in a log file -# Usage: check-mcp-success.sh [--optional] -# Returns: 0 if all MCP calls succeeded (or none made with --optional), 1 if any failed - -LOG_FILE="$1" -MCP_TOOL_NAME="$2" -OPTIONAL_FLAG="${3:-}" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_TOOL_NAME" ]]; then - echo "Usage: $0 [--optional]" >&2 - exit 2 -fi - -if [[ ! -f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Extract tool_use IDs for the specified MCP tool -TOOL_USE_IDS=$(jq -r ' - .[] - | select(.type? == "assistant") - | (.message.content? // []) - | select(type == "array") - | .[] - | select(type == "object" and .type? == "tool_use" and (.name? // "") | test("'"$MCP_TOOL_NAME"'")) - | .id -' "$LOG_FILE") - -ID_COUNT=$(echo "$TOOL_USE_IDS" | grep -c '^\w*$' || true) - -if [[ $ID_COUNT -eq 0 ]]; then - if [[ "$OPTIONAL_FLAG" == "--optional" ]]; then - exit 0 - else - echo "No $MCP_TOOL_NAME tool calls found in log" >&2 - exit 1 - fi -fi - -# Check if any tool_results have is_error: true -while IFS= read -r tool_id; do - if [[ -n "$tool_id" ]]; then - ERROR_CHECK=$(jq -r " - .[] - | select(.type? == \"user\") - | (.message.content? // []) - | select(type == \"array\") - | .[] - | select(type == \"object\" and .type? == \"tool_result\" and .tool_use_id? 
== \"$tool_id\") - | .is_error // false - " "$LOG_FILE") - - if [[ "$ERROR_CHECK" == "true" ]]; then - echo "MCP tool $MCP_TOOL_NAME (tool_use_id: $tool_id) returned an error" >&2 - exit 1 - fi - fi -done <<< "$TOOL_USE_IDS" - -exit 0 diff --git a/agents/skill-bench/tools/check-output-contains.sh b/agents/skill-bench/tools/check-output-contains.sh deleted file mode 100755 index 47af684..0000000 --- a/agents/skill-bench/tools/check-output-contains.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Check if the final output contains a specific string -# Usage: check-output-contains.sh -# -# This checks the final result text, not intermediate tool calls - -LOG_FILE="${1:-}" -PATTERN="${2:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$PATTERN" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Extract the final result text and check if it contains the pattern -# The result is in a "result" type message with a "result" field -jq -r 'select(.type == "result") | .result' "$LOG_FILE" 2>/dev/null | grep -q "$PATTERN" diff --git a/agents/skill-bench/tools/check-output-file.sh b/agents/skill-bench/tools/check-output-file.sh deleted file mode 100755 index dba0a34..0000000 --- a/agents/skill-bench/tools/check-output-file.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Check if output_file was created in log -# Usage: check-output-file.sh - -LOG_FILE="$1" - -if [ -z "$LOG_FILE" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if output_file exists in tool_result content -# Use try/catch to handle invalid JSON in content field -jq -s '[.[] | select(.type == "user") | .message.content[]? | select(type == "object" and .type == "tool_result" and .tool_use_id? and .content? != null and (.content | type) == "string") | .content | try fromjson catch null | select(. 
!= null) | .output_file] | length > 0' "$LOG_FILE" diff --git a/agents/skill-bench/tools/check-output-not-contains.sh b/agents/skill-bench/tools/check-output-not-contains.sh deleted file mode 100755 index 242a455..0000000 --- a/agents/skill-bench/tools/check-output-not-contains.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/bash -# Check if the final output does NOT contain a specific pattern -# Usage: check-output-not-contains.sh -# -# This checks the final result text, not intermediate tool calls - -LOG_FILE="${1:-}" -PATTERN="${2:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$PATTERN" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Extract the final result text and check if it does NOT contain the pattern -# The result is in a "result" type message with a "result" field -! jq -r 'select(.type == "result") | .result' "$LOG_FILE" 2>/dev/null | grep -q "$PATTERN" diff --git a/agents/skill-bench/tools/check-param.sh b/agents/skill-bench/tools/check-param.sh deleted file mode 100755 index 2e69093..0000000 --- a/agents/skill-bench/tools/check-param.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Check if parameter was used in tool call -# Usage: check-param.sh - -LOG_FILE="$1" -TOOL_NAME="$2" -PARAM_NAME="$3" -EXPECTED_VALUE="$4" - -if [ -z "$LOG_FILE" ] || [ -z "$TOOL_NAME" ] || [ -z "$PARAM_NAME" ]; then - echo "[Error] Usage: $0 [expected_value]" >&2 - exit 1 -fi - -if [ -n "$EXPECTED_VALUE" ]; then - # Check if parameter equals expected value (handle both string and array) - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME | (if type == \"array\" then .[] == \"$EXPECTED_VALUE\" else . == \"$EXPECTED_VALUE\" end)] | any" "$LOG_FILE" -else - # Check if parameter exists - jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? 
| select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME] | length > 0" "$LOG_FILE" -fi diff --git a/agents/skill-bench/tools/check-skill-invoked.sh b/agents/skill-bench/tools/check-skill-invoked.sh deleted file mode 100755 index 6f99948..0000000 --- a/agents/skill-bench/tools/check-skill-invoked.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Check if a specific skill was invoked -# Usage: check-skill-invoked.sh - -LOG_FILE="${1:-}" -SKILL_NAME="${2:-}" - -if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -# Check if the skill was invoked in the log -grep -q '"Skill"' "$LOG_FILE" && grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE" diff --git a/agents/skill-bench/tools/check-skill-loaded.sh b/agents/skill-bench/tools/check-skill-loaded.sh deleted file mode 100755 index 822cfdd..0000000 --- a/agents/skill-bench/tools/check-skill-loaded.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash -# Check if a skill was loaded successfully in a log file -# Usage: check-skill-loaded.sh -# Returns: 0 if skill found in init skills array, 1 if not found - -LOG_FILE="$1" -SKILL_NAME="$2" - -if [[ -z "$LOG_FILE" ]] || [[ -z "$SKILL_NAME" ]]; then - echo "Usage: $0 " >&2 - exit 2 -fi - -if [[ ! 
-f "$LOG_FILE" ]]; then - echo "Log file not found: $LOG_FILE" >&2 - exit 2 -fi - -# Check if skill is in the init skills array (first line is init message) -FOUND=$(head -1 "$LOG_FILE" | jq -c ' - .skills | any(.[]; contains("'$SKILL_NAME'")) -') - -if [[ "$FOUND" != "true" ]]; then - echo "Skill $SKILL_NAME not found in init skills array" >&2 - exit 1 -fi - -exit 0 diff --git a/agents/skill-bench/tools/check-workspace.sh b/agents/skill-bench/tools/check-workspace.sh deleted file mode 100755 index 99f7b7e..0000000 --- a/agents/skill-bench/tools/check-workspace.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -# Check if workspace condition is true -# Usage: check-workspace.sh - -WORK_DIR="$1" -CHECK_CMD="$2" - -if [ -z "$WORK_DIR" ] || [ -z "$CHECK_CMD" ]; then - echo "[Error] Usage: $0 " >&2 - exit 1 -fi - -(cd "$WORK_DIR" && eval "$CHECK_CMD") diff --git a/mise.toml b/mise.toml index b53d9d2..68ebdee 100644 --- a/mise.toml +++ b/mise.toml @@ -9,6 +9,12 @@ run = "RUSTFLAGS=\"-D warnings\" cargo test --all-targets" [tasks.pre-commit] depends = ["fmt", "clippy", "test"] + +[tasks.skill-test] +run = "skill-bench run tests --plugin-dir claude-plugin --log logs" + +[tasks.skill-test-list] +run = "skill-bench list tests" [tasks.coverage] run = """ diff --git a/agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml b/tests/patent-assignee-check/functional-with-country.toml similarity index 59% rename from agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml rename to tests/patent-assignee-check/functional-with-country.toml index 282409a..56efb1e 100644 --- a/agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml +++ b/tests/patent-assignee-check/functional-with-country.toml @@ -10,25 +10,20 @@ Find all spelling variations of Toyota assignee name in Japanese patents (countr [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = 
"mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-assignee-check" +command = { command = "skill-loaded", skill = "patent-assignee-check" } [[checks]] name = "patent_assignee_check_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-assignee-check" +command = { command = "skill-invoked", skill = "patent-assignee-check" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } [[checks]] name = "country_parameter_used" -type = "script" -command = "check-param.sh search_patents country JP" +command = { command = "tool-use", tool = "search_patents", param = "country", value = "JP" } diff --git a/agents/skill-bench/cases/patent-assignee-check/functional.toml b/tests/patent-assignee-check/functional.toml similarity index 58% rename from agents/skill-bench/cases/patent-assignee-check/functional.toml rename to tests/patent-assignee-check/functional.toml index ba5b612..dee0670 100644 --- a/agents/skill-bench/cases/patent-assignee-check/functional.toml +++ b/tests/patent-assignee-check/functional.toml @@ -10,25 +10,20 @@ Find all spelling variations of Google assignee name in patent databases. 
Check [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-assignee-check" +command = { command = "skill-loaded", skill = "patent-assignee-check" } [[checks]] name = "patent_assignee_check_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-assignee-check" +command = { command = "skill-invoked", skill = "patent-assignee-check" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } [[checks]] name = "company_name_parameter_used" -type = "script" -command = "check-param.sh search_patents assignee Google" +command = { command = "tool-use", tool = "search_patents", param = "assignee", value = "Google" } diff --git a/agents/skill-bench/cases/patent-assignee-check/triggering.toml b/tests/patent-assignee-check/triggering.toml similarity index 82% rename from agents/skill-bench/cases/patent-assignee-check/triggering.toml rename to tests/patent-assignee-check/triggering.toml index d9e43bf..5e33ec2 100644 --- a/agents/skill-bench/cases/patent-assignee-check/triggering.toml +++ b/tests/patent-assignee-check/triggering.toml @@ -10,5 +10,4 @@ I want to verify the correct assignee name for "Toyota" in patent databases. 
[[checks]] name = "patent_assignee_check_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-assignee-check" +command = { command = "skill-invoked", skill = "patent-assignee-check" } diff --git a/agents/skill-bench/cases/patent-fetch/functional-data-return.toml b/tests/patent-fetch/functional-data-return.toml similarity index 63% rename from agents/skill-bench/cases/patent-fetch/functional-data-return.toml rename to tests/patent-fetch/functional-data-return.toml index 08c1800..b4cca15 100644 --- a/agents/skill-bench/cases/patent-fetch/functional-data-return.toml +++ b/tests/patent-fetch/functional-data-return.toml @@ -17,30 +17,24 @@ The skill should return the actual patent data, not a file path that requires yo [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-fetch" +command = { command = "skill-loaded", skill = "patent-fetch" } [[checks]] name = "patent_fetch_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-fetch" +command = { command = "skill-invoked", skill = "patent-fetch" } [[checks]] name = "fetch_patent_mcp_called" -type = "script" -command = "check-mcp-success.sh fetch_patent" +command = { command = "mcp-success", tool = "fetch_patent" } [[checks]] name = "title_returned" -type = "script" -command = "check-output-contains.sh 'Serving advertisements'" +command = { command = "message-contains", text = "Serving advertisements" } [[checks]] name = "no_json_file_path_in_output" -type = "script" -command = "check-output-not-contains.sh 'patent-.*\\.json'" +command = { command = "message-contains", text = 'patent-.*\.json', deny = true } diff --git a/agents/skill-bench/cases/patent-fetch/functional.toml b/tests/patent-fetch/functional.toml similarity index 53% rename from 
agents/skill-bench/cases/patent-fetch/functional.toml rename to tests/patent-fetch/functional.toml index a142dab..3b5f9c6 100644 --- a/agents/skill-bench/cases/patent-fetch/functional.toml +++ b/tests/patent-fetch/functional.toml @@ -10,25 +10,20 @@ Fetch patent details for US9152718B2. [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-fetch" +command = { command = "skill-loaded", skill = "patent-fetch" } [[checks]] name = "patent_fetch_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-fetch" +command = { command = "skill-invoked", skill = "patent-fetch" } [[checks]] name = "fetch_patent_mcp_called" -type = "script" -command = "check-mcp-success.sh fetch_patent" +command = { command = "mcp-success", tool = "fetch_patent" } [[checks]] name = "patent_id_parameter_used" -type = "script" -command = "check-param.sh fetch_patent patent_id US9152718B2" +command = { command = "tool-use", tool = "fetch_patent", param = "patent_id", value = "US9152718B2" } diff --git a/agents/skill-bench/cases/patent-fetch/triggering.toml b/tests/patent-fetch/triggering.toml similarity index 81% rename from agents/skill-bench/cases/patent-fetch/triggering.toml rename to tests/patent-fetch/triggering.toml index a06c200..93886e9 100644 --- a/agents/skill-bench/cases/patent-fetch/triggering.toml +++ b/tests/patent-fetch/triggering.toml @@ -10,5 +10,4 @@ I need to get detailed information for patent US1234567B2. 
[[checks]] name = "patent_fetch_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-fetch" +command = { command = "skill-invoked", skill = "patent-fetch" } diff --git a/agents/skill-bench/cases/patent-search/functional-filing.toml b/tests/patent-search/functional-filing.toml similarity index 53% rename from agents/skill-bench/cases/patent-search/functional-filing.toml rename to tests/patent-search/functional-filing.toml index ffd7070..b12d490 100644 --- a/agents/skill-bench/cases/patent-search/functional-filing.toml +++ b/tests/patent-search/functional-filing.toml @@ -10,30 +10,24 @@ Search for patents about "ai" with filing date between 2023-01-01 and 2023-12-31 [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-search" +command = { command = "skill-loaded", skill = "patent-search" } [[checks]] name = "patent_search_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-search" +command = { command = "skill-invoked", skill = "patent-search" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } [[checks]] name = "filing_after_parameter_used" -type = "script" -command = "check-param.sh search_patents filing_after 2023-01-01" +command = { command = "tool-use", tool = "search_patents", param = "filing_after", value = "2023-01-01" } [[checks]] name = "filing_before_parameter_used" -type = "script" -command = "check-param.sh search_patents filing_before 2023-12-31" +command = { command = "tool-use", tool = "search_patents", param = "filing_before", value = "2023-12-31" } diff --git a/agents/skill-bench/cases/patent-search/functional-priority.toml b/tests/patent-search/functional-priority.toml 
similarity index 53% rename from agents/skill-bench/cases/patent-search/functional-priority.toml rename to tests/patent-search/functional-priority.toml index 664a0ea..8457d86 100644 --- a/agents/skill-bench/cases/patent-search/functional-priority.toml +++ b/tests/patent-search/functional-priority.toml @@ -10,30 +10,24 @@ Search for patents about "ai" with priority date between 2023-01-01 and 2023-12- [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-search" +command = { command = "skill-loaded", skill = "patent-search" } [[checks]] name = "patent_search_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-search" +command = { command = "skill-invoked", skill = "patent-search" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } [[checks]] name = "priority_after_parameter_used" -type = "script" -command = "check-param.sh search_patents priority_after 2023-01-01" +command = { command = "tool-use", tool = "search_patents", param = "priority_after", value = "2023-01-01" } [[checks]] name = "priority_before_parameter_used" -type = "script" -command = "check-param.sh search_patents priority_before 2023-12-31" +command = { command = "tool-use", tool = "search_patents", param = "priority_before", value = "2023-12-31" } diff --git a/agents/skill-bench/cases/patent-search/functional-publication.toml b/tests/patent-search/functional-publication.toml similarity index 54% rename from agents/skill-bench/cases/patent-search/functional-publication.toml rename to tests/patent-search/functional-publication.toml index f906c68..b39a5d2 100644 --- a/agents/skill-bench/cases/patent-search/functional-publication.toml +++ 
b/tests/patent-search/functional-publication.toml @@ -10,30 +10,24 @@ Search for patents about "ai" with publication date between 2023-01-01 and 2023- [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-search" +command = { command = "skill-loaded", skill = "patent-search" } [[checks]] name = "patent_search_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-search" +command = { command = "skill-invoked", skill = "patent-search" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } [[checks]] name = "publication_after_parameter_used" -type = "script" -command = "check-param.sh search_patents publication_after 2023-01-01" +command = { command = "tool-use", tool = "search_patents", param = "publication_after", value = "2023-01-01" } [[checks]] name = "publication_before_parameter_used" -type = "script" -command = "check-param.sh search_patents publication_before 2023-12-31" +command = { command = "tool-use", tool = "search_patents", param = "publication_before", value = "2023-12-31" } diff --git a/agents/skill-bench/cases/patent-search/functional.toml b/tests/patent-search/functional.toml similarity index 61% rename from agents/skill-bench/cases/patent-search/functional.toml rename to tests/patent-search/functional.toml index aee9b84..449358b 100644 --- a/agents/skill-bench/cases/patent-search/functional.toml +++ b/tests/patent-search/functional.toml @@ -10,20 +10,16 @@ I want to search for patents about machine learning, show me 5 results. 
[[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-search" +command = { command = "skill-loaded", skill = "patent-search" } [[checks]] name = "patent_search_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-search" +command = { command = "skill-invoked", skill = "patent-search" } [[checks]] name = "search_patents_mcp_called" -type = "script" -command = "check-mcp-success.sh search_patents" +command = { command = "mcp-success", tool = "search_patents" } diff --git a/agents/skill-bench/cases/patent-search/triggering.toml b/tests/patent-search/triggering.toml similarity index 64% rename from agents/skill-bench/cases/patent-search/triggering.toml rename to tests/patent-search/triggering.toml index 4badf47..4b01efa 100644 --- a/agents/skill-bench/cases/patent-search/triggering.toml +++ b/tests/patent-search/triggering.toml @@ -10,15 +10,12 @@ I need to search for patents about artificial intelligence. [[checks]] name = "mcp_server_loaded" -type = "script" -command = "check-mcp-loaded.sh google-patent-cli" +command = { command = "mcp-loaded", server = "google-patent-cli" } [[checks]] name = "skill_loaded" -type = "script" -command = "check-skill-loaded.sh patent-search" +command = { command = "skill-loaded", skill = "patent-search" } [[checks]] name = "patent_search_skill_invoked" -type = "script" -command = "check-skill-invoked.sh patent-search" +command = { command = "skill-invoked", skill = "patent-search" }