From a7419dbaad220d9ea09736ea7744a56d6857a745 Mon Sep 17 00:00:00 2001
From: sonesuke <sonesuke@users.noreply.github.com>
Date: Mon, 30 Mar 2026 06:00:49 +0000
Subject: [PATCH 1/2] fix: document cypher patterns and forbid direct JSON file
 access

Add explicit cypher query examples and a CRITICAL instruction to use
execute_cypher for all result retrieval. Models were bypassing cypher
and reading /tmp/patent-search-*.json with python3, causing redundant
file reads and wasted time.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../skills/patent-assignee-check/SKILL.md     | 18 +++++++
 claude-plugin/skills/patent-search/SKILL.md   | 47 +++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/claude-plugin/skills/patent-assignee-check/SKILL.md b/claude-plugin/skills/patent-assignee-check/SKILL.md
index cb01f73..b363917 100644
--- a/claude-plugin/skills/patent-assignee-check/SKILL.md
+++ b/claude-plugin/skills/patent-assignee-check/SKILL.md
@@ -38,6 +38,24 @@ execute_cypher({
 })
 ```
 
+**CRITICAL**: After searching, always use `execute_cypher` to retrieve results.
+Do NOT read the output JSON file directly. The JSON file is an internal
+artifact — all data is available through cypher queries.
+
+### Result Retrieval Patterns
+
+**Assignee name variations with counts**:
+
+```cypher
+MATCH (p:Patent) RETURN p.assignee, COUNT(*) AS count ORDER BY count DESC
+```
+
+**Assignee variations with sample titles**:
+
+```cypher
+MATCH (p:Patent) RETURN p.assignee, p.title, p.snippet LIMIT 20
+```
+
 ## Parameters
 
 - `company_name` (string, required): Company name to check for variations
diff --git a/claude-plugin/skills/patent-search/SKILL.md b/claude-plugin/skills/patent-search/SKILL.md
index c7de781..06f7942 100644
--- a/claude-plugin/skills/patent-search/SKILL.md
+++ b/claude-plugin/skills/patent-search/SKILL.md
@@ -38,6 +38,53 @@ execute_cypher({
 })
 ```
 
+**CRITICAL**: After searching, always use `execute_cypher` to retrieve results.
+Do NOT read the output JSON file directly. The JSON file is an internal
+artifact — all data is available through cypher queries.
+
+### Result Retrieval Patterns
+
+Use these cypher patterns to retrieve search results:
+
+**Total count**:
+
+```cypher
+MATCH (p:Patent) RETURN COUNT(*) AS count
+```
+
+**Top 20 snippets for noise analysis**:
+
+```cypher
+MATCH (p:Patent) RETURN p.id, p.title, p.snippet, p.assignee LIMIT 20
+```
+
+**Assignee breakdown**:
+
+```cypher
+MATCH (p:Patent) RETURN p.assignee, COUNT(*) AS count ORDER BY count DESC
+```
+
+**Filing dates with titles (sample of 10)**:
+
+```cypher
+MATCH (p:Patent) RETURN p.filing_date, p.title LIMIT 10
+```
+
+### Available Patent Node Fields
+
+| Field              | Description                     |
+| ------------------ | ------------------------------- |
+| `id`               | Patent ID (e.g., "US9152718B2") |
+| `title`            | Patent title                    |
+| `snippet`          | Search result snippet           |
+| `abstract_text`    | Full abstract                   |
+| `assignee`         | Assignee/applicant name         |
+| `filing_date`      | Filing date                     |
+| `publication_date` | Publication date                |
+| `url`              | Google Patents URL              |
+| `legal_status`     | Legal status                    |
+| `family_id`        | Patent family ID                |
+
 ### Date Filter Examples
 
 Search patents filed in 2023:

From b188accf3429f5cf8f15a80dd986ce8bd3af1d79 Mon Sep 17 00:00:00 2001
From: sonesuke <sonesuke@users.noreply.github.com>
Date: Mon, 30 Mar 2026 07:33:21 +0000
Subject: [PATCH 2/2] refactor: migrate skill-bench from shell scripts to
 CLI-based tests

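Replace runner.sh and tools/*.sh with skill-bench CLI native TOML format.
Move test cases from agents/skill-bench/cases/ to tests/.
Add `skill-test` and `skill-test-list` mise tasks and update AGENTS.md to
describe the new CLI-based workflow.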

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 AGENTS.md                                     |  13 +-
 agents/skill-bench/.gitignore                 |   1 -
 agents/skill-bench/runner.sh                  | 177 ------------------
 agents/skill-bench/tools/check-mcp-loaded.sh  |  37 ----
 agents/skill-bench/tools/check-mcp-success.sh |  62 ------
 .../tools/check-output-contains.sh            |  17 --
 agents/skill-bench/tools/check-output-file.sh |  14 --
 .../tools/check-output-not-contains.sh        |  17 --
 agents/skill-bench/tools/check-param.sh       |  21 ---
 .../skill-bench/tools/check-skill-invoked.sh  |  14 --
 .../skill-bench/tools/check-skill-loaded.sh   |  29 ---
 agents/skill-bench/tools/check-workspace.sh   |  13 --
 mise.toml                                     |   6 +
 .../functional-with-country.toml              |  15 +-
 .../patent-assignee-check/functional.toml     |  15 +-
 .../patent-assignee-check/triggering.toml     |   3 +-
 .../patent-fetch/functional-data-return.toml  |  18 +-
 .../patent-fetch/functional.toml              |  15 +-
 .../patent-fetch/triggering.toml              |   3 +-
 .../patent-search/functional-filing.toml      |  18 +-
 .../patent-search/functional-priority.toml    |  18 +-
 .../patent-search/functional-publication.toml |  18 +-
 .../patent-search/functional.toml             |  12 +-
 .../patent-search/triggering.toml             |   9 +-
 24 files changed, 59 insertions(+), 506 deletions(-)
 delete mode 100644 agents/skill-bench/.gitignore
 delete mode 100755 agents/skill-bench/runner.sh
 delete mode 100755 agents/skill-bench/tools/check-mcp-loaded.sh
 delete mode 100755 agents/skill-bench/tools/check-mcp-success.sh
 delete mode 100755 agents/skill-bench/tools/check-output-contains.sh
 delete mode 100755 agents/skill-bench/tools/check-output-file.sh
 delete mode 100755 agents/skill-bench/tools/check-output-not-contains.sh
 delete mode 100755 agents/skill-bench/tools/check-param.sh
 delete mode 100755 agents/skill-bench/tools/check-skill-invoked.sh
 delete mode 100755 agents/skill-bench/tools/check-skill-loaded.sh
 delete mode 100755 agents/skill-bench/tools/check-workspace.sh
 rename {agents/skill-bench/cases => tests}/patent-assignee-check/functional-with-country.toml (59%)
 rename {agents/skill-bench/cases => tests}/patent-assignee-check/functional.toml (58%)
 rename {agents/skill-bench/cases => tests}/patent-assignee-check/triggering.toml (82%)
 rename {agents/skill-bench/cases => tests}/patent-fetch/functional-data-return.toml (63%)
 rename {agents/skill-bench/cases => tests}/patent-fetch/functional.toml (53%)
 rename {agents/skill-bench/cases => tests}/patent-fetch/triggering.toml (81%)
 rename {agents/skill-bench/cases => tests}/patent-search/functional-filing.toml (53%)
 rename {agents/skill-bench/cases => tests}/patent-search/functional-priority.toml (53%)
 rename {agents/skill-bench/cases => tests}/patent-search/functional-publication.toml (54%)
 rename {agents/skill-bench/cases => tests}/patent-search/functional.toml (61%)
 rename {agents/skill-bench/cases => tests}/patent-search/triggering.toml (64%)

diff --git a/AGENTS.md b/AGENTS.md
index 664ac25..5f248db 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -37,11 +37,6 @@ agents/
     tools/              # Agent tools
       load-progress.sh  # Read past context (JSONL)
       record-progress.sh # Write progress logs (JSONL)
-  skill-bench/          # Claude Code Skill testing framework
-    runner.sh           # Test runner
-    cases/              # Test case definitions (TOML, test prompts in English)
-    tools/              # Check scripts for validating test results
-    logs/               # Test execution logs
 claude-plugin/          # Claude Code Plugin structure
   skills/               # Individual skill definitions
 mise.toml               # Task definitions (fmt, clippy, test, pre-commit)
@@ -50,11 +45,13 @@ mise.toml               # Task definitions (fmt, clippy, test, pre-commit)
 
 ## Skill-Bench Testing
 
-`agents/skill-bench/` contains the testing framework for Claude Code skills:
+`tests/` contains skill test cases using the `skill-bench` CLI:
 
 - **Test cases are in English** - All `test_prompt` values in TOML files must be English
-- **Run tests**: `./agents/skill-bench/runner.sh` (executes inside container)
-- **Test pattern**: `./agents/skill-bench/runner.sh "cases/*/*.toml"`
+- **List tests**: `skill-bench list tests`
+- **Run tests**: `mise run skill-test` (runs `skill-bench run tests --plugin-dir claude-plugin --log logs`)
+- **Filter by skill**: `skill-bench run tests --skill patent-search`
+- **Log directory**: `skill-bench run tests --log logs/`
 
 ## Tools
 
diff --git a/agents/skill-bench/.gitignore b/agents/skill-bench/.gitignore
deleted file mode 100644
index 333c1e9..0000000
--- a/agents/skill-bench/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-logs/
diff --git a/agents/skill-bench/runner.sh b/agents/skill-bench/runner.sh
deleted file mode 100755
index c49c236..0000000
--- a/agents/skill-bench/runner.sh
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/bin/bash
-# agents/skill-bench/runner.sh
-# Skill test runner for google-patent-cli.
-# All execution happens inside the container.
-#
-# Usage: ./runner.sh [pattern]
-#   pattern:  Glob pattern to match test files (default: "cases/*/*.toml")
-
-set -o pipefail
-
-# Determine workspace root
-WORKSPACE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
-# Determine skill-bench root
-SKILL_BENCH_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-# Resolve pattern relative to skill-bench root
-PATTERN="${1:-cases/*/*.toml}"
-# Convert to absolute path
-if [[ "$PATTERN" != /* ]]; then
-    TARGET_PATTERN="$SKILL_BENCH_ROOT/$PATTERN"
-else
-    TARGET_PATTERN="$PATTERN"
-fi
-
-echo "=================================================="
-echo "[SkillBench] Starting Skill Test Runner"
-echo "[SkillBench] Workspace: $WORKSPACE_ROOT"
-echo "[SkillBench] Pattern: $TARGET_PATTERN"
-echo "=================================================="
-
-TOTAL_CASES=0
-TOTAL_PASS=0
-TOTAL_FAIL=0
-
-# --- Collect test files matching pattern ---
-TEST_FILES=()
-TEST_SKILLS=()
-TEST_NAMES=()
-
-for TEST_FILE in $TARGET_PATTERN; do
-    [ -f "$TEST_FILE" ] || continue
-
-    TEST_FILE_REL="${TEST_FILE#$WORKSPACE_ROOT/}"
-    SKILL_NAME=$(basename "$(dirname "$TEST_FILE_REL")")
-    TEST_NAME=$(basename "$TEST_FILE" .toml)
-
-    TEST_FILES+=("$TEST_FILE")
-    TEST_SKILLS+=("$SKILL_NAME")
-    TEST_NAMES+=("$TEST_NAME")
-done
-
-if [ ${#TEST_FILES[@]} -eq 0 ]; then
-    echo "[SkillBench] No test files found matching pattern: $TARGET_PATTERN"
-    exit 1
-fi
-
-# --- Process each test file ---
-for IDX in "${!TEST_FILES[@]}"; do
-    TEST_FILE="${TEST_FILES[$IDX]}"
-    SKILL_NAME="${TEST_SKILLS[$IDX]}"
-    TEST_NAME="${TEST_NAMES[$IDX]}"
-    TEST_CASE_NAME="${SKILL_NAME}/${TEST_NAME}"
-    TOTAL_CASES=$((TOTAL_CASES + 1))
-
-    # Read test configuration
-    TEST_PROMPT=$(yq eval '.test_prompt' "$TEST_FILE")
-    TEST_TIMEOUT=$(yq eval '.timeout // 300' "$TEST_FILE")
-
-    echo ""
-    echo "──────────────────────────────────────────────────"
-    echo "[SkillBench] Test Case: $TEST_CASE_NAME"
-    echo "──────────────────────────────────────────────────"
-
-    # --- Phase 1: Setup and Execute trial ---
-    # Create skill-specific log directory
-    LOG_DIR="$SKILL_BENCH_ROOT/logs/${SKILL_NAME}"
-    mkdir -p "$LOG_DIR"
-
-    TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-    LOG_FILE="$LOG_DIR/${TIMESTAMP}_${TEST_NAME}.log"
-    WORK_DIR="/tmp/skill-bench-${TIMESTAMP}_${SKILL_NAME}-${TEST_NAME}"
-
-    # Setup workspace
-    echo "[SkillBench]   📦 Setting up workspace: $WORK_DIR"
-    rm -rf "${WORK_DIR}"
-    mkdir -p "${WORK_DIR}"
-
-    # Copy claude-plugin (required for skill testing)
-    cp -r "$WORKSPACE_ROOT/claude-plugin" "$WORK_DIR/" 2>/dev/null || true
-
-    # Read setup files from test.toml [[setup]] array
-    NUM_SETUP=$(yq eval '.setup | length // 0' "$TEST_FILE")
-    if [ "$NUM_SETUP" -gt 0 ]; then
-        for SETUP_IDX in $(seq 0 $((NUM_SETUP - 1))); do
-            SETUP_PATH=$(yq eval ".setup[$SETUP_IDX].path" "$TEST_FILE")
-            SETUP_DIR=$(dirname "$WORK_DIR/$SETUP_PATH")
-            mkdir -p "$SETUP_DIR"
-            yq eval ".setup[$SETUP_IDX].content" "$TEST_FILE" > "$WORK_DIR/${SETUP_PATH}"
-        done
-    fi
-
-    # Execute trial
-    echo "[SkillBench]   Running trial → $LOG_FILE"
-    START_TIME=$(date +%s)
-
-    # Unset CLAUDECODE to avoid nested session error
-    (cd "$WORK_DIR" && unset CLAUDECODE && claude -p \
-        --dangerously-skip-permissions \
-        --verbose \
-        --output-format stream-json \
-        --plugin-dir ./claude-plugin \
-        -- "$TEST_PROMPT" < /dev/null | jq -c '(. + {timestamp: now})') > "$LOG_FILE" 2>&1
-
-    EXIT_CODE=$?
-    END_TIME=$(date +%s)
-    DURATION=$(( END_TIME - START_TIME ))
-
-    if [ $EXIT_CODE -eq 0 ]; then
-        echo "[SkillBench]   ✅ Trial finished (took ${DURATION}s)"
-    else
-        echo "[SkillBench]   ⚠️  Trial exited with code $EXIT_CODE (took ${DURATION}s)"
-    fi
-
-    # --- Phase 2: Evaluate trial ---
-    echo "[SkillBench]   Running evaluation..."
-
-    CASE_PASS=true
-    TOOLS_DIR="$(cd "$(dirname "$0")" && pwd)/tools"
-
-    # Run checks from test.toml
-    NUM_CHECKS=$(yq eval '.checks | length' "$TEST_FILE")
-    for CHECK_IDX in $(seq 0 $((NUM_CHECKS - 1))); do
-        CHECK_NAME=$(yq eval ".checks[$CHECK_IDX].name" "$TEST_FILE")
-        CHECK_CMD=$(yq eval ".checks[$CHECK_IDX].command" "$TEST_FILE")
-
-        # Parse check command into script and args
-        # Use eval to properly handle quoted arguments
-        CHECK_SCRIPT=$(echo "$CHECK_CMD" | awk '{print $1}')
-        CHECK_ARGS=$(echo "$CHECK_CMD" | cut -d' ' -f2-)
-
-        if [ -n "$CHECK_ARGS" ]; then
-            # Command has arguments: script.sh arg1 arg2...
-            # Use eval to properly expand quoted arguments
-            if eval "$TOOLS_DIR/$CHECK_SCRIPT \"\$LOG_FILE\" $CHECK_ARGS" >/dev/null 2>&1; then
-                echo "[SkillBench]     ✅ $CHECK_NAME"
-            else
-                echo "[SkillBench]     ❌ $CHECK_NAME"
-                CASE_PASS=false
-            fi
-        else
-            # Command has no arguments: script.sh (takes only LOG_FILE)
-            if $TOOLS_DIR/$CHECK_SCRIPT "$LOG_FILE" >/dev/null 2>&1; then
-                echo "[SkillBench]     ✅ $CHECK_NAME"
-            else
-                echo "[SkillBench]     ❌ $CHECK_NAME"
-                CASE_PASS=false
-            fi
-        fi
-    done
-
-    # Display case result
-    if [ "$CASE_PASS" = true ]; then
-        echo "[SkillBench]   ✅ $TEST_CASE_NAME: PASS"
-        TOTAL_PASS=$((TOTAL_PASS + 1))
-    else
-        echo "[SkillBench]   ❌ $TEST_CASE_NAME: FAIL"
-        TOTAL_FAIL=$((TOTAL_FAIL + 1))
-    fi
-done
-
-# --- Summary ---
-echo ""
-echo "=================================================="
-echo "[SkillBench] Test Summary"
-echo "[SkillBench] Total: $TOTAL_CASES | Pass: $TOTAL_PASS | Fail: $TOTAL_FAIL"
-echo "=================================================="
-
-exit "$TOTAL_FAIL"
diff --git a/agents/skill-bench/tools/check-mcp-loaded.sh b/agents/skill-bench/tools/check-mcp-loaded.sh
deleted file mode 100755
index d7c0118..0000000
--- a/agents/skill-bench/tools/check-mcp-loaded.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# Check if MCP server loaded successfully in a log file
-# Usage: check-mcp-loaded.sh <log_file> <mcp_server_name>
-# Returns: 0 if MCP server loaded successfully, 1 if failed or not found
-
-LOG_FILE="$1"
-MCP_SERVER_NAME="$2"
-
-if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_SERVER_NAME" ]]; then
-  echo "Usage: $0 <log_file> <mcp_server_name>" >&2
-  exit 2
-fi
-
-if [[ ! -f "$LOG_FILE" ]]; then
-  echo "Log file not found: $LOG_FILE" >&2
-  exit 2
-fi
-
-# Check MCP server status in init message (first line is init message)
-# The mcp_servers array contains objects with name and status fields
-STATUS=$(head -1 "$LOG_FILE" | jq -r '
-  .mcp_servers? // []
-  | .[] | select(.name? | test("'"$MCP_SERVER_NAME"'"))
-  | .status // "not_found"
-')
-
-if [[ "$STATUS" == "not_found" ]]; then
-  echo "MCP server $MCP_SERVER_NAME not found in log" >&2
-  exit 1
-fi
-
-if [[ "$STATUS" == "failed" ]]; then
-  echo "MCP server $MCP_SERVER_NAME failed to load (status: failed)" >&2
-  exit 1
-fi
-
-exit 0
diff --git a/agents/skill-bench/tools/check-mcp-success.sh b/agents/skill-bench/tools/check-mcp-success.sh
deleted file mode 100755
index b3ad0ec..0000000
--- a/agents/skill-bench/tools/check-mcp-success.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/bin/bash
-# Check if MCP tool calls succeeded in a log file
-# Usage: check-mcp-success.sh <log_file> <mcp_tool_name> [--optional]
-# Returns: 0 if all MCP calls succeeded (or none made with --optional), 1 if any failed
-
-LOG_FILE="$1"
-MCP_TOOL_NAME="$2"
-OPTIONAL_FLAG="${3:-}"
-
-if [[ -z "$LOG_FILE" ]] || [[ -z "$MCP_TOOL_NAME" ]]; then
-  echo "Usage: $0 <log_file> <mcp_tool_name> [--optional]" >&2
-  exit 2
-fi
-
-if [[ ! -f "$LOG_FILE" ]]; then
-  echo "Log file not found: $LOG_FILE" >&2
-  exit 2
-fi
-
-# Extract tool_use IDs for the specified MCP tool
-TOOL_USE_IDS=$(jq -r '
-  .[]
-  | select(.type? == "assistant")
-  | (.message.content? // [])
-  | select(type == "array")
-  | .[]
-  | select(type == "object" and .type? == "tool_use" and (.name? // "") | test("'"$MCP_TOOL_NAME"'"))
-  | .id
-' "$LOG_FILE")
-
-ID_COUNT=$(echo "$TOOL_USE_IDS" | grep -c '^\w*$' || true)
-
-if [[ $ID_COUNT -eq 0 ]]; then
-  if [[ "$OPTIONAL_FLAG" == "--optional" ]]; then
-    exit 0
-  else
-    echo "No $MCP_TOOL_NAME tool calls found in log" >&2
-    exit 1
-  fi
-fi
-
-# Check if any tool_results have is_error: true
-while IFS= read -r tool_id; do
-  if [[ -n "$tool_id" ]]; then
-    ERROR_CHECK=$(jq -r "
-      .[]
-      | select(.type? == \"user\")
-      | (.message.content? // [])
-      | select(type == \"array\")
-      | .[]
-      | select(type == \"object\" and .type? == \"tool_result\" and .tool_use_id? == \"$tool_id\")
-      | .is_error // false
-    " "$LOG_FILE")
-
-    if [[ "$ERROR_CHECK" == "true" ]]; then
-      echo "MCP tool $MCP_TOOL_NAME (tool_use_id: $tool_id) returned an error" >&2
-      exit 1
-    fi
-  fi
-done <<< "$TOOL_USE_IDS"
-
-exit 0
diff --git a/agents/skill-bench/tools/check-output-contains.sh b/agents/skill-bench/tools/check-output-contains.sh
deleted file mode 100755
index 47af684..0000000
--- a/agents/skill-bench/tools/check-output-contains.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-# Check if the final output contains a specific string
-# Usage: check-output-contains.sh <log_file> <pattern>
-#
-# This checks the final result text, not intermediate tool calls
-
-LOG_FILE="${1:-}"
-PATTERN="${2:-}"
-
-if [ -z "$LOG_FILE" ] || [ -z "$PATTERN" ]; then
-    echo "[Error] Usage: $0 <log_file> <pattern>" >&2
-    exit 1
-fi
-
-# Extract the final result text and check if it contains the pattern
-# The result is in a "result" type message with a "result" field
-jq -r 'select(.type == "result") | .result' "$LOG_FILE" 2>/dev/null | grep -q "$PATTERN"
diff --git a/agents/skill-bench/tools/check-output-file.sh b/agents/skill-bench/tools/check-output-file.sh
deleted file mode 100755
index dba0a34..0000000
--- a/agents/skill-bench/tools/check-output-file.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-# Check if output_file was created in log
-# Usage: check-output-file.sh <log_file>
-
-LOG_FILE="$1"
-
-if [ -z "$LOG_FILE" ]; then
-    echo "[Error] Usage: $0 <log_file>" >&2
-    exit 1
-fi
-
-# Check if output_file exists in tool_result content
-# Use try/catch to handle invalid JSON in content field
-jq -s '[.[] | select(.type == "user") | .message.content[]? | select(type == "object" and .type == "tool_result" and .tool_use_id? and .content? != null and (.content | type) == "string") | .content | try fromjson catch null | select(. != null) | .output_file] | length > 0' "$LOG_FILE"
diff --git a/agents/skill-bench/tools/check-output-not-contains.sh b/agents/skill-bench/tools/check-output-not-contains.sh
deleted file mode 100755
index 242a455..0000000
--- a/agents/skill-bench/tools/check-output-not-contains.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-# Check if the final output does NOT contain a specific pattern
-# Usage: check-output-not-contains.sh <log_file> <pattern>
-#
-# This checks the final result text, not intermediate tool calls
-
-LOG_FILE="${1:-}"
-PATTERN="${2:-}"
-
-if [ -z "$LOG_FILE" ] || [ -z "$PATTERN" ]; then
-    echo "[Error] Usage: $0 <log_file> <pattern>" >&2
-    exit 1
-fi
-
-# Extract the final result text and check if it does NOT contain the pattern
-# The result is in a "result" type message with a "result" field
-! jq -r 'select(.type == "result") | .result' "$LOG_FILE" 2>/dev/null | grep -q "$PATTERN"
diff --git a/agents/skill-bench/tools/check-param.sh b/agents/skill-bench/tools/check-param.sh
deleted file mode 100755
index 2e69093..0000000
--- a/agents/skill-bench/tools/check-param.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/bin/bash
-# Check if parameter was used in tool call
-# Usage: check-param.sh <log_file> <tool_name> <param_name> <expected_value>
-
-LOG_FILE="$1"
-TOOL_NAME="$2"
-PARAM_NAME="$3"
-EXPECTED_VALUE="$4"
-
-if [ -z "$LOG_FILE" ] || [ -z "$TOOL_NAME" ] || [ -z "$PARAM_NAME" ]; then
-    echo "[Error] Usage: $0 <log_file> <tool_name> <param_name> [expected_value]" >&2
-    exit 1
-fi
-
-if [ -n "$EXPECTED_VALUE" ]; then
-    # Check if parameter equals expected value (handle both string and array)
-    jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME | (if type == \"array\" then .[] == \"$EXPECTED_VALUE\" else . == \"$EXPECTED_VALUE\" end)] | any" "$LOG_FILE"
-else
-    # Check if parameter exists
-    jq -s "[.[] | select(.type == \"assistant\") | .message.content[]? | select(type == \"object\" and .type == \"tool_use\") | select(.name | test(\"$TOOL_NAME\"; \"i\")) | .input.$PARAM_NAME] | length > 0" "$LOG_FILE"
-fi
diff --git a/agents/skill-bench/tools/check-skill-invoked.sh b/agents/skill-bench/tools/check-skill-invoked.sh
deleted file mode 100755
index 6f99948..0000000
--- a/agents/skill-bench/tools/check-skill-invoked.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-# Check if a specific skill was invoked
-# Usage: check-skill-invoked.sh <log_file> <skill_name>
-
-LOG_FILE="${1:-}"
-SKILL_NAME="${2:-}"
-
-if [ -z "$LOG_FILE" ] || [ -z "$SKILL_NAME" ]; then
-    echo "[Error] Usage: $0 <log_file> <skill_name>" >&2
-    exit 1
-fi
-
-# Check if the skill was invoked in the log
-grep -q '"Skill"' "$LOG_FILE" && grep -q '"skill":".*'"$SKILL_NAME" "$LOG_FILE"
diff --git a/agents/skill-bench/tools/check-skill-loaded.sh b/agents/skill-bench/tools/check-skill-loaded.sh
deleted file mode 100755
index 822cfdd..0000000
--- a/agents/skill-bench/tools/check-skill-loaded.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Check if a skill was loaded successfully in a log file
-# Usage: check-skill-loaded.sh <log_file> <skill_name>
-# Returns: 0 if skill found in init skills array, 1 if not found
-
-LOG_FILE="$1"
-SKILL_NAME="$2"
-
-if [[ -z "$LOG_FILE" ]] || [[ -z "$SKILL_NAME" ]]; then
-  echo "Usage: $0 <log_file> <skill_name>" >&2
-  exit 2
-fi
-
-if [[ ! -f "$LOG_FILE" ]]; then
-  echo "Log file not found: $LOG_FILE" >&2
-  exit 2
-fi
-
-# Check if skill is in the init skills array (first line is init message)
-FOUND=$(head -1 "$LOG_FILE" | jq -c '
-  .skills | any(.[]; contains("'$SKILL_NAME'"))
-')
-
-if [[ "$FOUND" != "true" ]]; then
-  echo "Skill $SKILL_NAME not found in init skills array" >&2
-  exit 1
-fi
-
-exit 0
diff --git a/agents/skill-bench/tools/check-workspace.sh b/agents/skill-bench/tools/check-workspace.sh
deleted file mode 100755
index 99f7b7e..0000000
--- a/agents/skill-bench/tools/check-workspace.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-# Check if workspace condition is true
-# Usage: check-workspace.sh <work_dir> <command>
-
-WORK_DIR="$1"
-CHECK_CMD="$2"
-
-if [ -z "$WORK_DIR" ] || [ -z "$CHECK_CMD" ]; then
-    echo "[Error] Usage: $0 <work_dir> <command>" >&2
-    exit 1
-fi
-
-(cd "$WORK_DIR" && eval "$CHECK_CMD")
diff --git a/mise.toml b/mise.toml
index b53d9d2..68ebdee 100644
--- a/mise.toml
+++ b/mise.toml
@@ -9,6 +9,12 @@ run = "RUSTFLAGS=\"-D warnings\" cargo test --all-targets"
 
 [tasks.pre-commit]
 depends = ["fmt", "clippy", "test"]
+
+[tasks.skill-test]
+run = "skill-bench run tests --plugin-dir claude-plugin --log logs"
+
+[tasks.skill-test-list]
+run = "skill-bench list tests"
  
 [tasks.coverage]
 run = """
diff --git a/agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml b/tests/patent-assignee-check/functional-with-country.toml
similarity index 59%
rename from agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml
rename to tests/patent-assignee-check/functional-with-country.toml
index 282409a..56efb1e 100644
--- a/agents/skill-bench/cases/patent-assignee-check/functional-with-country.toml
+++ b/tests/patent-assignee-check/functional-with-country.toml
@@ -10,25 +10,20 @@ Find all spelling variations of Toyota assignee name in Japanese patents (countr
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-assignee-check"
+command = { command = "skill-loaded", skill = "patent-assignee-check" }
 
 [[checks]]
 name = "patent_assignee_check_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-assignee-check"
+command = { command = "skill-invoked", skill = "patent-assignee-check" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
 
 [[checks]]
 name = "country_parameter_used"
-type = "script"
-command = "check-param.sh search_patents country JP"
+command = { command = "tool-use", tool = "search_patents", param = "country", value = "JP" }
diff --git a/agents/skill-bench/cases/patent-assignee-check/functional.toml b/tests/patent-assignee-check/functional.toml
similarity index 58%
rename from agents/skill-bench/cases/patent-assignee-check/functional.toml
rename to tests/patent-assignee-check/functional.toml
index ba5b612..dee0670 100644
--- a/agents/skill-bench/cases/patent-assignee-check/functional.toml
+++ b/tests/patent-assignee-check/functional.toml
@@ -10,25 +10,20 @@ Find all spelling variations of Google assignee name in patent databases. Check
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-assignee-check"
+command = { command = "skill-loaded", skill = "patent-assignee-check" }
 
 [[checks]]
 name = "patent_assignee_check_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-assignee-check"
+command = { command = "skill-invoked", skill = "patent-assignee-check" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
 
 [[checks]]
 name = "company_name_parameter_used"
-type = "script"
-command = "check-param.sh search_patents assignee Google"
+command = { command = "tool-use", tool = "search_patents", param = "assignee", value = "Google" }
diff --git a/agents/skill-bench/cases/patent-assignee-check/triggering.toml b/tests/patent-assignee-check/triggering.toml
similarity index 82%
rename from agents/skill-bench/cases/patent-assignee-check/triggering.toml
rename to tests/patent-assignee-check/triggering.toml
index d9e43bf..5e33ec2 100644
--- a/agents/skill-bench/cases/patent-assignee-check/triggering.toml
+++ b/tests/patent-assignee-check/triggering.toml
@@ -10,5 +10,4 @@ I want to verify the correct assignee name for "Toyota" in patent databases.
 
 [[checks]]
 name = "patent_assignee_check_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-assignee-check"
+command = { command = "skill-invoked", skill = "patent-assignee-check" }
diff --git a/agents/skill-bench/cases/patent-fetch/functional-data-return.toml b/tests/patent-fetch/functional-data-return.toml
similarity index 63%
rename from agents/skill-bench/cases/patent-fetch/functional-data-return.toml
rename to tests/patent-fetch/functional-data-return.toml
index 08c1800..b4cca15 100644
--- a/agents/skill-bench/cases/patent-fetch/functional-data-return.toml
+++ b/tests/patent-fetch/functional-data-return.toml
@@ -17,30 +17,24 @@ The skill should return the actual patent data, not a file path that requires yo
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-fetch"
+command = { command = "skill-loaded", skill = "patent-fetch" }
 
 [[checks]]
 name = "patent_fetch_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-fetch"
+command = { command = "skill-invoked", skill = "patent-fetch" }
 
 [[checks]]
 name = "fetch_patent_mcp_called"
-type = "script"
-command = "check-mcp-success.sh fetch_patent"
+command = { command = "mcp-success", tool = "fetch_patent" }
 
 [[checks]]
 name = "title_returned"
-type = "script"
-command = "check-output-contains.sh 'Serving advertisements'"
+command = { command = "message-contains", text = "Serving advertisements" }
 
 [[checks]]
 name = "no_json_file_path_in_output"
-type = "script"
-command = "check-output-not-contains.sh 'patent-.*\\.json'"
+command = { command = "message-contains", text = 'patent-.*\.json', deny = true }
diff --git a/agents/skill-bench/cases/patent-fetch/functional.toml b/tests/patent-fetch/functional.toml
similarity index 53%
rename from agents/skill-bench/cases/patent-fetch/functional.toml
rename to tests/patent-fetch/functional.toml
index a142dab..3b5f9c6 100644
--- a/agents/skill-bench/cases/patent-fetch/functional.toml
+++ b/tests/patent-fetch/functional.toml
@@ -10,25 +10,20 @@ Fetch patent details for US9152718B2.
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-fetch"
+command = { command = "skill-loaded", skill = "patent-fetch" }
 
 [[checks]]
 name = "patent_fetch_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-fetch"
+command = { command = "skill-invoked", skill = "patent-fetch" }
 
 [[checks]]
 name = "fetch_patent_mcp_called"
-type = "script"
-command = "check-mcp-success.sh fetch_patent"
+command = { command = "mcp-success", tool = "fetch_patent" }
 
 [[checks]]
 name = "patent_id_parameter_used"
-type = "script"
-command = "check-param.sh fetch_patent patent_id US9152718B2"
+command = { command = "tool-use", tool = "fetch_patent", param = "patent_id", value = "US9152718B2" }
diff --git a/agents/skill-bench/cases/patent-fetch/triggering.toml b/tests/patent-fetch/triggering.toml
similarity index 81%
rename from agents/skill-bench/cases/patent-fetch/triggering.toml
rename to tests/patent-fetch/triggering.toml
index a06c200..93886e9 100644
--- a/agents/skill-bench/cases/patent-fetch/triggering.toml
+++ b/tests/patent-fetch/triggering.toml
@@ -10,5 +10,4 @@ I need to get detailed information for patent US1234567B2.
 
 [[checks]]
 name = "patent_fetch_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-fetch"
+command = { command = "skill-invoked", skill = "patent-fetch" }
diff --git a/agents/skill-bench/cases/patent-search/functional-filing.toml b/tests/patent-search/functional-filing.toml
similarity index 53%
rename from agents/skill-bench/cases/patent-search/functional-filing.toml
rename to tests/patent-search/functional-filing.toml
index ffd7070..b12d490 100644
--- a/agents/skill-bench/cases/patent-search/functional-filing.toml
+++ b/tests/patent-search/functional-filing.toml
@@ -10,30 +10,24 @@ Search for patents about "ai" with filing date between 2023-01-01 and 2023-12-31
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-search"
+command = { command = "skill-loaded", skill = "patent-search" }
 
 [[checks]]
 name = "patent_search_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-search"
+command = { command = "skill-invoked", skill = "patent-search" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
 
 [[checks]]
 name = "filing_after_parameter_used"
-type = "script"
-command = "check-param.sh search_patents filing_after 2023-01-01"
+command = { command = "tool-use", tool = "search_patents", param = "filing_after", value = "2023-01-01" }
 
 [[checks]]
 name = "filing_before_parameter_used"
-type = "script"
-command = "check-param.sh search_patents filing_before 2023-12-31"
+command = { command = "tool-use", tool = "search_patents", param = "filing_before", value = "2023-12-31" }
diff --git a/agents/skill-bench/cases/patent-search/functional-priority.toml b/tests/patent-search/functional-priority.toml
similarity index 53%
rename from agents/skill-bench/cases/patent-search/functional-priority.toml
rename to tests/patent-search/functional-priority.toml
index 664a0ea..8457d86 100644
--- a/agents/skill-bench/cases/patent-search/functional-priority.toml
+++ b/tests/patent-search/functional-priority.toml
@@ -10,30 +10,24 @@ Search for patents about "ai" with priority date between 2023-01-01 and 2023-12-
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-search"
+command = { command = "skill-loaded", skill = "patent-search" }
 
 [[checks]]
 name = "patent_search_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-search"
+command = { command = "skill-invoked", skill = "patent-search" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
 
 [[checks]]
 name = "priority_after_parameter_used"
-type = "script"
-command = "check-param.sh search_patents priority_after 2023-01-01"
+command = { command = "tool-use", tool = "search_patents", param = "priority_after", value = "2023-01-01" }
 
 [[checks]]
 name = "priority_before_parameter_used"
-type = "script"
-command = "check-param.sh search_patents priority_before 2023-12-31"
+command = { command = "tool-use", tool = "search_patents", param = "priority_before", value = "2023-12-31" }
diff --git a/agents/skill-bench/cases/patent-search/functional-publication.toml b/tests/patent-search/functional-publication.toml
similarity index 54%
rename from agents/skill-bench/cases/patent-search/functional-publication.toml
rename to tests/patent-search/functional-publication.toml
index f906c68..b39a5d2 100644
--- a/agents/skill-bench/cases/patent-search/functional-publication.toml
+++ b/tests/patent-search/functional-publication.toml
@@ -10,30 +10,24 @@ Search for patents about "ai" with publication date between 2023-01-01 and 2023-
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-search"
+command = { command = "skill-loaded", skill = "patent-search" }
 
 [[checks]]
 name = "patent_search_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-search"
+command = { command = "skill-invoked", skill = "patent-search" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
 
 [[checks]]
 name = "publication_after_parameter_used"
-type = "script"
-command = "check-param.sh search_patents publication_after 2023-01-01"
+command = { command = "tool-use", tool = "search_patents", param = "publication_after", value = "2023-01-01" }
 
 [[checks]]
 name = "publication_before_parameter_used"
-type = "script"
-command = "check-param.sh search_patents publication_before 2023-12-31"
+command = { command = "tool-use", tool = "search_patents", param = "publication_before", value = "2023-12-31" }
diff --git a/agents/skill-bench/cases/patent-search/functional.toml b/tests/patent-search/functional.toml
similarity index 61%
rename from agents/skill-bench/cases/patent-search/functional.toml
rename to tests/patent-search/functional.toml
index aee9b84..449358b 100644
--- a/agents/skill-bench/cases/patent-search/functional.toml
+++ b/tests/patent-search/functional.toml
@@ -10,20 +10,16 @@ I want to search for patents about machine learning, show me 5 results.
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-search"
+command = { command = "skill-loaded", skill = "patent-search" }
 
 [[checks]]
 name = "patent_search_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-search"
+command = { command = "skill-invoked", skill = "patent-search" }
 
 [[checks]]
 name = "search_patents_mcp_called"
-type = "script"
-command = "check-mcp-success.sh search_patents"
+command = { command = "mcp-success", tool = "search_patents" }
diff --git a/agents/skill-bench/cases/patent-search/triggering.toml b/tests/patent-search/triggering.toml
similarity index 64%
rename from agents/skill-bench/cases/patent-search/triggering.toml
rename to tests/patent-search/triggering.toml
index 4badf47..4b01efa 100644
--- a/agents/skill-bench/cases/patent-search/triggering.toml
+++ b/tests/patent-search/triggering.toml
@@ -10,15 +10,12 @@ I need to search for patents about artificial intelligence.
 
 [[checks]]
 name = "mcp_server_loaded"
-type = "script"
-command = "check-mcp-loaded.sh google-patent-cli"
+command = { command = "mcp-loaded", server = "google-patent-cli" }
 
 [[checks]]
 name = "skill_loaded"
-type = "script"
-command = "check-skill-loaded.sh patent-search"
+command = { command = "skill-loaded", skill = "patent-search" }
 
 [[checks]]
 name = "patent_search_skill_invoked"
-type = "script"
-command = "check-skill-invoked.sh patent-search"
+command = { command = "skill-invoked", skill = "patent-search" }