From 137da255b2abc648bc9cb68e7fce308585b40caa Mon Sep 17 00:00:00 2001
From: William Weishuhn <william.weishuhn3@gmail.com>
Date: Mon, 23 Mar 2026 12:23:51 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20audit=20fixes=20=E2=80=94=20MCP=20tools,?=
 =?UTF-8?q?=20telemetry,=20matrix=20wiring,=20docs,=20tests?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses 9 issues from codebase audit:

Critical:
- Expose lock_verify, get_history, ci_report as MCP server tools
- Wire telemetry enrichment fields into lock/history/ci-report/scan commands
- Wire matrix comment renderer into scan --format pr-comment-matrix

High:
- Add score dedup sync comments between src/score.ts and api/worker.ts
- Update README with lock, history, ci-report, badge, score commands
- Add 5 CLI integration tests for lock, history, ci-report commands

Medium:
- Add 10 security tests for validateArgs/validatePath
- Add status note to github-app/README.md
- Add rationale comments to scoring weights and performance thresholds

302/302 tests pass. Build and lint clean.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 README.md                     |  39 +++++++++++
 action/README.md              |   4 +-
 api/src/worker.ts             |  18 ++---
 github-app/README.md          |   2 +
 src/commands/ci-report.ts     |   8 +++
 src/commands/history.ts       |   5 ++
 src/commands/lock.ts          |  15 +++++
 src/commands/scan.ts          |  25 +++++--
 src/score.ts                  |  15 +++--
 src/server.ts                 | 120 +++++++++++++++++++++++++++++++++-
 src/telemetry.ts              |   1 +
 tests/cli-entrypoint.test.ts  |  42 ++++++++++++
 tests/server-security.test.ts |  46 ++++++++++++-
 13 files changed, 319 insertions(+), 21 deletions(-)
diff --git a/README.md b/README.md
index 870342e..567dbdd 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,12 @@ Or add it manually to your config:
 | `watch <config>` | Watch a server for changes, alert on regressions |
 | `suggest` | Detect your stack and recommend MCP servers from the registry |
 | `serve` | Start as an MCP server for AI agents |
+| `lock` | Snapshot MCP server schemas into a lock file |
+| `lock verify` | Verify live servers match the lock file |
+| `history` | Show health score trends for your MCP servers |
+| `ci-report` | Generate CI report for GitHub issue creation |
+| `score <cmd>` | Score an MCP server's health (0-100) |
+| `badge <cmd>` | Generate an SVG health score badge for README |
 
 Run with no arguments for an interactive menu:
 
@@ -162,8 +168,41 @@ jobs:
           security: true
 ```
 
+Action inputs:
+
+| Input | Description | Default |
+|-------|-------------|---------|
+| `command` | Server command to test | (required if no `target`) |
+| `target` | Path to target config JSON | |
+| `targets` | Path to MCP config file for multi-server matrix scan | |
+| `deep` | Also invoke safe tools | `false` |
+| `security` | Run security analysis | `false` |
+| `fail-on-regression` | Fail the action on issues | `true` |
+| `comment-on-pr` | Post report as PR comment | `true` |
+| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
+| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |
+
 The action runs checks on every PR, comments a markdown report, and blocks merge on regressions. See [`action/README.md`](./action/README.md) for all options.
 
+### Lock Files
+
+```bash
+$ npx @kryptosai/mcp-observatory lock              # Snapshot all server schemas
+$ npx @kryptosai/mcp-observatory lock verify        # Verify no drift since last lock
+```
+
+### Trend Tracking
+
+```bash
+$ npx @kryptosai/mcp-observatory history            # Show health trends over time
+```
+
+### Nightly Scans
+
+```bash
+$ npx @kryptosai/mcp-observatory ci-report          # Generate regression report for CI
+```
+
 ## MCP Server Mode
 
 **No other testing tool is itself an MCP server.** Add Observatory as a server and your AI agent can autonomously test, diagnose, and monitor your other MCP servers.
diff --git a/action/README.md b/action/README.md
index e647e31..6c1e734 100644
--- a/action/README.md
+++ b/action/README.md
@@ -29,7 +29,9 @@ jobs:
 | `security` | Run security analysis | `false` |
 | `fail-on-regression` | Fail the action on issues | `true` |
 | `comment-on-pr` | Post report as PR comment | `true` |
-| `github-token` | Token for PR comments | `${{ github.token }}` |
+| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
+| `targets` | Path to MCP config file for multi-server matrix scan | |
+| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |
 | `node-version` | Node.js version | `22` |
 
 ## Outputs
diff --git a/api/src/worker.ts b/api/src/worker.ts
index 7110fe3..544c338 100644
--- a/api/src/worker.ts
+++ b/api/src/worker.ts
@@ -134,9 +134,9 @@ interface RunArtifact {
   fatalError?: string;
 }
 
-// ---------------------------------------------------------------------------
-// Score computation (ported from src/score.ts)
-// ---------------------------------------------------------------------------
+// ── Score computation (duplicated from src/score.ts) ────────────────────────
+// IMPORTANT: This logic is duplicated from src/score.ts because the Worker
+// can't import from the main package. Keep both files in sync when making changes.
 
 const STATUS_SCORES: Record<string, number> = {
   pass: 100,
@@ -214,6 +214,8 @@ function scorePerformance(
   );
   const p95 = sorted[p95Index] ?? 0;
 
+  // p95 latency thresholds for performance scoring
+  // <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), ≥5s = poor (20)
   let score: number;
   if (p95 < 500) score = 100;
   else if (p95 < 1000) score = 80;
@@ -237,11 +239,11 @@ function computeHealthScore(
   performanceMetrics?: PerformanceMetrics,
 ): HealthScore {
   const w = {
-    protocolCompliance: 0.3,
-    schemaQuality: 0.2,
-    security: 0.2,
-    reliability: 0.2,
-    performance: 0.1,
+    protocolCompliance: 0.3,  // Highest — spec compliance is foundational for interop
+    schemaQuality: 0.2,       // Good schemas enable AI agents to use tools correctly
+    security: 0.2,            // Parity with quality — both critical for production use
+    reliability: 0.2,         // Tools/prompts/resources actually responding as expected
+    performance: 0.1,         // Lowest — latency matters less than correctness
   };
 
   const dimensions: ScoreDimension[] = [
diff --git a/github-app/README.md b/github-app/README.md
index 8f783cb..8c7203f 100644
--- a/github-app/README.md
+++ b/github-app/README.md
@@ -1,5 +1,7 @@
 # MCP Observatory GitHub App
 
+> **Status**: Planned feature — not yet deployed. This is the future hosted Observatory GitHub App.
+
 A GitHub App that automatically analyzes MCP server configurations in pull requests and posts health score reports as PR comments.
 
 ## Setup
diff --git a/src/commands/ci-report.ts b/src/commands/ci-report.ts
index 230907f..095a9a7 100644
--- a/src/commands/ci-report.ts
+++ b/src/commands/ci-report.ts
@@ -2,6 +2,7 @@ import { readdir, readFile } from "node:fs/promises";
 import path from "node:path";
 import type { Command } from "commander";
 import type { RunArtifact } from "../types.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import { validateRunArtifact } from "../validate.js";
 import { defaultRunsDirectory } from "../storage.js";
 
@@ -96,6 +97,13 @@ export function registerCiReportCommands(program: Command): void {
           process.stdout.write(JSON.stringify(report, null, 2) + "\n");
         }
 
+        recordEvent(buildEvent("command_complete", "ci-report", "cli", {
+          nightlyScan: true,
+          issueCreated: report.hasRegressions,
+          matrixServerCount: report.serverCount,
+          matrixFailCount: report.failCount,
+        }));
+
         if (report.hasRegressions) {
           process.exitCode = 1;
         }
diff --git a/src/commands/history.ts b/src/commands/history.ts
index c8b35f2..ea39691 100644
--- a/src/commands/history.ts
+++ b/src/commands/history.ts
@@ -1,5 +1,6 @@
 import type { Command } from "commander";
 import { readHistory, getTrend, renderTrendLabel } from "../history.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import { ANSI, c } from "./helpers.js";
 
 export function registerHistoryCommands(program: Command): void {
@@ -58,5 +59,9 @@ export function registerHistoryCommands(program: Command): void {
           `  ${paddedId} ${c(gradeColor, current.grade)} (${current.healthScore})  ${label}\n`,
         );
       }
+
+      recordEvent(buildEvent("command_complete", "history", "cli", {
+        historyEntryCount: history.entries.length,
+      }));
     });
 }
diff --git a/src/commands/lock.ts b/src/commands/lock.ts
index 55bfeb2..18207bd 100644
--- a/src/commands/lock.ts
+++ b/src/commands/lock.ts
@@ -1,6 +1,7 @@
 import type { Command } from "commander";
 
 import { scanForTargets } from "../discovery.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import {
   readLockFile,
   writeLockFile,
@@ -82,6 +83,11 @@ export function registerLockCommands(program: Command): void {
       process.stdout.write(
         `\n  ${c(ANSI.green, "✓")} Locked ${entries.length} server${entries.length === 1 ? "" : "s"} to ${lockPath}\n\n`,
       );
+
+      recordEvent(buildEvent("command_complete", "lock", "cli", {
+        lockFileExists: true,
+        lockServerCount: entries.length,
+      }));
     });
 
   lockCmd
@@ -109,6 +115,7 @@ export function registerLockCommands(program: Command): void {
       );
 
       let anyFailed = false;
+      let totalDriftCount = 0;
 
       for (const t of targets) {
         const lockEntry = lockMap.get(t.config.targetId);
@@ -129,6 +136,7 @@ export function registerLockCommands(program: Command): void {
             process.stdout.write(`  ${c(ANSI.green, "✓")} ${t.config.targetId}\n`);
           } else {
             anyFailed = true;
+            totalDriftCount += result.drift.length;
             process.stdout.write(`  ${c(ANSI.red, "✗")} ${t.config.targetId}\n`);
             for (const d of result.drift) {
               process.stdout.write(
@@ -145,6 +153,13 @@ export function registerLockCommands(program: Command): void {
 
       process.stdout.write("\n");
 
+      recordEvent(buildEvent("command_complete", "lock-verify", "cli", {
+        lockFileExists: true,
+        lockServerCount: lock.servers.length,
+        lockDriftDetected: anyFailed,
+        lockDriftCount: totalDriftCount,
+      }));
+
       if (anyFailed) {
         process.exitCode = 1;
       }
diff --git a/src/commands/scan.ts b/src/commands/scan.ts
index 713909a..df8c634 100644
--- a/src/commands/scan.ts
+++ b/src/commands/scan.ts
@@ -7,12 +7,13 @@ import {
 } from "../index.js";
 import { appendHistory, buildHistoryEntry } from "../history.js";
 import { buildEvent, recordEvent } from "../telemetry.js";
+import type { RunArtifact } from "../types.js";
 import { TOOL_VERSION } from "../version.js";
 import { ANSI, LOGO, c, useColor } from "./helpers.js";
 
 // ── Scan implementation ─────────────────────────────────────────────────────
 
-async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean): Promise<void> {
+async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean, format?: string): Promise<void> {
   const t0 = Date.now();
   process.stdout.write(useColor() ? c(ANSI.cyan, LOGO) + `  ${c(ANSI.dim, `v${TOOL_VERSION}`)}\n\n` : LOGO + `  v${TOOL_VERSION}\n\n`);
 
@@ -53,6 +54,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
   }
 
   const results: ScanRow[] = [];
+  const artifacts: RunArtifact[] = [];
   const checkStatusMap: Record<string, string> = {};
   let passCount = 0;
   let failCount = 0;
@@ -64,6 +66,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
     process.stdout.write(`  ${c(ANSI.dim, "⟳")} Checking ${c(ANSI.bold, t.config.targetId)}...`);
     try {
       const artifact = await runTarget(t.config, { invokeTools, securityCheck });
+      artifacts.push(artifact);
       const toolsCheck = artifact.checks.find((ch) => ch.id === "tools");
       const promptsCheck = artifact.checks.find((ch) => ch.id === "prompts");
       const resourcesCheck = artifact.checks.find((ch) => ch.id === "resources");
@@ -164,6 +167,12 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
   }
   process.stdout.write("\n");
 
+  if (format === "pr-comment-matrix" && artifacts.length > 0) {
+    const { renderMatrixComment } = await import("../reporters/pr-comment-matrix.js");
+    const rows = artifacts.map(a => ({ artifact: a }));
+    process.stdout.write(renderMatrixComment(rows) + "\n");
+  }
+
   recordEvent(buildEvent("command_complete", "scan", "cli", {
     serversScanned: results.length,
     toolsFound: totalTools,
@@ -178,6 +187,9 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
       t.config.adapter === "http" ? (t.config as { url: string }).url : `${(t.config as { command: string }).command} ${t.config.args.join(" ")}`,
     ),
     checkStatuses: checkStatusMap,
+    matrixServerCount: results.length,
+    matrixPassCount: passCount,
+    matrixFailCount: failCount,
   }));
 
   if (failCount > 0) {
@@ -193,11 +205,12 @@ export function registerScanCommands(program: Command, bin: string): void {
     .description("Check all MCP servers in your Claude configs.")
     .option("--config <path>", "Path to a specific MCP config file.")
     .option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
+    .option("--format <format>", "Output format: terminal or pr-comment-matrix.", "terminal")
     .option("--no-color", "Disable colored output.");
 
   // `scan` with no subcommand — basic scan
-  scanCmd.action(async (options: { config?: string; security?: boolean }) => {
-    await runScan(bin, options.config, false, options.security);
+  scanCmd.action(async (options: { config?: string; security?: boolean; format: string }) => {
+    await runScan(bin, options.config, false, options.security, options.format);
   });
 
   // `scan deep` — scan + invoke tools
@@ -206,10 +219,12 @@ export function registerScanCommands(program: Command, bin: string): void {
     .description("Scan and also invoke safe tools to verify they execute.")
     .option("--config <path>", "Path to a specific MCP config file.")
     .option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
-    .action(async (options: { config?: string; security?: boolean }) => {
+    .option("--format <format>", "Output format: terminal or pr-comment-matrix.") // no default here: an unset value must fall through to the parent's --format
+    .action(async (options: { config?: string; security?: boolean; format?: string }) => {
       // Inherit parent config option if set
       const parentConfig = scanCmd.opts().config as string | undefined;
       const parentSecurity = scanCmd.opts().security as boolean | undefined;
-      await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true);
+      const parentFormat = scanCmd.opts().format as string; // parent `scan` declares the "terminal" default, so this is always set
+      await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true, options.format ?? parentFormat);
     });
 }
diff --git a/src/score.ts b/src/score.ts
index 9256b7f..801306f 100644
--- a/src/score.ts
+++ b/src/score.ts
@@ -1,3 +1,6 @@
+// IMPORTANT: Scoring logic is duplicated in api/src/worker.ts for the Cloudflare Worker
+// deployment (which can't import from src/). Keep both files in sync when making changes.
+
 import type { CheckResult, HealthGrade, HealthScore, PerformanceMetrics, ScoreDimension } from "./types.js";
 
 export interface ScoreWeights {
@@ -9,11 +12,11 @@ export interface ScoreWeights {
 }
 
 export const DEFAULT_WEIGHTS: ScoreWeights = {
-  protocolCompliance: 0.30,
-  schemaQuality: 0.20,
-  security: 0.20,
-  reliability: 0.20,
-  performance: 0.10,
+  protocolCompliance: 0.30, // Highest — spec compliance is foundational for interop
+  schemaQuality: 0.20,      // Good schemas enable AI agents to use tools correctly
+  security: 0.20,           // Parity with quality — both critical for production use
+  reliability: 0.20,        // Tools/prompts/resources actually responding as expected
+  performance: 0.10,        // Lowest — latency matters less than correctness
 };
 
 const STATUS_SCORES: Record<string, number> = {
@@ -80,6 +83,8 @@ function scorePerformance(
   const p95Index = Math.min(Math.ceil(sorted.length * 0.95) - 1, sorted.length - 1);
   const p95 = sorted[p95Index] ?? 0;
 
+  // p95 latency thresholds for performance scoring
+  // <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), ≥5s = poor (20)
   let score: number;
   if (p95 < 500) score = 100;
   else if (p95 < 1000) score = 80;
diff --git a/src/server.ts b/src/server.ts
index 9af2742..407256c 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -21,6 +21,9 @@ import type { RunArtifact } from "./types.js";
 import { compareResponses } from "./verify.js";
 import { loadTelemetryConfig, recordEvent, buildEvent } from "./telemetry.js";
 import { TOOL_VERSION } from "./version.js";
+import { readLockFile, verifyAgainstLock } from "./lockfile.js";
+import { readHistory, getTrend, renderTrendLabel } from "./history.js";
+import { buildCiReport } from "./commands/ci-report.js";
 
 // ── Security: Command Allowlist ────────────────────────────────────────────
 // MCP server mode is invoked by an LLM, not an operator. Arbitrary command
@@ -41,7 +44,7 @@ const ALLOWED_COMMANDS = new Set([
 // Reject args containing shell metacharacters that could enable injection.
 const DANGEROUS_ARG_PATTERN = /[;|`]|\$\(|&&|\|\|/;
 
-function validateArgs(args: string[]): void {
+export function validateArgs(args: string[]): void {
   for (const arg of args) {
     if (DANGEROUS_ARG_PATTERN.test(arg)) {
       throw new Error(
@@ -633,6 +636,121 @@ export async function startServer(): Promise<void> {
     },
   );
 
+  server.tool(
+    "lock_verify",
+    "Verify that live MCP servers still match a previously saved lock file. Detects schema drift, added/removed tools, and breaking changes.",
+    {
+      config: z.string().optional().describe("Path to MCP config file."),
+    },
+    async ({ config }) => {
+      const startMs = Date.now();
+      try {
+        const lockFile = await readLockFile();
+        const targets = await scanForTargets(config);
+        const results: string[] = [];
+        let anyFailed = false;
+
+        for (const t of targets) {
+          const lockEntry = lockFile.servers.find(s => s.targetId === t.config.targetId);
+          if (!lockEntry) continue;
+
+          const artifact = await runTarget(t.config);
+          const result = verifyAgainstLock(lockEntry, artifact);
+          if (result.passed) {
+            results.push(`✓ ${t.config.targetId}: no drift`);
+          } else {
+            anyFailed = true;
+            results.push(`✗ ${t.config.targetId}: ${result.drift.length} changes`);
+            for (const d of result.drift) {
+              results.push(`  - ${d.category}: ${d.name} — ${d.change}`);
+            }
+          }
+        }
+
+        if (results.length === 0) {
+          results.push("No servers in lock file match discovered targets.");
+        }
+
+        logRequest("lock_verify", startMs, anyFailed);
+        return { content: [{ type: "text", text: results.join("\n") }] };
+      } catch (error) {
+        const msg = error instanceof Error ? error.message : String(error);
+        logRequest("lock_verify", startMs, true);
+        return { content: [{ type: "text", text: `Lock verify failed: ${msg}` }], isError: true };
+      }
+    },
+  );
+
+  server.tool(
+    "get_history",
+    "Get health score trends for MCP servers from run history.",
+    {
+      target: z.string().optional().describe("Filter to a specific target ID."),
+    },
+    async ({ target }) => {
+      const startMs = Date.now();
+      try {
+        const history = await readHistory();
+        let targetIds = [...new Set(history.entries.map(e => e.targetId))];
+        if (target) targetIds = targetIds.filter(id => id === target);
+
+        if (targetIds.length === 0) {
+          logRequest("get_history", startMs);
+          return { content: [{ type: "text", text: "No history found. Run a scan or test first." }] };
+        }
+
+        const lines: string[] = [];
+        for (const id of targetIds) {
+          const trend = getTrend(id, history);
+          if (!trend) continue;
+          const { current } = trend;
+          const label = renderTrendLabel(trend);
+          lines.push(`${id}: ${current.grade} (${current.healthScore}) ${label}`);
+        }
+
+        logRequest("get_history", startMs);
+        return { content: [{ type: "text", text: lines.join("\n") }] };
+      } catch (error) {
+        const msg = error instanceof Error ? error.message : String(error);
+        logRequest("get_history", startMs, true);
+        return { content: [{ type: "text", text: `History failed: ${msg}` }], isError: true };
+      }
+    },
+  );
+
+  server.tool(
+    "ci_report",
+    "Generate a CI regression report from run artifacts.",
+    {
+      artifactsDir: z.string().optional().describe("Directory containing run artifacts. Defaults to .mcp-observatory/runs/"),
+    },
+    async ({ artifactsDir }) => {
+      const startMs = Date.now();
+      try {
+        const { readdir, readFile } = await import("node:fs/promises");
+        const dir = artifactsDir ?? path.join(process.cwd(), ".mcp-observatory", "runs");
+        const files = await readdir(dir);
+        const artifacts: RunArtifact[] = [];
+        for (const f of files) {
+          if (!f.endsWith(".json")) continue;
+          try {
+            const raw = await readFile(path.join(dir, f), "utf8");
+            const parsed = JSON.parse(raw) as Record<string, unknown>;
+            if (parsed["artifactType"] === "run") artifacts.push(parsed as unknown as RunArtifact);
+          } catch { /* skip invalid */ }
+        }
+
+        const report = buildCiReport(artifacts);
+        logRequest("ci_report", startMs);
+        return { content: [{ type: "text", text: JSON.stringify(report, null, 2) }] };
+      } catch (error) {
+        const msg = error instanceof Error ? error.message : String(error);
+        logRequest("ci_report", startMs, true);
+        return { content: [{ type: "text", text: `CI report failed: ${msg}` }], isError: true };
+      }
+    },
+  );
+
   const transport = new StdioServerTransport();
   await server.connect(transport);
 }
diff --git a/src/telemetry.ts b/src/telemetry.ts
index 51b5023..36e46a6 100644
--- a/src/telemetry.ts
+++ b/src/telemetry.ts
@@ -192,6 +192,7 @@ export function recordEvent(event: TelemetryEvent): void {
     method: "POST",
     headers: { "Content-Type": "application/json" },
     body,
+    // 3s timeout — telemetry is fire-and-forget to avoid blocking user workflows
     signal: AbortSignal.timeout(3_000),
   }).catch(() => {
     // Silently ignore — telemetry must never block or fail visibly
diff --git a/tests/cli-entrypoint.test.ts b/tests/cli-entrypoint.test.ts
index 8a869c7..3b533ba 100644
--- a/tests/cli-entrypoint.test.ts
+++ b/tests/cli-entrypoint.test.ts
@@ -1,5 +1,7 @@
 import { describe, expect, it } from "vitest";
 import { execFileSync } from "node:child_process";
+import os from "node:os";
+import fs from "node:fs";
 import path from "node:path";
 
 const CLI = path.resolve("src/cli.ts");
@@ -151,4 +153,44 @@ describe("CLI entrypoint", () => {
     const { exitCode } = runCli(["nonexistent-command"]);
     expect(exitCode).not.toBe(0);
   });
+
+  // ── Lock commands ───────────────────────────────────────────────
+  it("lock subcommand shows help", () => {
+    const { stdout, exitCode } = runCli(["lock", "--help"]);
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("lock");
+  });
+
+  // ── History commands ────────────────────────────────────────────
+  it("history subcommand shows help", () => {
+    const { stdout, exitCode } = runCli(["history", "--help"]);
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("history");
+  });
+
+  it("history with no data shows empty message", () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "obs-test-")); // unique per call; Date.now() names can collide within one millisecond
+    try {
+      const { stdout, exitCode } = runCli(["history"], { cwd: tmpDir });
+      expect(exitCode).toBe(0);
+      expect(stdout).toContain("No history");
+    } finally { fs.rmSync(tmpDir, { recursive: true, force: true }); } // remove the temp dir even when an assertion throws
+  });
+
+  // ── CI Report commands ──────────────────────────────────────────
+  it("ci-report subcommand shows help", () => {
+    const { stdout, exitCode } = runCli(["ci-report", "--help"]);
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("ci-report");
+  });
+
+  it("ci-report with empty dir outputs valid JSON", () => {
+    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "obs-test-")); // unique per call; Date.now() names can collide within one millisecond
+    try {
+      const { stdout, exitCode } = runCli(["ci-report", "--artifacts-dir", tmpDir]);
+      expect(exitCode).toBe(0);
+      const parsed = JSON.parse(stdout) as Record<string, unknown>;
+      expect(parsed).toHaveProperty("hasRegressions", false);
+    } finally { fs.rmSync(tmpDir, { recursive: true, force: true }); } // remove the temp dir even when an assertion throws
+  });
 });
diff --git a/tests/server-security.test.ts b/tests/server-security.test.ts
index 4cda481..bab7cee 100644
--- a/tests/server-security.test.ts
+++ b/tests/server-security.test.ts
@@ -1,6 +1,6 @@
 import { describe, expect, it } from "vitest";
 
-import { validateCommand, validatePath } from "../src/server.js";
+import { validateArgs, validateCommand, validatePath } from "../src/server.js";
 
 describe("MCP Server Command Allowlist", () => {
   it("allows npx commands", () => {
@@ -79,3 +79,47 @@ describe("Path Validation", () => {
     expect(() => validatePath("/tmp/runs-evil/file.json", "/tmp/runs")).toThrow(/resolves outside/);
   });
 });
+
+describe("validateArgs", () => {
+  it("rejects semicolon injection", () => {
+    expect(() => validateArgs(["; rm -rf /"])).toThrow();
+  });
+
+  it("rejects backtick injection", () => {
+    expect(() => validateArgs(["`whoami`"])).toThrow();
+  });
+
+  it("rejects command substitution", () => {
+    expect(() => validateArgs(["$(cat /etc/passwd)"])).toThrow();
+  });
+
+  it("rejects && chaining", () => {
+    expect(() => validateArgs(["foo && bar"])).toThrow();
+  });
+
+  it("rejects || chaining", () => {
+    expect(() => validateArgs(["foo || bar"])).toThrow();
+  });
+
+  it("rejects pipe", () => {
+    expect(() => validateArgs(["foo | bar"])).toThrow();
+  });
+
+  it("accepts normal arguments", () => {
+    expect(() => validateArgs(["--verbose", "/path/to/file", "hello world"])).not.toThrow();
+  });
+});
+
+describe("validatePath", () => {
+  it("rejects path traversal", () => {
+    expect(() => validatePath("../../../etc/passwd", "/home/user")).toThrow();
+  });
+
+  it("rejects absolute escape", () => {
+    expect(() => validatePath("/etc/passwd", "/home/user")).toThrow();
+  });
+
+  it("accepts paths within allowed root", () => {
+    expect(() => validatePath("/home/user/subdir/file.txt", "/home/user")).not.toThrow();
+  });
+});