fix: audit fixes — 9 issues resolved (#82)

KryptosAI · claude · web-flow · commit e3d3c137eead · 2026-03-23T13:27:32.000-07:00
MCP server tools, telemetry wiring, matrix comment, README docs, CLI tests, security tests, magic number comments. 302/302 tests pass.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/README.md b/README.md
@@ -81,6 +81,12 @@ Or add it manually to your config:
 | `watch <config>` | Watch a server for changes, alert on regressions |
 | `suggest` | Detect your stack and recommend MCP servers from the registry |
 | `serve` | Start as an MCP server for AI agents |
+| `lock` | Snapshot MCP server schemas into a lock file |
+| `lock verify` | Verify live servers match the lock file |
+| `history` | Show health score trends for your MCP servers |
+| `ci-report` | Generate CI report for GitHub issue creation |
+| `score <cmd>` | Score an MCP server's health (0-100) |
+| `badge <cmd>` | Generate an SVG health score badge for README |
 
 Run with no arguments for an interactive menu:
 
@@ -162,8 +168,41 @@ jobs:
           security: true
 ```
 
+Action inputs:
+
+| Input | Description | Default |
+|-------|-------------|---------|
+| `command` | Server command to test | (required if no `target`) |
+| `target` | Path to target config JSON | |
+| `targets` | Path to MCP config file for multi-server matrix scan | |
+| `deep` | Also invoke safe tools | `false` |
+| `security` | Run security analysis | `false` |
+| `fail-on-regression` | Fail the action on issues | `true` |
+| `comment-on-pr` | Post report as PR comment | `true` |
+| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
+| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |
+
 The action runs checks on every PR, comments a markdown report, and blocks merge on regressions. See [`action/README.md`](./action/README.md) for all options.
 
+### Lock Files
+
+```bash
+$ npx @kryptosai/mcp-observatory lock              # Snapshot all server schemas
+$ npx @kryptosai/mcp-observatory lock verify        # Verify no drift since last lock
+```
+
+### Trend Tracking
+
+```bash
+$ npx @kryptosai/mcp-observatory history            # Show health trends over time
+```
+
+### Nightly Scans
+
+```bash
+$ npx @kryptosai/mcp-observatory ci-report          # Generate regression report for CI
+```
+
 ## MCP Server Mode
 
 **No other testing tool is itself an MCP server.** Add Observatory as a server and your AI agent can autonomously test, diagnose, and monitor your other MCP servers.
diff --git a/action/README.md b/action/README.md
@@ -29,7 +29,9 @@ jobs:
 | `security` | Run security analysis | `false` |
 | `fail-on-regression` | Fail the action on issues | `true` |
 | `comment-on-pr` | Post report as PR comment | `true` |
-| `github-token` | Token for PR comments | `${{ github.token }}` |
+| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
+| `targets` | Path to MCP config file for multi-server matrix scan | |
+| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |
 | `node-version` | Node.js version | `22` |
 
 ## Outputs
diff --git a/api/src/worker.ts b/api/src/worker.ts
@@ -134,9 +134,9 @@ interface RunArtifact {
   fatalError?: string;
 }
 
-// ---------------------------------------------------------------------------
-// Score computation (ported from src/score.ts)
-// ---------------------------------------------------------------------------
+// ── Score computation (duplicated from src/score.ts) ────────────────────────
+// IMPORTANT: This logic is duplicated from src/score.ts because the Worker
+// can't import from the main package. Keep both files in sync when making changes.
 
 const STATUS_SCORES: Record<string, number> = {
   pass: 100,
@@ -214,6 +214,8 @@ function scorePerformance(
   );
   const p95 = sorted[p95Index] ?? 0;
 
+  // p95 latency thresholds for performance scoring
+  // <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), >=5s = poor (20)
   let score: number;
   if (p95 < 500) score = 100;
   else if (p95 < 1000) score = 80;
@@ -237,11 +239,11 @@ function computeHealthScore(
   performanceMetrics?: PerformanceMetrics,
 ): HealthScore {
   const w = {
-    protocolCompliance: 0.3,
-    schemaQuality: 0.2,
-    security: 0.2,
-    reliability: 0.2,
-    performance: 0.1,
+    protocolCompliance: 0.3,  // Highest — spec compliance is foundational for interop
+    schemaQuality: 0.2,       // Good schemas enable AI agents to use tools correctly
+    security: 0.2,            // Parity with quality — both critical for production use
+    reliability: 0.2,         // Tools/prompts/resources actually responding as expected
+    performance: 0.1,         // Lowest — latency matters less than correctness
   };
 
   const dimensions: ScoreDimension[] = [
diff --git a/github-app/README.md b/github-app/README.md
@@ -1,5 +1,7 @@
 # MCP Observatory GitHub App
 
+> **Status**: Planned feature — not yet deployed. This is the future hosted Observatory GitHub App.
+
 A GitHub App that automatically analyzes MCP server configurations in pull requests and posts health score reports as PR comments.
 
 ## Setup
diff --git a/src/commands/ci-report.ts b/src/commands/ci-report.ts
@@ -2,6 +2,7 @@ import { readdir, readFile } from "node:fs/promises";
 import path from "node:path";
 import type { Command } from "commander";
 import type { RunArtifact } from "../types.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import { validateRunArtifact } from "../validate.js";
 import { defaultRunsDirectory } from "../storage.js";
 
@@ -96,6 +97,13 @@ export function registerCiReportCommands(program: Command): void {
           process.stdout.write(JSON.stringify(report, null, 2) + "\n");
         }
 
+        recordEvent(buildEvent("command_complete", "ci-report", "cli", {
+          nightlyScan: true,
+          issueCreated: report.hasRegressions,
+          matrixServerCount: report.serverCount,
+          matrixFailCount: report.failCount,
+        }));
+
         if (report.hasRegressions) {
           process.exitCode = 1;
         }
diff --git a/src/commands/history.ts b/src/commands/history.ts
@@ -1,5 +1,6 @@
 import type { Command } from "commander";
 import { readHistory, getTrend, renderTrendLabel } from "../history.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import { ANSI, c } from "./helpers.js";
 
 export function registerHistoryCommands(program: Command): void {
@@ -58,5 +59,9 @@ export function registerHistoryCommands(program: Command): void {
           `  ${paddedId} ${c(gradeColor, current.grade)} (${current.healthScore})  ${label}\n`,
         );
       }
+
+      recordEvent(buildEvent("command_complete", "history", "cli", {
+        historyEntryCount: history.entries.length,
+      }));
     });
 }
diff --git a/src/commands/lock.ts b/src/commands/lock.ts
@@ -1,6 +1,7 @@
 import type { Command } from "commander";
 
 import { scanForTargets } from "../discovery.js";
+import { buildEvent, recordEvent } from "../telemetry.js";
 import {
   readLockFile,
   writeLockFile,
@@ -82,6 +83,11 @@ export function registerLockCommands(program: Command): void {
       process.stdout.write(
         `\n  ${c(ANSI.green, "✓")} Locked ${entries.length} server${entries.length === 1 ? "" : "s"} to ${lockPath}\n\n`,
       );
+
+      recordEvent(buildEvent("command_complete", "lock", "cli", {
+        lockFileExists: true,
+        lockServerCount: entries.length,
+      }));
     });
 
   lockCmd
@@ -109,6 +115,7 @@ export function registerLockCommands(program: Command): void {
       );
 
       let anyFailed = false;
+      let totalDriftCount = 0;
 
       for (const t of targets) {
         const lockEntry = lockMap.get(t.config.targetId);
@@ -129,6 +136,7 @@ export function registerLockCommands(program: Command): void {
             process.stdout.write(`  ${c(ANSI.green, "✓")} ${t.config.targetId}\n`);
           } else {
             anyFailed = true;
+            totalDriftCount += result.drift.length;
             process.stdout.write(`  ${c(ANSI.red, "✗")} ${t.config.targetId}\n`);
             for (const d of result.drift) {
               process.stdout.write(
@@ -145,6 +153,13 @@ export function registerLockCommands(program: Command): void {
 
       process.stdout.write("\n");
 
+      recordEvent(buildEvent("command_complete", "lock-verify", "cli", {
+        lockFileExists: true,
+        lockServerCount: lock.servers.length,
+        lockDriftDetected: anyFailed,
+        lockDriftCount: totalDriftCount,
+      }));
+
       if (anyFailed) {
         process.exitCode = 1;
       }
diff --git a/src/commands/scan.ts b/src/commands/scan.ts
@@ -7,12 +7,13 @@ import {
 } from "../index.js";
 import { appendHistory, buildHistoryEntry } from "../history.js";
 import { buildEvent, recordEvent } from "../telemetry.js";
+import type { RunArtifact } from "../types.js";
 import { TOOL_VERSION } from "../version.js";
 import { ANSI, LOGO, c, useColor } from "./helpers.js";
 
 // ── Scan implementation ─────────────────────────────────────────────────────
 
-async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean): Promise<void> {
+async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean, format?: string): Promise<void> {
   const t0 = Date.now();
   process.stdout.write(useColor() ? c(ANSI.cyan, LOGO) + `  ${c(ANSI.dim, `v${TOOL_VERSION}`)}\n\n` : LOGO + `  v${TOOL_VERSION}\n\n`);
 
@@ -53,6 +54,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
   }
 
   const results: ScanRow[] = [];
+  const artifacts: RunArtifact[] = [];
   const checkStatusMap: Record<string, string> = {};
   let passCount = 0;
   let failCount = 0;
@@ -64,6 +66,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
     process.stdout.write(`  ${c(ANSI.dim, "⟳")} Checking ${c(ANSI.bold, t.config.targetId)}...`);
     try {
       const artifact = await runTarget(t.config, { invokeTools, securityCheck });
+      artifacts.push(artifact);
       const toolsCheck = artifact.checks.find((ch) => ch.id === "tools");
       const promptsCheck = artifact.checks.find((ch) => ch.id === "prompts");
       const resourcesCheck = artifact.checks.find((ch) => ch.id === "resources");
@@ -164,6 +167,12 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
   }
   process.stdout.write("\n");
 
+  if (format === "pr-comment-matrix" && artifacts.length > 0) {
+    const { renderMatrixComment } = await import("../reporters/pr-comment-matrix.js");
+    const rows = artifacts.map(a => ({ artifact: a }));
+    process.stdout.write(renderMatrixComment(rows) + "\n");
+  }
+
   recordEvent(buildEvent("command_complete", "scan", "cli", {
     serversScanned: results.length,
     toolsFound: totalTools,
@@ -178,6 +187,9 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
       t.config.adapter === "http" ? (t.config as { url: string }).url : `${(t.config as { command: string }).command} ${t.config.args.join(" ")}`,
     ),
     checkStatuses: checkStatusMap,
+    matrixServerCount: results.length,
+    matrixPassCount: passCount,
+    matrixFailCount: failCount,
   }));
 
   if (failCount > 0) {
@@ -193,11 +205,12 @@ export function registerScanCommands(program: Command, bin: string): void {
     .description("Check all MCP servers in your Claude configs.")
     .option("--config <path>", "Path to a specific MCP config file.")
     .option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
+    .option("--format <format>", "Output format: terminal or pr-comment-matrix.", "terminal")
     .option("--no-color", "Disable colored output.");
 
   // `scan` with no subcommand — basic scan
-  scanCmd.action(async (options: { config?: string; security?: boolean }) => {
-    await runScan(bin, options.config, false, options.security);
+  scanCmd.action(async (options: { config?: string; security?: boolean; format: string }) => {
+    await runScan(bin, options.config, false, options.security, options.format);
   });
 
   // `scan deep` — scan + invoke tools
@@ -206,10 +219,12 @@ export function registerScanCommands(program: Command, bin: string): void {
     .description("Scan and also invoke safe tools to verify they execute.")
     .option("--config <path>", "Path to a specific MCP config file.")
     .option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
-    .action(async (options: { config?: string; security?: boolean }) => {
+    .option("--format <format>", "Output format: terminal or pr-comment-matrix.", "terminal")
+    .action(async (options: { config?: string; security?: boolean; format: string }) => {
       // Inherit parent config option if set
       const parentConfig = scanCmd.opts().config as string | undefined;
       const parentSecurity = scanCmd.opts().security as boolean | undefined;
-      await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true);
+      const parentFormat = scanCmd.opts().format as string;
+      await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true, options.format ?? parentFormat);
     });
 }
diff --git a/src/score.ts b/src/score.ts
@@ -1,3 +1,6 @@
+// IMPORTANT: Scoring logic is duplicated in api/src/worker.ts for the Cloudflare Worker
+// deployment (which can't import from src/). Keep both files in sync when making changes.
+
 import type { CheckResult, HealthGrade, HealthScore, PerformanceMetrics, ScoreDimension } from "./types.js";
 
 export interface ScoreWeights {
@@ -9,11 +12,11 @@ export interface ScoreWeights {
 }
 
 export const DEFAULT_WEIGHTS: ScoreWeights = {
-  protocolCompliance: 0.30,
-  schemaQuality: 0.20,
-  security: 0.20,
-  reliability: 0.20,
-  performance: 0.10,
+  protocolCompliance: 0.30, // Highest — spec compliance is foundational for interop
+  schemaQuality: 0.20,      // Good schemas enable AI agents to use tools correctly
+  security: 0.20,           // Parity with quality — both critical for production use
+  reliability: 0.20,        // Tools/prompts/resources actually responding as expected
+  performance: 0.10,        // Lowest — latency matters less than correctness
 };
 
 const STATUS_SCORES: Record<string, number> = {
@@ -80,6 +83,8 @@ function scorePerformance(
   const p95Index = Math.min(Math.ceil(sorted.length * 0.95) - 1, sorted.length - 1);
   const p95 = sorted[p95Index] ?? 0;
 
+  // p95 latency thresholds for performance scoring
+  // <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), >=5s = poor (20)
   let score: number;
   if (p95 < 500) score = 100;
   else if (p95 < 1000) score = 80;
diff --git a/src/server.ts b/src/server.ts
diff --git a/src/telemetry.ts b/src/telemetry.ts
diff --git a/tests/cli-entrypoint.test.ts b/tests/cli-entrypoint.test.ts
diff --git a/tests/server-security.test.ts b/tests/server-security.test.ts