Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ Or add it manually to your config:
| `watch <config>` | Watch a server for changes, alert on regressions |
| `suggest` | Detect your stack and recommend MCP servers from the registry |
| `serve` | Start as an MCP server for AI agents |
| `lock` | Snapshot MCP server schemas into a lock file |
| `lock verify` | Verify live servers match the lock file |
| `history` | Show health score trends for your MCP servers |
| `ci-report` | Generate CI report for GitHub issue creation |
| `score <cmd>` | Score an MCP server's health (0-100) |
| `badge <cmd>` | Generate an SVG health score badge for README |

Run with no arguments for an interactive menu:

Expand Down Expand Up @@ -162,8 +168,41 @@ jobs:
security: true
```

Action inputs:

| Input | Description | Default |
|-------|-------------|---------|
| `command` | Server command to test | (required if no `target`) |
| `target` | Path to target config JSON | |
| `targets` | Path to MCP config file for multi-server matrix scan | |
| `deep` | Also invoke safe tools | `false` |
| `security` | Run security analysis | `false` |
| `fail-on-regression` | Fail the action on issues | `true` |
| `comment-on-pr` | Post report as PR comment | `true` |
| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |

The action runs checks on every PR, posts a Markdown report as a PR comment, and blocks the merge on regressions. See [`action/README.md`](./action/README.md) for all options.

### Lock Files

```bash
$ npx @kryptosai/mcp-observatory lock # Snapshot all server schemas
$ npx @kryptosai/mcp-observatory lock verify # Verify no drift since last lock
```

### Trend Tracking

```bash
$ npx @kryptosai/mcp-observatory history # Show health trends over time
```

### Nightly Scans

```bash
$ npx @kryptosai/mcp-observatory ci-report # Generate regression report for CI
```

## MCP Server Mode

**No other testing tool is itself an MCP server.** Add Observatory as a server and your AI agent can autonomously test, diagnose, and monitor your other MCP servers.
Expand Down
4 changes: 3 additions & 1 deletion action/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ jobs:
| `security` | Run security analysis | `false` |
| `fail-on-regression` | Fail the action on issues | `true` |
| `comment-on-pr` | Post report as PR comment | `true` |
| `github-token` | Token for PR comments | `${{ github.token }}` |
| `set-status` | Set a commit status check (green/red) on the HEAD SHA | `true` |
| `targets` | Path to MCP config file for multi-server matrix scan | |
| `github-token` | Token for PR comments and commit statuses | `${{ github.token }}` |
| `node-version` | Node.js version | `22` |

## Outputs
Expand Down
18 changes: 10 additions & 8 deletions api/src/worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,9 @@ interface RunArtifact {
fatalError?: string;
}

// ---------------------------------------------------------------------------
// Score computation (ported from src/score.ts)
// ---------------------------------------------------------------------------
// ── Score computation (duplicated from src/score.ts) ────────────────────────
// IMPORTANT: This logic is duplicated from src/score.ts because the Worker
// can't import from the main package. Keep both files in sync when making changes.

const STATUS_SCORES: Record<string, number> = {
pass: 100,
Expand Down Expand Up @@ -214,6 +214,8 @@ function scorePerformance(
);
const p95 = sorted[p95Index] ?? 0;

// p95 latency thresholds for performance scoring
// <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), ≥5s = poor (20)
let score: number;
if (p95 < 500) score = 100;
else if (p95 < 1000) score = 80;
Expand All @@ -237,11 +239,11 @@ function computeHealthScore(
performanceMetrics?: PerformanceMetrics,
): HealthScore {
const w = {
protocolCompliance: 0.3,
schemaQuality: 0.2,
security: 0.2,
reliability: 0.2,
performance: 0.1,
protocolCompliance: 0.3, // Highest — spec compliance is foundational for interop
schemaQuality: 0.2, // Good schemas enable AI agents to use tools correctly
security: 0.2, // Parity with quality — both critical for production use
reliability: 0.2, // Tools/prompts/resources actually responding as expected
performance: 0.1, // Lowest — latency matters less than correctness
};

const dimensions: ScoreDimension[] = [
Expand Down
2 changes: 2 additions & 0 deletions github-app/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# MCP Observatory GitHub App

> **Status**: Planned feature — not yet deployed. This is the future hosted Observatory GitHub App.

A GitHub App that automatically analyzes MCP server configurations in pull requests and posts health score reports as PR comments.

## Setup
Expand Down
8 changes: 8 additions & 0 deletions src/commands/ci-report.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { readdir, readFile } from "node:fs/promises";
import path from "node:path";
import type { Command } from "commander";
import type { RunArtifact } from "../types.js";
import { buildEvent, recordEvent } from "../telemetry.js";
import { validateRunArtifact } from "../validate.js";
import { defaultRunsDirectory } from "../storage.js";

Expand Down Expand Up @@ -96,6 +97,13 @@ export function registerCiReportCommands(program: Command): void {
process.stdout.write(JSON.stringify(report, null, 2) + "\n");
}

recordEvent(buildEvent("command_complete", "ci-report", "cli", {
nightlyScan: true,
issueCreated: report.hasRegressions,
matrixServerCount: report.serverCount,
matrixFailCount: report.failCount,
}));

if (report.hasRegressions) {
process.exitCode = 1;
}
Expand Down
5 changes: 5 additions & 0 deletions src/commands/history.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { Command } from "commander";
import { readHistory, getTrend, renderTrendLabel } from "../history.js";
import { buildEvent, recordEvent } from "../telemetry.js";
import { ANSI, c } from "./helpers.js";

export function registerHistoryCommands(program: Command): void {
Expand Down Expand Up @@ -58,5 +59,9 @@ export function registerHistoryCommands(program: Command): void {
` ${paddedId} ${c(gradeColor, current.grade)} (${current.healthScore}) ${label}\n`,
);
}

recordEvent(buildEvent("command_complete", "history", "cli", {
historyEntryCount: history.entries.length,
}));
});
}
15 changes: 15 additions & 0 deletions src/commands/lock.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { Command } from "commander";

import { scanForTargets } from "../discovery.js";
import { buildEvent, recordEvent } from "../telemetry.js";
import {
readLockFile,
writeLockFile,
Expand Down Expand Up @@ -82,6 +83,11 @@ export function registerLockCommands(program: Command): void {
process.stdout.write(
`\n ${c(ANSI.green, "✓")} Locked ${entries.length} server${entries.length === 1 ? "" : "s"} to ${lockPath}\n\n`,
);

recordEvent(buildEvent("command_complete", "lock", "cli", {
lockFileExists: true,
lockServerCount: entries.length,
}));
});

lockCmd
Expand Down Expand Up @@ -109,6 +115,7 @@ export function registerLockCommands(program: Command): void {
);

let anyFailed = false;
let totalDriftCount = 0;

for (const t of targets) {
const lockEntry = lockMap.get(t.config.targetId);
Expand All @@ -129,6 +136,7 @@ export function registerLockCommands(program: Command): void {
process.stdout.write(` ${c(ANSI.green, "✓")} ${t.config.targetId}\n`);
} else {
anyFailed = true;
totalDriftCount += result.drift.length;
process.stdout.write(` ${c(ANSI.red, "✗")} ${t.config.targetId}\n`);
for (const d of result.drift) {
process.stdout.write(
Expand All @@ -145,6 +153,13 @@ export function registerLockCommands(program: Command): void {

process.stdout.write("\n");

recordEvent(buildEvent("command_complete", "lock-verify", "cli", {
lockFileExists: true,
lockServerCount: lock.servers.length,
lockDriftDetected: anyFailed,
lockDriftCount: totalDriftCount,
}));

if (anyFailed) {
process.exitCode = 1;
}
Expand Down
25 changes: 20 additions & 5 deletions src/commands/scan.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import {
} from "../index.js";
import { appendHistory, buildHistoryEntry } from "../history.js";
import { buildEvent, recordEvent } from "../telemetry.js";
import type { RunArtifact } from "../types.js";
import { TOOL_VERSION } from "../version.js";
import { ANSI, LOGO, c, useColor } from "./helpers.js";

// ── Scan implementation ─────────────────────────────────────────────────────

async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean): Promise<void> {
async function runScan(bin: string, configPath: string | undefined, invokeTools: boolean, securityCheck?: boolean, format?: string): Promise<void> {
const t0 = Date.now();
process.stdout.write(useColor() ? c(ANSI.cyan, LOGO) + ` ${c(ANSI.dim, `v${TOOL_VERSION}`)}\n\n` : LOGO + ` v${TOOL_VERSION}\n\n`);

Expand Down Expand Up @@ -53,6 +54,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
}

const results: ScanRow[] = [];
const artifacts: RunArtifact[] = [];
const checkStatusMap: Record<string, string> = {};
let passCount = 0;
let failCount = 0;
Expand All @@ -64,6 +66,7 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
process.stdout.write(` ${c(ANSI.dim, "⟳")} Checking ${c(ANSI.bold, t.config.targetId)}...`);
try {
const artifact = await runTarget(t.config, { invokeTools, securityCheck });
artifacts.push(artifact);
const toolsCheck = artifact.checks.find((ch) => ch.id === "tools");
const promptsCheck = artifact.checks.find((ch) => ch.id === "prompts");
const resourcesCheck = artifact.checks.find((ch) => ch.id === "resources");
Expand Down Expand Up @@ -164,6 +167,12 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
}
process.stdout.write("\n");

if (format === "pr-comment-matrix" && artifacts.length > 0) {
const { renderMatrixComment } = await import("../reporters/pr-comment-matrix.js");
const rows = artifacts.map(a => ({ artifact: a }));
process.stdout.write(renderMatrixComment(rows) + "\n");
}

recordEvent(buildEvent("command_complete", "scan", "cli", {
serversScanned: results.length,
toolsFound: totalTools,
Expand All @@ -178,6 +187,9 @@ async function runScan(bin: string, configPath: string | undefined, invokeTools:
t.config.adapter === "http" ? (t.config as { url: string }).url : `${(t.config as { command: string }).command} ${t.config.args.join(" ")}`,
),
checkStatuses: checkStatusMap,
matrixServerCount: results.length,
matrixPassCount: passCount,
matrixFailCount: failCount,
}));

if (failCount > 0) {
Expand All @@ -193,11 +205,12 @@ export function registerScanCommands(program: Command, bin: string): void {
.description("Check all MCP servers in your Claude configs.")
.option("--config <path>", "Path to a specific MCP config file.")
.option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
.option("--format <format>", "Output format: terminal or pr-comment-matrix.", "terminal")
.option("--no-color", "Disable colored output.");

// `scan` with no subcommand — basic scan
scanCmd.action(async (options: { config?: string; security?: boolean }) => {
await runScan(bin, options.config, false, options.security);
scanCmd.action(async (options: { config?: string; security?: boolean; format: string }) => {
await runScan(bin, options.config, false, options.security, options.format);
});

// `scan deep` — scan + invoke tools
Expand All @@ -206,10 +219,12 @@ export function registerScanCommands(program: Command, bin: string): void {
.description("Scan and also invoke safe tools to verify they execute.")
.option("--config <path>", "Path to a specific MCP config file.")
.option("--security", "Run deep security scan (credential patterns, response analysis). Lightweight security is always included.")
.action(async (options: { config?: string; security?: boolean }) => {
.option("--format <format>", "Output format: terminal or pr-comment-matrix.", "terminal")
.action(async (options: { config?: string; security?: boolean; format: string }) => {
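// Fall back to the parent `scan` command's config/security/format options when
// they are not set on `scan deep` itself.
// NOTE(review): `--format` is declared with a default ("terminal") on this
// subcommand, so `options.format` is presumably never undefined and the
// `?? parentFormat` fallback below may never apply — confirm.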
const parentConfig = scanCmd.opts().config as string | undefined;
const parentSecurity = scanCmd.opts().security as boolean | undefined;
await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true);
const parentFormat = scanCmd.opts().format as string;
await runScan(bin, options.config ?? parentConfig, true, options.security ?? parentSecurity ?? true, options.format ?? parentFormat);
});
}
15 changes: 10 additions & 5 deletions src/score.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// IMPORTANT: Scoring logic is duplicated in api/src/worker.ts for the Cloudflare Worker
// deployment (which can't import from src/). Keep both files in sync when making changes.

import type { CheckResult, HealthGrade, HealthScore, PerformanceMetrics, ScoreDimension } from "./types.js";

export interface ScoreWeights {
Expand All @@ -9,11 +12,11 @@ export interface ScoreWeights {
}

export const DEFAULT_WEIGHTS: ScoreWeights = {
protocolCompliance: 0.30,
schemaQuality: 0.20,
security: 0.20,
reliability: 0.20,
performance: 0.10,
protocolCompliance: 0.30, // Highest — spec compliance is foundational for interop
schemaQuality: 0.20, // Good schemas enable AI agents to use tools correctly
security: 0.20, // Parity with quality — both critical for production use
reliability: 0.20, // Tools/prompts/resources actually responding as expected
performance: 0.10, // Lowest — latency matters less than correctness
};

const STATUS_SCORES: Record<string, number> = {
Expand Down Expand Up @@ -80,6 +83,8 @@ function scorePerformance(
const p95Index = Math.min(Math.ceil(sorted.length * 0.95) - 1, sorted.length - 1);
const p95 = sorted[p95Index] ?? 0;

// p95 latency thresholds for performance scoring
// <500ms = excellent (100), <1s = good (80), <2s = acceptable (60), <5s = slow (40), ≥5s = poor (20)
let score: number;
if (p95 < 500) score = 100;
else if (p95 < 1000) score = 80;
Expand Down
Loading
Loading