Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions .github/workflows/measurement-validation.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Measurement Validation workflow.
#
# Type-checks the project, runs the layout unit tests, then produces two
# best-effort JSON reports (performance trends and regression detection).
# The reports are uploaded as artifacts and summarized in a PR comment.
name: Measurement Validation

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

permissions:
  contents: read
  # Needed by the "Post PR summary" step to create a comment on the PR.
  pull-requests: write

jobs:
  validate:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v5

      - uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: TypeScript type-check
        run: bun run check

      - name: Run unit tests
        run: bun test src/layout.test.ts

      # Best-effort: `|| true` keeps a failing report from failing the job.
      - name: Performance trends (chrome)
        run: bun run validator:trends --browser=chrome --json > /tmp/perf-chrome.json || true

      # Only stdout is captured. Mixing stderr into the file would make it
      # invalid JSON and the PR summary would silently show "(no data)".
      - name: Regression detection
        id: regression
        run: |
          bun run validator:regression-detect --json > /tmp/regressions.json || true
          cat /tmp/regressions.json

      - name: Upload validation artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: measurement-validation-results
          path: |
            /tmp/perf-chrome.json
            /tmp/regressions.json
          if-no-files-found: warn

      # Skipped for pull requests from forks: their token is read-only, so
      # creating a comment would fail the job.
      - name: Post PR summary
        if: github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs')
            let perfSummary = '_(no data)_'
            let regressionSummary = '_(no data)_'

            // Each report is optional: on a missing or malformed file keep
            // the "(no data)" placeholder, but log why.
            try {
              const perf = JSON.parse(fs.readFileSync('/tmp/perf-chrome.json', 'utf-8'))
              const degraded = (perf.metrics || []).filter(m => m.trend === 'degrading')
              perfSummary = degraded.length === 0
                ? '✅ All benchmarks within expected range'
                : `⚠️ ${degraded.length} degraded benchmark(s)`
            } catch (err) {
              core.warning(`Could not read performance report: ${err.message}`)
            }

            try {
              const reg = JSON.parse(fs.readFileSync('/tmp/regressions.json', 'utf-8'))
              const total =
                (reg.accuracyRegressions || []).length +
                (reg.performanceRegressions || []).length
              regressionSummary = reg.hasBlocker
                ? `❌ Critical regression(s) detected — ${total} issue(s)`
                : total > 0
                  ? `⚠️ ${total} regression(s) detected`
                  : '✅ No regressions detected'
            } catch (err) {
              core.warning(`Could not read regression report: ${err.message}`)
            }

            const body = [
              '## 📊 Measurement Validator Results',
              '',
              `**Performance (Chrome):** ${perfSummary}`,
              `**Regressions:** ${regressionSummary}`,
              '',
              `_Workflow run: [${context.runId}](https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId})_`,
            ].join('\n')

            // Await the request so an API failure is reported by this step
            // instead of surfacing as an unhandled promise rejection.
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body,
            })
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,11 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json

# Finder (MacOS) folder config
.DS_Store

# measurement-validator runtime files
.measurement-results.db
.measurement-results.db-shm
.measurement-results.db-wal

# npm lockfile (project uses bun.lock)
package-lock.json
211 changes: 211 additions & 0 deletions docs/measurement-validator/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
# Measurement Validator — Phase 4 Documentation

## Overview

The measurement-validator Phase 4 components add GitHub CI integration,
performance tracking, regression detection, a live dashboard server, SQLite
persistence, and Slack notifications on top of the existing accuracy and
benchmark infrastructure.

All components are built with TypeScript and Bun's built-in APIs — no
extra runtime dependencies are needed beyond what is already in
`package.json`.

---

## Components

### 1. GitHub Actions Workflow

**File:** `.github/workflows/measurement-validation.yml`

Runs automatically on every push to `main` and on every pull request targeting `main`:

- TypeScript type-check (`bun run check`)
- Unit tests (`bun test src/layout.test.ts`)
- Performance trends for Chrome
- Regression detection across configured browsers
- Uploads JSON artifacts (performance + regressions)
- Posts a summary comment to open PRs

### 2. Performance Tracker

**File:** `src/measurement-validator/performance-tracker.ts`

Loads benchmark snapshots from `benchmarks/<browser>.json`, compares each
entry against a baseline stored in `.measurement-baseline.json`, and
produces a `PerformanceReport`.

```typescript
import { trackPerformance, writeBaseline, formatPerformanceReport } from './performance-tracker.js'

// Compare current benchmarks against baseline
const report = await trackPerformance('chrome', { warnPct: 10, criticalPct: 25 })
console.log(formatPerformanceReport(report))

// Write a new baseline from current snapshots
await writeBaseline(['chrome', 'safari'])
```

### 3. Regression Detector

**File:** `src/measurement-validator/regression-detector.ts`

Detects accuracy and performance regressions across multiple browsers.

```typescript
import { detectRegressions, formatRegressionReport } from './regression-detector.js'

const report = await detectRegressions(['chrome', 'safari', 'firefox'])
console.log(formatRegressionReport(report))

if (report.hasBlocker) process.exit(1)
```

### 4. Dashboard Server

**File:** `src/measurement-validator/dashboard-server.ts`

An HTTP server (Bun.serve) that exposes the accuracy/benchmark/status data
as a JSON API and serves an embedded HTML dashboard.

```typescript
import { DashboardServer } from './dashboard-server.js'

const server = new DashboardServer({ port: 3001 })
server.start()
// http://localhost:3001 — dashboard UI
// http://localhost:3001/api/status — status JSON
// http://localhost:3001/api/accuracy/chrome — accuracy data
```

**API endpoints:**

| Method | Path | Description |
|--------|------|-------------|
| GET | `/` | HTML dashboard |
| GET | `/health` | Liveness check |
| GET | `/api/status` | `status/dashboard.json` |
| GET | `/api/accuracy/:browser` | `accuracy/<browser>.json` |
| GET | `/api/benchmarks/:browser` | `benchmarks/<browser>.json` |
| GET | `/api/runs` | Recent validation runs (SQLite) |
| GET | `/api/runs/summaries` | High-level trend summaries |
| POST | `/api/runs` | Insert a new run record |

### 5. Results Database

**File:** `src/measurement-validator/results-database.ts`

SQLite persistence via Bun's built-in `bun:sqlite`. Stores validation run
records with accuracy, benchmark, and regression data.

```typescript
import { ResultsDatabase } from './results-database.js'

const db = new ResultsDatabase()

db.insertRun({
runAt: new Date().toISOString(),
browser: 'chrome',
accuracyTotal: 7680,
accuracyMatches: 7680,
benchmarkJson: JSON.stringify(benchmarkReport),
regressionJson: JSON.stringify(regressionReport),
tags: 'pr:123',
})

const recent = db.queryRuns({ browser: 'chrome', limit: 20 })
const summaries = db.querySummaries({ since: '2026-01-01T00:00:00Z' })
db.close()
```

### 6. Slack Notifier

**File:** `src/measurement-validator/slack-notifier.ts`

Sends formatted Slack messages via an Incoming Webhook URL. Reads the URL
from the `SLACK_WEBHOOK_URL` environment variable when using the factory helper.

```typescript
import { SlackNotifier, createSlackNotifierFromEnv } from './slack-notifier.js'

const notifier = createSlackNotifierFromEnv() // reads SLACK_WEBHOOK_URL
if (notifier) {
await notifier.notifyRegressionReport(report)
await notifier.notifyPerformanceReport(perfReport)
await notifier.notifyText('Custom message')
}
```

---

## CLI Scripts

### `bun run validator:dashboard`

Start the dashboard HTTP server.

```
bun run validator:dashboard [--port=3001] [--host=127.0.0.1] [--no-db]
```

### `bun run validator:trends`

Print performance trend report.

```
bun run validator:trends [--browser=chrome] [--warn=10] [--critical=25] [--json]
```

### `bun run validator:watch`

Watch the `accuracy/` and `benchmarks/` directories and re-run regression
detection whenever a snapshot file changes.

```
bun run validator:watch [--browsers=chrome,safari,firefox] [--slack-webhook=<url>]
```

### `bun run validator:regression-detect`

Run one-shot regression detection (used in CI).

```
bun run validator:regression-detect [--browsers=chrome] [--json] [--fail-on-critical]
```

---

## Configuration

### Performance Baseline

Write a baseline from the current benchmark snapshots:

```bash
bun -e "import('./src/measurement-validator/performance-tracker.js').then(m => m.writeBaseline(['chrome', 'safari']))"
```

This creates `.measurement-baseline.json`, which is checked into version
control. Commit it alongside any intentional performance changes.

### Slack Webhook

Set the `SLACK_WEBHOOK_URL` environment variable (e.g. in a GitHub Actions
secret) to enable Slack notifications. The notifier is disabled silently
when the variable is absent.

---

## Data Files

| File | Purpose |
|------|---------|
| `accuracy/chrome.json` | Chrome accuracy snapshot (baseline) |
| `accuracy/safari.json` | Safari accuracy snapshot (baseline) |
| `accuracy/firefox.json` | Firefox accuracy snapshot (baseline) |
| `benchmarks/chrome.json` | Chrome benchmark snapshot |
| `benchmarks/safari.json` | Safari benchmark snapshot |
| `status/dashboard.json` | Aggregated status dashboard |
| `.measurement-baseline.json` | Performance baseline (generated, commit after intentional changes) |
| `.measurement-results.db` | SQLite results history (not committed) |
7 changes: 6 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"src",
"!src/layout.test.ts",
"!src/test-data.ts",
"!src/measurement-validator",
"pages/demos",
"pages/assets"
],
Expand Down Expand Up @@ -68,7 +69,11 @@
"site:build": "rm -rf site && bun run scripts/build-demo-site.ts",
"start": "HOST=${HOST:-127.0.0.1}; PORT=3000; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Freeing port $PORT: terminating $pids\"; kill $pids 2>/dev/null || true; sleep 1; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Port $PORT still busy: killing $pids\"; kill -9 $pids 2>/dev/null || true; fi; fi; bun pages/*.html pages/demos/*.html pages/demos/*/index.html --host=$HOST:$PORT",
"start:lan": "HOST=0.0.0.0 bun run start",
"start:watch": "HOST=${HOST:-127.0.0.1}; PORT=3000; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Freeing port $PORT: terminating $pids\"; kill $pids 2>/dev/null || true; sleep 1; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Port $PORT still busy: killing $pids\"; kill -9 $pids 2>/dev/null || true; fi; fi; bun pages/*.html pages/demos/*.html pages/demos/*/index.html --watch --no-clear-screen --host=$HOST:$PORT"
"start:watch": "HOST=${HOST:-127.0.0.1}; PORT=3000; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Freeing port $PORT: terminating $pids\"; kill $pids 2>/dev/null || true; sleep 1; pids=$(lsof -tiTCP:$PORT -sTCP:LISTEN 2>/dev/null); if [ -n \"$pids\" ]; then echo \"Port $PORT still busy: killing $pids\"; kill -9 $pids 2>/dev/null || true; fi; fi; bun pages/*.html pages/demos/*.html pages/demos/*/index.html --watch --no-clear-screen --host=$HOST:$PORT",
"validator:dashboard": "bun run scripts/validator-dashboard.ts",
"validator:trends": "bun run scripts/validator-trends.ts",
"validator:watch": "bun run scripts/validator-watch.ts",
"validator:regression-detect": "bun run scripts/validator-regression-detect.ts"
},
"devDependencies": {
"@types/bun": "latest",
Expand Down
35 changes: 35 additions & 0 deletions scripts/validator-dashboard.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bun
// validator-dashboard.ts — start the measurement-validator HTTP dashboard.
//
// Usage:
// bun run scripts/validator-dashboard.ts [--port=3001] [--host=127.0.0.1] [--no-db]
//
// Flags:
// --port=N Port to listen on (default 3001)
// --host=H Hostname/IP to bind (default 127.0.0.1)
// --no-db Disable SQLite persistence (serve read-only data only)

import { DashboardServer } from '../src/measurement-validator/dashboard-server.js'

/**
 * Looks up a `--<name>=<value>` option in `process.argv`.
 *
 * @param name Option name without the leading dashes or trailing `=`.
 * @returns The text after `=` of the first matching argument (possibly an
 *   empty string), or `null` when no argument starts with `--<name>=`.
 */
function parseFlag(name: string): string | null {
  const marker = `--${name}=`
  for (const candidate of process.argv) {
    if (candidate.startsWith(marker)) {
      return candidate.slice(marker.length)
    }
  }
  return null
}

/**
 * Reports whether the bare switch `--<name>` is present in `process.argv`.
 *
 * Only an exact match counts; `--<name>=value` does not.
 *
 * @param name Switch name without the leading dashes.
 */
function hasFlag(name: string): boolean {
  const wanted = `--${name}`
  return process.argv.some(candidate => candidate === wanted)
}

// Resolve CLI options, falling back to the documented defaults.
const portFlag = parseFlag('port')
const port = portFlag === null ? 3001 : Number(portFlag)

// Number('') is 0 and Number('abc') is NaN, so a mistyped --port would
// otherwise reach the server unnoticed. Reject it with a clear message.
if (
  portFlag !== null &&
  (portFlag.trim() === '' || !Number.isInteger(port) || port < 0 || port > 65535)
) {
  console.error(`Invalid --port value "${portFlag}": expected an integer between 0 and 65535`)
  process.exit(1)
}

const host = parseFlag('host') ?? '127.0.0.1'
const enableDatabase = !hasFlag('no-db')

const server = new DashboardServer({ port, host, enableDatabase })
server.start()

// Stop the server on Ctrl+C before exiting.
process.on('SIGINT', () => {
  console.log('\nShutting down dashboard server…')
  server.stop()
  process.exit(0)
})
Loading