Chris0Jeky · Chris0Jeky · Apr 12, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
@@ -107,6 +107,16 @@ jobs:
       dotnet-version: 8.0.x
       node-version: 24.13.1
 
+  # Opt-in visual regression run: triggered by a manual dispatch, or by a pull
+  # request that carries the `testing` or `visual` label. The work itself is
+  # delegated to the reusable workflow referenced in `uses`.
+  visual-regression:
+    name: Visual Regression
+    needs:
+      - backend-solution
+    # Folded scalar: the lines below join into one single-line expression.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'pull_request' &&
+      (contains(github.event.pull_request.labels.*.name, 'testing') ||
+      contains(github.event.pull_request.labels.*.name, 'visual')))
+    uses: ./.github/workflows/reusable-visual-regression.yml
+    with:
+      node-version: 24.13.1
+      dotnet-version: 8.0.x
+
   load-concurrency-harness:
     name: Load and Concurrency Harness
     if: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'testing'))

@@ -0,0 +1,120 @@
+# Callable workflow (workflow_call only) that runs the Playwright visual suite.
+name: Reusable Visual Regression
+
+on:
+  workflow_call:
+    # Both inputs are optional; the defaults apply when the caller omits them.
+    inputs:
+      dotnet-version:
+        type: string
+        required: false
+        default: '8.0.x'
+        description: '.NET SDK version used for backend setup'
+      node-version:
+        type: string
+        required: false
+        default: '24.13.1'
+        description: 'Node.js version used for frontend setup'
+
+# Token permissions declared for this workflow: read access to contents.
+permissions:
+  contents: read
+
+# Workflow-wide environment: NUGET_PACKAGES points at a folder in the workspace.
+env:
+  NUGET_PACKAGES: ${{ github.workspace }}/.nuget/packages
+
+jobs:
+  visual-regression:
+    name: Visual Regression
+    runs-on: ubuntu-latest
+    timeout-minutes: 20
+    steps:
+      # --- Toolchain and dependency setup ---------------------------------
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      # SDK version comes from the `dotnet-version` workflow input; the cache
+      # is keyed on the solution and project files listed below.
+      - name: Setup .NET
+        uses: actions/setup-dotnet@v5
+        with:
+          cache: true
+          cache-dependency-path: |
+            backend/Taskdeck.sln
+            backend/**/*.csproj
+          dotnet-version: ${{ inputs.dotnet-version }}
+
+      # Node version comes from the `node-version` workflow input; the npm
+      # cache is keyed on the frontend lockfile.
+      - name: Setup Node
+        uses: actions/setup-node@v6
+        with:
+          cache: npm
+          cache-dependency-path: frontend/taskdeck-web/package-lock.json
+          node-version: ${{ inputs.node-version }}
+
+      - name: Restore backend
+        run: dotnet restore backend/Taskdeck.sln
+
+      - name: Install frontend dependencies
+        run: npm ci
+        working-directory: frontend/taskdeck-web
+
+      # The cache key changes whenever the frontend lockfile changes.
+      - name: Cache Playwright browsers
+        uses: actions/cache@v5
+        with:
+          key: ms-playwright-${{ runner.os }}-${{ hashFiles('frontend/taskdeck-web/package-lock.json') }}
+          path: ~/.cache/ms-playwright
+
+      # Only Chromium is installed, together with its system dependencies.
+      - name: Install Playwright browser
+        run: npx playwright install --with-deps chromium
+        working-directory: frontend/taskdeck-web
+
+      # Delete a leftover database file if present; a missing file is fine.
+      - name: Remove stale visual E2E database
+        run: rm -f -- taskdeck.e2e.visual.ci.db
+        working-directory: frontend/taskdeck-web
+
+      # Sets the step output `exist` to "true" when at least one PNG is found
+      # under tests/visual/__screenshots__, otherwise to "false" together with
+      # a warning annotation. Read elsewhere as steps.baselines.outputs.exist.
+      - name: Check for existing baselines
+        id: baselines
+        working-directory: frontend/taskdeck-web
+        run: |
+          baseline_dir="tests/visual/__screenshots__"
+          first_png=""
+          if [ -d "$baseline_dir" ]; then
+            # A single match is enough to know baselines are present.
+            first_png="$(find "$baseline_dir" -name '*.png' 2>/dev/null | head -1)"
+          fi
+
+          if [ -z "$first_png" ]; then
+            echo "exist=false" >> "$GITHUB_OUTPUT"
+            echo "::warning::No baseline screenshots found. Running with --update-snapshots to generate initial baselines. Download the visual-regression-baselines artifact and commit them."
+            exit 0
+          fi
+
+          echo "exist=true" >> "$GITHUB_OUTPUT"
+
+      # Runs the visual suite. When the baseline check reported that no
+      # baselines exist, snapshots are written with --update-snapshots instead
+      # of being compared.
+      - name: Run visual regression tests
+        timeout-minutes: 12
+        working-directory: frontend/taskdeck-web
+        env:
+          CI: "true"
+          TASKDECK_E2E_DB: taskdeck.e2e.visual.ci.db
+          TASKDECK_RUN_DEMO: "0"
+          # Never try to open the HTML report in a browser on the runner.
+          PLAYWRIGHT_HTML_OPEN: never
+          # Hand the step output to the script through the environment rather
+          # than splicing an expression into the shell source.
+          BASELINES_EXIST: ${{ steps.baselines.outputs.exist }}
+        run: |
+          # --reporter on the command line replaces the reporters from the
+          # config file, so `html` must be listed here; otherwise the
+          # playwright-report/ folder that a later failure-only step uploads
+          # is never produced.
+          args=(--config playwright.visual.config.ts --reporter=line,html)
+          if [ "$BASELINES_EXIST" = "false" ]; then
+            args+=(--update-snapshots)
+          fi
+          npx playwright test "${args[@]}"
+
+      # Publish whatever baselines were generated, even when some tests failed
+      # during generation. A bare `if:` carries an implicit success() check,
+      # which would skip this step after a failure and discard the snapshots
+      # that were written. `!cancelled()` keeps the upload for failed runs
+      # while still skipping it for cancelled ones.
+      - name: Upload generated baselines
+        if: ${{ !cancelled() && steps.baselines.outputs.exist == 'false' }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: visual-regression-baselines
+          path: frontend/taskdeck-web/tests/visual/__screenshots__/
+          if-no-files-found: warn
+          retention-days: 30
+
+      # Failure-only diagnostics, retained for 14 days. A missing directory is
+      # ignored instead of failing the upload.
+      - name: Upload visual diff artifacts
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: visual-regression-diffs
+          retention-days: 14
+          if-no-files-found: ignore
+          path: frontend/taskdeck-web/test-results/
+
+      - name: Upload Playwright HTML report
+        if: ${{ failure() }}
+        uses: actions/upload-artifact@v7
+        with:
+          name: visual-regression-report
+          retention-days: 14
+          if-no-files-found: ignore
+          path: frontend/taskdeck-web/playwright-report
diff --git a/docs/STATUS.md b/docs/STATUS.md
@@ -28,6 +28,7 @@ Current constraints are mostly hardening and consistency:
 - LLM flow now supports config-gated `OpenAI` and `Gemini` providers with deterministic `Mock` fallback for safe local/test posture; degraded provider responses are now structurally distinct (`messageType: "degraded"` + `degradedReason`) and the health endpoint supports opt-in probe verification (`?probe=true`); chat-to-proposal pipeline improvements delivered: `LlmIntentClassifier` now uses compiled regex patterns with word-distance matching, stemming/plurals, broader verb coverage, and negative context filtering for negations and other-tool questions (`#571`); parse failures now return structured hint payloads with closest-match suggestions and a frontend hint card with "try this instead" pre-fill (`#572`); dedicated classifier and chat-to-proposal integration test coverage added (`#577`); LLM-assisted instruction extraction now delivered (`#573`): OpenAI and Gemini providers request structured JSON output with a system prompt describing supported instruction patterns, parse the response into `LlmCompletionResult.Instructions`, and fall back to the static `LlmIntentClassifier` when structured parsing fails; `ChatService` iterates LLM-extracted instructions (supporting multiple proposals from a single message) and falls back to raw user message parsing when no instructions are extracted; Mock provider unchanged for deterministic test behavior; multi-instruction batch parsing now delivered (`#574`): `ParseBatchInstructionAsync` splits multiple natural-language instructions into individual planner calls, `ChatService` routes multi-instruction messages through batch parsing to generate multiple proposals from a single chat message; board-context LLM prompting now delivered (`#575`, expanded in `#617`): `BoardContextBuilder` constructs bounded board context (columns, card IDs, titles, labels) grouped per column and appends it to system prompts across OpenAI and Gemini providers via `LlmSystemPromptBuilder`; card IDs are included as first-8 hex chars so the LLM can 
generate `move card <id>` instructions; context budget increased to 4000 chars with single-query card fetch; **remaining gap**: conversational refinement (`#576`) remains undelivered; analysis at `docs/analysis/2026-03-29_chat_nlp_proposal_gap.md`
 - managed-key shared-token abuse-control strategy is now explicitly seeded in `#235` to `#240` before broad external exposure
 - testing-harness guardrail expansion from `#254` to `#260` is shipped; remaining work is normal follow-up hardening rather than the original wave
+- visual regression harness delivered (`#88`): Playwright-based screenshot comparison for 7 key UI surfaces (board empty/populated, command palette open/search, archive, inbox, home); separate `playwright.visual.config.ts` with fixed viewport (1280x720), animations disabled, 0.5% pixel tolerance; CI Extended integration via `reusable-visual-regression.yml` with diff artifact upload on failure; policy document at `docs/testing/VISUAL_REGRESSION_POLICY.md`
 - rigorous test expansion wave seeded 2026-04-03 (`#721` tracker, 22 issues `#699`–`#726`): systematic codebase audit identified 25+ untested infrastructure repositories, zero tests on the central worker, 6 controllers with untested HTTP surfaces, and no golden-path integration test for the capture → proposal → board pipeline; execution is tracked in `docs/TESTING_GUIDE.md`; first delivery: infrastructure repository integration tests (`#699`/`#730` — 77 tests across 7 repo classes against real SQLite); **major wave delivery 2026-04-04** (PRs `#732`–`#739`, 8 issues, ~300 new tests): SEC-20 ChangePassword fix (`#722`/`#732`), golden-path capture→board integration test (`#703`/`#735` — 7 tests proving full pipeline), cross-user data isolation tests (`#704`/`#733` — 38 tests across all major API boundaries), LlmQueueToProposalWorker integration tests (`#700`/`#734` — 24 tests, previously zero coverage), controller HTTP integration tests (`#702`/`#738` — 67 tests covering 6 untested controllers, found 2 pre-existing bugs), proposal lifecycle edge cases (`#708`/`#736` — 74 tests for state machine/expiry/race conditions), OAuth/auth edge cases (`#707`/`#737` — 44 tests, found and fixed `Substring` overflow bug in `ExternalLoginAsync`), MCP full resource/tool inventory (`#653`/`#739` — 9 resources + 11 tools with 42 tests, GP-06 compliant, user-scoping gap fixed during review); **second wave delivery 2026-04-04** (PRs `#740`–`#755`, 8 issues, ~586 new tests with two rounds of adversarial review, 47 review-fix commits): domain entity state machine exhaustive tests (`#701`/`#740` — 174 tests across 7 entities: CommandRun, ArchiveItem, ChatSession, UserPreference, NotificationPreference, CardLabel, CardCommentMention), SignalR hub and realtime integration tests (`#706`/`#751` — 19 tests covering auth, presence lifecycle, multi-user, authorization, edge cases), LLM provider abstraction and tool-calling edge cases (`#709`/`#747` — 101 tests across orchestrator, provider, 
classifier, registry), data export/import round-trip integrity tests (`#713`/`#752` — 64 tests covering JSON, CSV, GDPR, database, cross-format validation), API error contract regression and boundary validation (`#714`/`#753` — 57 tests across 7 endpoint families with GP-03 contract enforcement), archive and restore lifecycle integration tests (`#715`/`#755` — 74 tests: 45 domain + 29 API covering state machine, cross-user isolation, conflict detection, audit trail), board metrics and analytics accuracy verification (`#718`/`#749` — 61 tests: 51 service + 10 controller covering throughput, cycle time, WIP, blocked cards, done-column heuristic), notification delivery, deduplication, and preference filtering (`#719`/`#746` — 36 tests covering all 5 notification types, deduplication, preference filtering, cross-user isolation, batch operations)
 - MVP dogfooding flow now supports canonical checklist bootstrap in chat (proposal-first, board-scoped); broader template coverage remains future work
 - collaborative editing now includes board/card presence visibility and conflict-hinting guardrails for stale card writes

diff --git a/docs/TESTING_GUIDE.md b/docs/TESTING_GUIDE.md
@@ -596,6 +596,36 @@ cd frontend/taskdeck-web
 npm run test:e2e:audit:headed
 ```
 
+## Visual Regression Tests
+
+Visual regression tests capture baseline screenshots of key UI surfaces and compare them against future renders to catch unintended layout changes.
+
+**Policy document**: `docs/testing/VISUAL_REGRESSION_POLICY.md` (thresholds, false-positive mitigation, baseline management)
+
+**Test location**: `frontend/taskdeck-web/tests/visual/`
+
+**Config**: `frontend/taskdeck-web/playwright.visual.config.ts`
+
+**Covered surfaces**: board view (empty + populated), command palette (open + search), archive view, inbox/capture view, home view
+
+Run visual tests:
+
+```bash
+cd frontend/taskdeck-web
+npm run test:visual
+```
+
+Update baselines after intentional UI changes:
+
+```bash
+cd frontend/taskdeck-web
+npm run test:visual:update
+```
+
+Key settings: fixed viewport 1280x720, animations disabled, 0.5% pixel tolerance, platform-specific baselines (CI canonical platform: ubuntu-latest).
+
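+CI integration: runs in the CI Extended pipeline on manual `workflow_dispatch` runs and on pull requests labeled `testing` or `visual`. Diff artifacts are uploaded on failure for review. If no baseline screenshots are committed yet, the run generates them with `--update-snapshots` and uploads them as the `visual-regression-baselines` artifact so they can be downloaded and committed.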
+
 ## Demo Tooling Policy
 
 Default CI posture: