33 changes: 33 additions & 0 deletions .github/workflows/puzzle-sqlite.yml
@@ -0,0 +1,33 @@
name: Build puzzle SQLite artifact

on:
push:
branches: [main]

jobs:
build-sqlite:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install sqlite-utils
run: python -m pip install --upgrade pip && python -m pip install -r scripts/requirements-sqlite-utils.txt

- name: Build puzzle database
run: python scripts/build_puzzle_db.py --db-path puzzle-data.db

- name: Capture short SHA
run: echo "SHORT_SHA=${GITHUB_SHA::7}" >> $GITHUB_ENV

- name: Upload SQLite artifact
uses: actions/upload-artifact@v4
with:
name: puzzle-sqlite-${{ env.SHORT_SHA }}
path: puzzle-data.db
if-no-files-found: error
10 changes: 10 additions & 0 deletions package.json
@@ -0,0 +1,10 @@
{
"name": "wordle",
"version": "1.0.0",
"description": "Utility scripts for the Wordle puzzle site.",
"type": "commonjs",
"private": true,
"scripts": {
"build:puzzle-db": "python scripts/build_puzzle_db.py"
}
}
42 changes: 42 additions & 0 deletions scripts/PUZZLE_DB_SCHEMA.md
@@ -0,0 +1,42 @@
# Puzzle SQLite artifact

This repository stores puzzle data as markdown files with YAML front matter under `content/w`, and Hugo emits structured exports
into the `public/` directory during a build. The `scripts/build_puzzle_db.py` helper uses [sqlite-utils](https://github.com/simonw/sqlite-utils)
from Simon Willison's SQLite toolkit to read those already-rendered exports and write them into a SQLite database that can be
uploaded as a CI artifact.

## What Hugo already outputs

Hugo produces machine-readable files that the loader ingests directly:

- Per-guest JSON results at `public/w/<puzzle-date>/<guest>.json` that include the `results` matrix and `enrichedResults` payloads.
- Raw TSV copies of submitted share text at `public/w/<puzzle-date>/guest-entries.tsv`.
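For reference, a per-guest JSON file might look like the hypothetical payload below; the key names (`results`, `enrichedResults`) come from the exports described above, while the concrete values are made up for the sketch. The loader maps the camelCase keys onto snake_case columns:

```python
import json

# Hypothetical per-guest export; only the key names are documented,
# the values here are illustrative.
raw = '{"results": [[2, 1, 0, 0, 0]], "enrichedResults": [{"guess": "crane"}]}'
payload = json.loads(raw)

# The loader flattens the payload into one table row per guest.
row = {
    "results": payload.get("results"),
    "enriched_results": payload.get("enrichedResults"),
}
print(row)
```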

## Schema

The generated database contains two tables derived from those exports:

| Column | Type | Purpose |
| --- | --- | --- |
| `guest_results.puzzle_date` | `TEXT` | Puzzle date derived from the `public/w/<date>` folder name. |
| `guest_results.guest` | `TEXT` | Guest name taken from the JSON filename. |
| `guest_results.results` | `JSON` | `results` array from the per-guest JSON export. |
| `guest_results.enriched_results` | `JSON` | `enrichedResults` array from the per-guest JSON export. |
| `guest_results.source_path` | `TEXT` | Relative path to the originating JSON file for traceability. |
| `guest_entries_raw.puzzle_date` | `TEXT PRIMARY KEY` | Puzzle date matching the TSV source folder. |
| `guest_entries_raw.guest_entries_tsv` | `TEXT` | Raw TSV content from `guest-entries.tsv`. |
| `guest_entries_raw.source_path` | `TEXT` | Relative path to the TSV file that was imported. |
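As a sketch of downstream use, the schema above can be queried with the standard-library `sqlite3` module. The sample row here is hypothetical and the example builds a tiny in-memory copy so it is self-contained; real use would connect to the `puzzle-data.db` file produced by `scripts/build_puzzle_db.py`:

```python
import sqlite3

# In-memory stand-in for puzzle-data.db with the guest_results schema.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE guest_results ("
    "puzzle_date TEXT, guest TEXT, results TEXT, "
    "enriched_results TEXT, source_path TEXT, "
    "PRIMARY KEY (puzzle_date, guest))"
)
conn.execute(
    "INSERT INTO guest_results VALUES (?, ?, ?, ?, ?)",
    (
        "2024-05-01",  # hypothetical puzzle date
        "alice",       # hypothetical guest
        '[["correct","present","absent","absent","absent"]]',
        "[]",
        "public/w/2024-05-01/alice.json",
    ),
)

# SQLite's JSON1 functions work directly on the serialized results column.
(guesses,) = conn.execute(
    "SELECT json_array_length(results) FROM guest_results WHERE guest = ?",
    ("alice",),
).fetchone()
print(guesses)
conn.close()
```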

## Artifact packaging

The workflow uploads the SQLite file with the `actions/upload-artifact` step. That action already compresses artifacts before storing them, so an additional tarball (e.g., `tar.gz`) is unnecessary. Keeping the raw `.db` file simplifies local use while still benefiting from GitHub's built-in compression during transfer and storage.

## Implementation language considerations

When building the database locally or in CI, the simplest approach is to pick a single runtime that can parse YAML and talk to SQLite without shelling out to other languages. The main trade-offs are:

- **Python** — Ships with `sqlite3` in the standard library, so only `pyyaml` is required for YAML parsing. The code stays readable, and dependency weight stays low (one third-party package).
- **Ruby** — Bundles YAML parsing via the standard library (`Psych`), so there are no extra packages for that, but it does require the `sqlite3` gem. This is still a concise single-language solution if Ruby is already available in the workflow.
- **Node.js** — Requires installing both a YAML parser (e.g., `yaml`/`js-yaml`) and either a SQLite client library or the `sqlite3` CLI. While perfectly viable, it usually introduces more moving parts than Python or Ruby for this specific task.

If minimizing dependencies is the top priority, **Python** tends to be the lightest option because it pairs a built-in SQLite client with a single YAML dependency. A pure Ruby script is a close alternative when Ruby is first-class in the environment. Favor a single-language script over a mixed pipeline to keep the build logic easy to read and maintain.
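A minimal sketch of that "stdlib `sqlite3` plus one YAML package" pairing, assuming PyYAML is installed (`pip install pyyaml`) and using a made-up front-matter snippet:

```python
import sqlite3

import yaml  # the single third-party dependency (PyYAML)

# Hypothetical front matter; the date is quoted so PyYAML keeps it a string
# rather than parsing it into a datetime.date.
front_matter = yaml.safe_load('puzzle_date: "2024-05-01"\nguest: alice\n')

conn = sqlite3.connect(":memory:")  # sqlite3 ships with CPython
conn.execute("CREATE TABLE puzzles (puzzle_date TEXT, guest TEXT)")
conn.execute("INSERT INTO puzzles VALUES (:puzzle_date, :guest)", front_matter)
row = conn.execute("SELECT puzzle_date, guest FROM puzzles").fetchone()
print(row)
conn.close()
```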
104 changes: 104 additions & 0 deletions scripts/build_puzzle_db.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""
Build a SQLite database from the static site outputs using the sqlite-utils
library from Simon Willison's SQLite tools collection.

This loader avoids parsing Hugo front matter directly; instead it consumes the
structured JSON and TSV artifacts that Hugo already writes under `public/w`. The
resulting database is convenient for downstream analysis while staying aligned
with the generated site content.
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Iterable, Mapping

import sqlite_utils


def iter_guest_json(public_root: Path) -> Iterable[Mapping[str, object]]:
"""Yield guest result JSON objects from `public/w/<date>/<guest>.json` files."""

    if not public_root.exists():
        return  # bare return: a generator's return value is discarded

for date_dir in sorted(public_root.iterdir()):
if not date_dir.is_dir():
continue

puzzle_date = date_dir.name
for json_path in sorted(date_dir.glob("*.json")):
if json_path.name == "index.json":
continue

payload = json.loads(json_path.read_text(encoding="utf-8"))
guest = json_path.stem
yield {
"puzzle_date": puzzle_date,
"guest": guest,
"results": payload.get("results"),
"enriched_results": payload.get("enrichedResults"),
"source_path": str(json_path),
}


def iter_guest_tsv(public_root: Path) -> Iterable[Mapping[str, object]]:
"""Yield raw TSV content from `public/w/<date>/guest-entries.tsv` files."""

    if not public_root.exists():
        return  # bare return: a generator's return value is discarded

for date_dir in sorted(public_root.iterdir()):
if not date_dir.is_dir():
continue

tsv_path = date_dir / "guest-entries.tsv"
if tsv_path.exists():
yield {
"puzzle_date": date_dir.name,
"guest_entries_tsv": tsv_path.read_text(encoding="utf-8"),
"source_path": str(tsv_path),
}


def build_database(public_root: Path, db_path: Path) -> None:
db = sqlite_utils.Database(db_path)

guest_json_rows = list(iter_guest_json(public_root))
if guest_json_rows:
table = db["guest_results"]
table.upsert_all(
guest_json_rows,
pk=("puzzle_date", "guest"),
alter=True,
)
table.transform(types={"results": "JSON", "enriched_results": "JSON"})

tsv_rows = list(iter_guest_tsv(public_root))
if tsv_rows:
db["guest_entries_raw"].upsert_all(
tsv_rows,
pk="puzzle_date",
alter=True,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Build puzzle SQLite from static outputs.")
parser.add_argument(
"--public-root",
default=Path("public") / "w",
type=Path,
help="Root directory containing Hugo-generated per-day folders (default: public/w)",
)
parser.add_argument(
"--db-path",
default=Path("puzzle-data.db"),
type=Path,
help="Path to the SQLite database to write (default: puzzle-data.db)",
)

args = parser.parse_args()
build_database(args.public_root, args.db_path)
1 change: 1 addition & 0 deletions scripts/requirements-sqlite-utils.txt
@@ -0,0 +1 @@
sqlite-utils>=3.37