Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
d0ac473
refactor(gemma4): delete all Gemma 2/3/3n legacy code
cofin Apr 2, 2026
614f6f3
feat(gemma4): add Gemma4Variant enum and config.json-based detection
cofin Apr 2, 2026
7bed754
feat(gemma4): update config defaults and hub for Gemma 4
cofin Apr 2, 2026
9b318f8
feat(gemma4): new chat template system with system prompt support
cofin Apr 2, 2026
dfb965c
chore(gemma4): clean all remaining Gemma 2/3/3n references
cofin Apr 2, 2026
68e567a
test(gemma4): add comprehensive test suite for Chapter 1
cofin Apr 2, 2026
3ee7366
feat(gemma4): replace GCS download backend with HuggingFace HTTP
cofin Apr 2, 2026
ab17c27
feat(gemma4): add bf16-to-f32 conversion in safetensors loader
cofin Apr 2, 2026
71561c8
feat(gemma4): tokenizer resolution and config.json validation
cofin Apr 2, 2026
177ebe5
test(gemma4): add comprehensive test suite for Chapter 2
cofin Apr 2, 2026
8fd00fe
chore(checkpoint): Chapter 2 (gemma4-delivery) complete
cofin Apr 2, 2026
b55c987
feat(gemma4): hybrid sliding-window + full global attention engine
cofin Apr 2, 2026
c6f32fe
chore(checkpoint): Chapter 3 (gemma4-attention) complete
cofin Apr 2, 2026
6b6e57c
feat(gemma4): rewrite FFI boundary for Gemma 4 forward path (Ch4)
cofin Apr 3, 2026
ca11e58
fix(tests): remove deleted Mojo test files from test runner
cofin Apr 3, 2026
577942c
chore(checkpoint): Chapter 4 (gemma4-dense-forward) complete
cofin Apr 3, 2026
8658969
feat(gemma4): add VisionLayerWeights + VisionModelWeights structs (Ch…
cofin Apr 3, 2026
b14cbbc
feat(gemma4): add gelu + average_pool_2d ops for vision encoder (Ch5 …
cofin Apr 3, 2026
daa050f
feat(gemma4): variable-resolution image preprocessing for SigLIP (Ch5…
cofin Apr 3, 2026
cb3abbe
feat(gemma4): video frame extraction via ffmpeg subprocess (Ch5 Task …
cofin Apr 3, 2026
545fb59
feat(gemma4): bidirectional vision attention in layers.mojo (Ch5 Task…
cofin Apr 3, 2026
cec2e6e
feat(gemma4): vision encoder forward pass with GELU MLP and pooling (…
cofin Apr 3, 2026
c00625c
feat(gemma4): vision weight building in core.mojo (Ch5 Task 5.2)
cofin Apr 3, 2026
5569744
feat(gemma4): process_image + step_with_embedding FFI entrypoints (Ch…
cofin Apr 3, 2026
0ec1e31
feat(gemma4): wire vision config from config.json (Ch5 Task 5.10)
cofin Apr 3, 2026
959c163
feat(gemma4): token merging — inject vision embeddings during prefill…
cofin Apr 3, 2026
f8a6d33
fix(gemma4): make process_image concrete in CoreBackend (Ch5 Task 5.11)
cofin Apr 3, 2026
e26ea62
chore(checkpoint): Chapter 5 (gemma4-vision) complete
cofin Apr 3, 2026
379e65d
feat(gemma4): Ch6 PLE/shared-KV/double-wide + Ch7 MoE structs/forward…
cofin Apr 3, 2026
c26cdef
chore(checkpoint): Chapters 5-7 (vision + E2B/E4B + MoE) complete
cofin Apr 3, 2026
bdf0d64
feat(gemma4): mel spectrogram extraction with numpy.fft only (Ch6 Tas…
cofin Apr 3, 2026
c969f0d
feat(gemma4): AudioHydrator + AudioInput in hydration.py (Ch6 Task 6.9)
cofin Apr 3, 2026
66af24c
feat(gemma4): AudioTowerWeights + forward_audio_encoder (Ch6 Task 6.10)
cofin Apr 3, 2026
d7b94c2
feat(gemma4): audio placeholder merging + process_audio FFI (Ch6 Task…
cofin Apr 3, 2026
f3a67eb
chore(checkpoint): Audio tower complete (Ch6 Tasks 6.8-6.11)
cofin Apr 3, 2026
74fc44f
fix(gemma4): wire PLE/MoE/audio in core.mojo — weight building + step…
cofin Apr 3, 2026
a9b2834
chore(gemma4): sync flow state — mark Ch5/Ch6/Ch7 complete
cofin Apr 3, 2026
ff88919
fix: remove .agents/ from git tracking (should be gitignored)
cofin Apr 3, 2026
e1cfc38
fix(gemma4): Mojo 0.26.3 nightly compatibility
cofin Apr 3, 2026
c829a46
fix(gemma4): update mblack and mojo versions to nightly builds
cofin Apr 3, 2026
2b18c09
chore(release): bump version to 0.4.0 and update dependencies
cofin Apr 3, 2026
584664a
feat: Refactor embedding models and update imports
cofin Apr 3, 2026
76ae965
fix: update type hints and improve type safety across multiple modules
cofin Apr 3, 2026
2fd0681
Refactor function definitions from 'fn' to 'def' across multiple files
cofin Apr 3, 2026
7ba1c29
feat: update references from EmbeddingModel to SyncEmbeddingModel acr…
cofin Apr 3, 2026
dbe1a68
feat(gpu): add GPUContext struct with DeviceContext lifecycle
cofin Apr 3, 2026
7a97f3d
feat(gpu): add WeightStage for layer-by-layer weight streaming
cofin Apr 3, 2026
537b6af
feat(gpu): add PersistentBuffers for session-resident embed/lm_head/norm
cofin Apr 3, 2026
aa9bc29
feat(gpu): add GPUKVCache with device-resident K/V buffers
cofin Apr 3, 2026
16ede03
feat(gpu): add GPUScratch for device-resident scratch buffer
cofin Apr 3, 2026
258f818
feat(gpu): wire GPU init path into core.mojo model initialization
cofin Apr 3, 2026
b34cf13
feat(gpu): add GPU cleanup path in free_arena_mojo
cofin Apr 3, 2026
27259bc
feat(gpu): add layer weight upload functions for dense, MoE, and vision
cofin Apr 3, 2026
4416fb1
chore(checkpoint): Chapter 1 gpu-memory-infra complete
cofin Apr 3, 2026
0dd7f07
feat(gpu): add ComputeBackend trait and CPUBackend struct
cofin Apr 3, 2026
ea3b498
feat(gpu): add element-wise GPU kernels (gelu, geglu, rope)
cofin Apr 4, 2026
fed667e
feat(gpu): add reduction GPU kernels (softmax, rms_norm)
cofin Apr 4, 2026
b6da203
feat(gpu): add matmul GPU kernels (vec, batched, int8)
cofin Apr 4, 2026
6b90ac0
feat(gpu): add specialized GPU kernels (average_pool_2d, top_k)
cofin Apr 4, 2026
16a6567
feat(gpu): add GPUBackend struct with kernel launch methods
cofin Apr 4, 2026
3cb9133
chore(checkpoint): Chapter 2 gpu-compute-kernels complete
cofin Apr 4, 2026
286f253
fix: use std.testing import instead of deprecated bare import
cofin Apr 4, 2026
a232fcf
chore(revise): gemma4-attention - modernization for Mojo Nightly 24.x
cofin Apr 4, 2026
396b9f6
chore(mojo): modernize syntax to Mojo 24.x standards
cofin Apr 4, 2026
011f3dc
feat: implement GPU-accelerated attention layers and add correspondin…
cofin Apr 5, 2026
2fbf18c
refactor: implement ComputeBackend trait for GPUBackend to enable uni…
cofin Apr 5, 2026
f17d1be
refactor: decouple persistent GPU buffer management by introducing a …
cofin Apr 5, 2026
32082de
feat: complete GPU forward paths with weight streaming and MoE expert…
cofin Apr 5, 2026
036cff0
feat(gpu): add weight streaming to vision/audio encoders and copy_sta…
cofin Apr 5, 2026
e4529f8
refactor: reformat GPU operation kernel calls and method signatures f…
cofin Apr 5, 2026
b3867ad
chore(checkpoint): Chapter 4 gpu-forward-paths complete
cofin Apr 5, 2026
31a525d
feat(gpu): add GPU dispatch for encoders, embeddings, and cache reset
cofin Apr 6, 2026
17ff5f8
feat: GPU integration for Python telemetry, CI guards, benchmarks, an…
cofin Apr 6, 2026
1aea5d1
chore(checkpoint): Chapter 5 gpu-integration-dispatch complete
cofin Apr 6, 2026
94b16bc
feat(ci): add perf-benchmark workflow for automated throughput tracking
cofin Apr 6, 2026
dce5c77
docs: note automated CI benchmark workflow in performance-baselines
cofin Apr 6, 2026
0b97850
fix: resolve ruff lint warnings and apply mojo format
cofin Apr 6, 2026
4e35e1f
chore: pin python version to 3.12 in makefile installation process
cofin Apr 6, 2026
45721b9
fix(mo): resolve mojo test failures after nightly upgrade
cofin Apr 6, 2026
29212ba
fix(mo): gate GPU branches with comptime if has_accelerator() to fix CI
cofin Apr 7, 2026
e601963
fix(mo): resolve GPU context key mismatches in core.mojo
cofin Apr 7, 2026
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/perf-benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: Performance Benchmark

on:
push:
branches:
- main
pull_request:

jobs:
benchmark:
runs-on: ubuntu-latest
timeout-minutes: 15
steps:
- uses: actions/checkout@v5

- name: Install uv
uses: astral-sh/setup-uv@v7

- name: Set up Python
run: uv python install 3.12

- name: Install dependencies
run: uv sync --all-extras --dev

- name: Build Mojo core
run: make build

- name: Run benchmark (generation + embedding)
run: |
make benchmark
uv run python tools/benchmark.py --mode generation --rounds 20 --max-new-tokens 64 > benchmark_generation.json
uv run python tools/benchmark.py --mode embedding --rounds 20 > benchmark_embedding.json

- name: Upload benchmark artifacts
uses: actions/upload-artifact@v7
with:
name: benchmark-results
path: benchmark_*.json
retention-days: 90
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,4 @@ CLAUDE.md
GEMINI.md
docs/*.json
benchmark-model/
.agents
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ help: ## Display this help
install: clean ## Install everything (Python, Mojo, Beads)
@echo "${INFO} Installing..."
@if ! command -v uv >/dev/null 2>&1; then curl -LsSf https://astral.sh/uv/install.sh | sh; fi
@uv python pin 3.12 >/dev/null 2>&1
@uv venv
@$(MAKE) py-install
@$(MAKE) beads-install
Expand Down
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ asyncio.run(main())
Generate dense vector embeddings natively through Mojo's optimized batched kernel operations. Pass a single string or a list of strings to process them in parallel.

```python
from mogemma import EmbeddingModel
from mogemma import SyncEmbeddingModel

model = EmbeddingModel()
model = SyncEmbeddingModel()
embeddings = model.embed(["Hello, world!", "Mojo runs Gemma inference."])
print(embeddings.shape) # (2, 768)
```
Expand Down Expand Up @@ -126,7 +126,7 @@ Current runtime status:
- `gpu` / `gpu:N` execute via a mathematically verified runtime polyfill

```python
from mogemma import EmbeddingConfig, EmbeddingModel, GenerationConfig, SyncGemmaModel
from mogemma import EmbeddingConfig, SyncEmbeddingModel, GenerationConfig, SyncGemmaModel

generation = SyncGemmaModel(
GenerationConfig(
Expand All @@ -135,14 +135,18 @@ generation = SyncGemmaModel(
)
)

embeddings = EmbeddingModel(
embeddings = SyncEmbeddingModel(
EmbeddingConfig(
model_path="gemma3-1b-it",
device="cpu",
)
)
```

> **GPU Requirements:** GPU acceleration requires Mojo nightly with GPU support,
> compatible GPU drivers (NVIDIA CUDA, AMD ROCm, or Apple Metal), and sufficient VRAM
> for model weights and KV cache.

## Runtime Requirements

MoGemma leverages the latest Mojo features for maximum performance.
Expand Down
2 changes: 1 addition & 1 deletion docs/architecture/backend-architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

## Current Runtime Shape

- Python model classes (`SyncGemmaModel`, `EmbeddingModel`) resolve a backend and cache it at construction time.
- Python model classes (`SyncGemmaModel`, `SyncEmbeddingModel`) resolve a backend and cache it at construction time.
- `cpu` is implemented by `CPUCoreBackend`, which delegates into `mogemma._core` (`init_model`, `step`, `generate_embeddings`).
- Backend IDs are normalized in one place (`resolve_backend_id`) and support:
- `cpu`
Expand Down
4 changes: 2 additions & 2 deletions docs/architecture/device-selection.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ cpu_model = SyncGemmaModel(
```

```python
from mogemma import EmbeddingConfig, EmbeddingModel
from mogemma import EmbeddingConfig, SyncEmbeddingModel

embedding_model = EmbeddingModel(
embedding_model = SyncEmbeddingModel(
EmbeddingConfig(model_path="gemma3-270m-it", device="cpu")
)
```
Expand Down
4 changes: 4 additions & 0 deletions docs/performance-baselines.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ Different CI runners have different baselines. Baselines should be strictly sepa

CI/release validation now executes generation benchmarks in `.github/workflows/ci.yml` through the `check-release` job.

## Automated CI Benchmarks

Standard CPU baselines are captured automatically in GitHub Actions via `.github/workflows/perf-benchmark.yml`. This workflow runs on every push to `main` and on every pull request, executing both generation and embedding benchmarks with synthetic stubs. Results are uploaded as artifacts (retained for 90 days) and visible in the Actions tab. This is purely informational — the workflow does not gate or fail PRs based on variance.

## Baseline artifacts

Latest captured snapshots are stored in this directory:
Expand Down
22 changes: 15 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@ dependencies = [
"numpy",
"typing_extensions",
"safetensors",
"tensorstore",
]
description = "Python/Mojo interface for Google Gemma 3"
description = "Python/Mojo interface for Google Gemma 4"
license = { file = "LICENSE" }
name = "mogemma"
readme = "README.md"
requires-python = ">=3.10"
version = "0.3.0"
version = "0.4.0"

[project.urls]
Issue = "https://github.com/cofin/mogemma/issues/"
Expand Down Expand Up @@ -143,17 +142,26 @@ split-on-trailing-comma = false

[tool.ruff.lint.per-file-ignores]
"src/py/tests/test_*.py" = [
"ANN202",
"ARG001",
"D100",
"D101",
"D102",
"D103",
"D107",
"E402",
"ERA001",
"PLR2004",
"PT011",
"S101",
"S105",
"S106",
"S108",
"S110",
"S603",
"PLR2004",
"SIM105",
"PLC0415",
"SLF001",
"E402",
]
"src/py/mogemma/typing.py" = ["A005"]
"src/mo/tests/test_*.py" = [
Expand Down Expand Up @@ -188,7 +196,7 @@ dev = [
build = ["bump-my-version>=0.31.1", "mojo>=0.26.1a1"]
lint = ["mypy>=1.13.0", "pyright>=1.1.386", "ruff>=0.14.14"]
test = [
"obstore>=0.3.08",
"obstore>=0.4.08",
"pillow>=12.1.1",
"pytest>=9.0.2",
"pytest-cov>=4.0.0",
Expand All @@ -200,7 +208,7 @@ test = [
allow_dirty = true
commit = false
commit_args = "--no-verify"
current_version = "0.3.0"
current_version = "0.4.0"
ignore_missing_files = false
ignore_missing_version = false
message = "chore(release): bump to v{new_version}"
Expand Down
Loading