Changes from all commits (239 commits)
2ed2fcb
Update README.md
tekaratzas Sep 14, 2025
7efecef
Update README.md
tekaratzas Sep 14, 2025
685467e
Merge branch 'main' of github.com:tekaratzas/RustGPT
tekaratzas Sep 14, 2025
74db83f
Added MIT License
tekaratzas Sep 15, 2025
34ecc54
fix(readme): correct repo URL and directory path in Quick Start
hissamshar Sep 15, 2025
29e5ef5
Merge pull request #1 from hissamshar/patch-1
tekaratzas Sep 15, 2025
869d60e
isolate data loading
anshumanpatil Sep 15, 2025
710d086
data loading from json
anshumanpatil Sep 16, 2025
7e876e3
data loading from csv
anshumanpatil Sep 16, 2025
4a506b3
csv files added
anshumanpatil Sep 16, 2025
179950a
Added what this isn't section in readme
tekaratzas Sep 17, 2025
403e642
Merge branch 'main' of github.com:tekaratzas/RustGPT
tekaratzas Sep 17, 2025
75bdb67
Fix spelling mistake
tekaratzas Sep 17, 2025
813a011
code format
anshumanpatil Sep 17, 2025
efa2b04
Added verbose printing of vocab to confirm correct data loading (#1)
hobson Sep 17, 2025
babb0e5
Merge master and PR
anshumanpatil Sep 17, 2025
830ae33
refactoring
anshumanpatil Sep 18, 2025
7c90d1c
refactoring
anshumanpatil Sep 18, 2025
dac242f
Run cargo-fmt
Sep 18, 2025
43fced7
CI to check and run tests
Sep 19, 2025
1d4b973
Merge pull request #7 from mrityunjai01/cargo-fmt
tekaratzas Sep 21, 2025
4e2df4f
fmt conflicts solved
anshumanpatil Sep 21, 2025
362bde4
fmt conflicts solved
anshumanpatil Sep 21, 2025
6e9b67f
logs removed
anshumanpatil Sep 21, 2025
c6c0041
fix: readme badge link
Theo- Sep 21, 2025
d0d68b3
Merge pull request #10 from Theo-/main
tekaratzas Sep 21, 2025
e04156e
cargo fmt
anshumanpatil Sep 22, 2025
1540b5a
remove HF dataset
anshumanpatil Sep 22, 2025
86c528f
chore: housekeeping
ben1009 Sep 22, 2025
64b85a8
Merge branch 'main' into housekeeping
ben1009 Sep 22, 2025
11cf580
chore: housekeeping
ben1009 Sep 22, 2025
96bc6df
Use library constants in binary
Theo- Sep 22, 2025
cca7c4d
fix: use main.rs values
Theo- Sep 22, 2025
e12da67
fix: readme values
Theo- Sep 22, 2025
a974d7d
fix: readme values
Theo- Sep 22, 2025
1e5e10b
fix: wrong constant used
Theo- Sep 22, 2025
a4183dc
Merge pull request #12 from Theo-/codex/refactor-constants-in-src/mai…
tekaratzas Sep 22, 2025
43b70a5
add more gha
ben1009 Sep 23, 2025
838086d
Merge branch 'tekaratzas:main' into housekeeping
ben1009 Sep 23, 2025
9f86c82
adjust cov to 55%
ben1009 Sep 23, 2025
32f300b
add nightly fmt
ben1009 Sep 23, 2025
d3e7ed8
merge master
anshumanpatil Sep 23, 2025
4c84326
merge master
anshumanpatil Sep 23, 2025
27b6cae
PR comments addressed
anshumanpatil Sep 24, 2025
97ba6c9
Merge pull request #2 from anshumanpatil/feature/isolate-data-loading
tekaratzas Sep 25, 2025
4ede81e
New Feature: Count Total Parameters
Sep 25, 2025
684f376
Adjustments to constant variables
Sep 26, 2025
4daf898
Merge branch 'main' into housekeeping
ben1009 Sep 26, 2025
fcc6a4a
chore: fix lints
ben1009 Sep 26, 2025
7ce4b40
Calculate Parameter at Layer Level
Sep 26, 2025
a71083e
Adjust import
Sep 26, 2025
d795cef
Fix build
Sep 29, 2025
370af0c
Run a Cargo fmt
Sep 29, 2025
537f2e9
Merge pull request #14 from arikaufman/feature/count-total-parameters
tekaratzas Sep 29, 2025
4668f6d
Merge branch 'main' into housekeeping
ben1009 Sep 30, 2025
2ec33c7
chore: fix lints
ben1009 Sep 30, 2025
d23da20
Refactor: Eliminate repeated vocabulary processing code by moving to …
Oct 2, 2025
cc45a4e
fix formatting
Oct 3, 2025
1159247
Merge pull request #17 from tomschelsen/dry
tekaratzas Oct 3, 2025
4f407f4
comment out the cov gha
ben1009 Oct 4, 2025
0861377
update
ben1009 Oct 5, 2025
51e79eb
Merge branch 'main' into housekeeping
ben1009 Oct 5, 2025
cf1f7d3
Merge pull request #11 from ben1009/housekeeping
tekaratzas Oct 5, 2025
e977fe5
chore: fix readme workflow badge
ben1009 Oct 9, 2025
772c469
Merge pull request #19 from ben1009/readme
tekaratzas Oct 10, 2025
98aeb4e
Sprint 3.1 Complete: Documentation Foundation + Batch Training + Trac…
ryancinsight Oct 14, 2025
ed58706
Sprint 3.1 Complete: Documentation + Batch Training + Tracing Integra…
ryancinsight Oct 14, 2025
f8f438d
Sprint 3.1: Documentation + Batch Training + Tracing
ryancinsight Oct 14, 2025
69aa104
Add HyperMixer architecture with modular design and comprehensive doc…
ryancinsight Oct 15, 2025
76e480b
Add -i flag to enable interactive prompt after training
ryancinsight Oct 15, 2025
b1423c8
Optimize HyperMixer architecture for performance and training stability
ryancinsight Oct 15, 2025
4b650a1
Fix critical caching bug in HyperMixer token mixing
ryancinsight Oct 15, 2025
957dfd2
Simplify HyperMixer by removing pooling mechanism
ryancinsight Oct 15, 2025
cd8172a
Major fix: Implement proper token mixing with information flow
ryancinsight Oct 15, 2025
d38d0b8
Improve HyperMixer attention mechanism - now correctly identifies 'mo…
ryancinsight Oct 15, 2025
41f6872
Add comprehensive tests for RMSNorm, RotaryEmbedding, Sliding Window …
ryancinsight Oct 16, 2025
9342d12
Implement Contextual Position Encoding (CoPE) in RustGPT
ryancinsight Oct 17, 2025
b752415
Add Mixture-of-Heads (MoH) implementation and documentation
ryancinsight Oct 17, 2025
10167c9
feat: Implement systematic error handling in Layer trait and related …
ryancinsight Oct 17, 2025
552f5be
feat: Enhance gradient stability and loss optimization with architect…
ryancinsight Oct 18, 2025
e45430f
Add Mixture of Experts (MoE) Layer with Adaptive Routing and Expert N…
ryancinsight Oct 18, 2025
69637c1
Implement Tiny Recursive Model (TRM) with adaptive residual scaling a…
ryancinsight Oct 19, 2025
f4eb9bb
Implement soft (differentiable) routing for Fully Adaptive MoH
ryancinsight Oct 19, 2025
a1da167
Implement learned per-token temperature with proper gating derivative
ryancinsight Oct 19, 2025
524ee0d
Implement learned per-token temperature with proper gating derivative…
ryancinsight Oct 19, 2025
7446f7f
Integrate Fully Adaptive MoH with learned temperature into TRM
ryancinsight Oct 19, 2025
81f8420
Add adaptive recursive depth infrastructure for TRM (Phase 2 partial)
ryancinsight Oct 20, 2025
6e25645
Complete adaptive recursive depth implementation for TRM (Phase 2)
ryancinsight Oct 20, 2025
f8482f5
Fix adaptive depth implementation bugs and run validation experiments
ryancinsight Oct 20, 2025
c18b62c
Add comprehensive adaptive recursive depth validation results
ryancinsight Oct 20, 2025
f440076
Improve adaptive depth with better initialization and complexity-awar…
ryancinsight Oct 20, 2025
29f9c26
Improve logging: Move LARS layer details to debug level
ryancinsight Oct 20, 2025
16aab77
Fix complexity statistics to show proper range [min-max]
ryancinsight Oct 20, 2025
198bd42
Remove DynW from logs when using Fully Adaptive MoH
ryancinsight Oct 20, 2025
75af5a0
Fix adaptive depth learning: improve initialization and remove LR sca…
ryancinsight Oct 20, 2025
d19d3f6
Implement confidence-based halting for adaptive depth
ryancinsight Oct 20, 2025
43a8e95
Improve adaptive depth: stronger halting init + higher ponder weight
ryancinsight Oct 20, 2025
99d5130
Revert to fixed depth=5 baseline for validation
ryancinsight Oct 20, 2025
c7e0199
Replace pooling with Gumbel-Softmax attention in TRM halting predictor
ryancinsight Oct 20, 2025
e70e746
refactor: optimize normalization and attention implementations
ryancinsight Oct 21, 2025
ab3e57e
refactor: remove HRM and HyperMixer architectures and related components
ryancinsight Oct 21, 2025
f591e8b
refactor: modernize architecture with DynamicTanhNorm and PolyAttention
ryancinsight Oct 23, 2025
1fbf936
refactor(architecture): remove TRM and integrate CoPE into PolyAttention
ryancinsight Oct 23, 2025
9b71eb5
feat: add sigmoid_poly module and optimize performance
ryancinsight Oct 23, 2025
312acf2
refactor(poly_attention): optimize attention computation with row-str…
ryancinsight Oct 23, 2025
6139a1c
refactor: consolidate imports and improve code formatting
ryancinsight Oct 23, 2025
7e5c32c
perf(poly_attention): optimize windowed attention with parallel proce…
ryancinsight Oct 23, 2025
7dd7272
refactor(model): replace sigmoid_poly with richards activation
ryancinsight Oct 25, 2025
fd7718c
feat(model): add model saving and inference capabilities
ryancinsight Oct 25, 2025
8e2c084
feat: restore advanced modules while reverting to model saving baseline
ryancinsight Nov 4, 2025
dd23346
refactor(data): remove specific Q&A entries from chat training data
ryancinsight Nov 4, 2025
e3082f9
Add model files to .gitignore and remove from tracking
ryancinsight Nov 4, 2025
32502fa
feat: integrate AutoDeco decoder with updated config and logging
ryancinsight Nov 4, 2025
8c3ccfb
perf(poly_attention): optimize forward method with iterator-based pro…
ryancinsight Nov 5, 2025
08d8daf
refactor(richards): streamline RichardsCurve implementation and enhan…
ryancinsight Nov 5, 2025
a03b2a0
feat(layers): add weight_norm method to all layer types
ryancinsight Nov 5, 2025
937450f
refactor(richards): integrate PadeExp for exponential calculations in…
ryancinsight Nov 6, 2025
d71453f
feat(mixtures): integrate Richards curves for adaptive head selection…
ryancinsight Nov 6, 2025
fbf3888
refactor: replace SwiGLU with RichardsGlu in testing and model layers
ryancinsight Nov 6, 2025
bb50c67
feat: integrate Mixture of Experts into LLM architecture
ryancinsight Nov 6, 2025
27dfd51
refactor: share gating logic between MoH and MoE
ryancinsight Nov 7, 2025
564237f
refactor: enhance routing logic and introduce HeadRouter for Mixture-…
ryancinsight Nov 7, 2025
5810223
feat(llm): enhance RichardsGlu training status tracking and logging
ryancinsight Nov 7, 2025
bb23de6
refactor(attention): reorganize PolyAttention module structure and up…
ryancinsight Nov 7, 2025
72b012c
feat(moh): implement SoftTopP head selection strategy for improved ro…
ryancinsight Nov 8, 2025
fc128a1
feat(attention): improve numerical stability in polynomial attention …
ryancinsight Nov 9, 2025
107103a
feat(llm): introduce Tiny Recursive Model (TRM) architecture
ryancinsight Nov 9, 2025
9469809
feat(llm): implement TRM training pipeline with autoencoding and chat…
ryancinsight Nov 10, 2025
eaa91a3
refactor(trm): improve gradient computation and handling in TRM and T…
ryancinsight Nov 10, 2025
1a4cbdd
Consolidate diffusion-TRM stability and telemetry
ryancinsight Nov 15, 2025
73e7223
Add training logs for diffusion models with detailed metrics
ryancinsight Nov 15, 2025
2c493b1
Enhance diffusion model configuration and logging
ryancinsight Nov 15, 2025
a08ce6e
feat(transformer): optimize attention and block performance
ryancinsight Nov 17, 2025
b8e41b9
Refactor RichardsGlu and RichardsNorm for improved readability and pe…
ryancinsight Nov 17, 2025
e85bf79
Add training logs for diffusion model runs with detailed metrics
ryancinsight Nov 18, 2025
112e9ba
fix: correct indexing bug and add debug logging in PolyAttention; ren…
ryancinsight Nov 19, 2025
c08d86f
Add training logs for diffusion model runs on 2025-11-20
ryancinsight Nov 20, 2025
9f9a6a4
Refactor layer handling and add zero_gradients method for improved gr…
ryancinsight Nov 22, 2025
78c7262
Refactor attention and model configuration to use RichardsGate
ryancinsight Nov 23, 2025
1fb9c45
Refactor RichardsCurve for improved performance and clarity
ryancinsight Nov 24, 2025
9848504
Enhance speculative sampling configuration and functionality
ryancinsight Nov 24, 2025
9dd5204
Implement deterministic random number generation module with global s…
ryancinsight Nov 25, 2025
f92c2ee
Refactor code to remove unused variables and improve clarity across m…
ryancinsight Nov 30, 2025
001448b
feat: Optimize Richards module performance and memory usage
ryancinsight Dec 5, 2025
26d162a
feat: Implement position-aware residual scaling based on Theorem 4 wi…
ryancinsight Dec 6, 2025
6f39936
feat: Implement advanced adaptive residuals for diffusion models
ryancinsight Dec 7, 2025
77c3d33
Implement unified adaptive residuals for transformer and diffusion mo…
ryancinsight Dec 10, 2025
2538b15
Refactor Richards curve implementation for improved performance and m…
ryancinsight Dec 12, 2025
e6cd6f6
Refactor Richards Gate and GLU implementations for improved stability…
ryancinsight Dec 13, 2025
cc7a736
Refactor attention mechanisms for numerical stability and performance…
ryancinsight Dec 13, 2025
d75d581
Refactor adaptive residuals implementation for improved clarity and p…
ryancinsight Dec 13, 2025
af64c01
Refactor code structure for improved readability and maintainability
ryancinsight Dec 14, 2025
9de659e
Refactor Mixture-of-Experts and routing logic for improved performanc…
ryancinsight Dec 14, 2025
709319d
Enhance Mixture-of-Experts and attention mechanisms with head activit…
ryancinsight Dec 14, 2025
16a591c
Refactor attention mechanisms to improve performance and reduce memor…
ryancinsight Dec 14, 2025
fcf22a9
Add Real-Gated Linear Recurrent Unit (RG-LRU) and Multi-head RG-LRU l…
ryancinsight Dec 19, 2025
3445cbf
Add transformer layers and speculative sampling implementation
ryancinsight Dec 20, 2025
e298354
Refactor Mamba layer to use Pade approximation for exponential calcul…
ryancinsight Dec 21, 2025
cd0f3c5
Refactor model architecture to support Autoregressive configuration
ryancinsight Dec 21, 2025
fbecae7
feat(transformer): add modular components for attention and feedforwa…
ryancinsight Jan 3, 2026
75a4581
Refactor RichardsCurve and related components for f32 support
ryancinsight Jan 4, 2026
71e9ee3
Add comprehensive documentation for Mamba and RG-LRU architectures; i…
ryancinsight Jan 5, 2026
dacebc5
Implement Pade approximation for exponential function with comprehens…
ryancinsight Jan 6, 2026
06f7994
chore: consolidate persistence implementation into LLM with versioned…
ryancinsight Jan 6, 2026
f11f6e7
feat: Enhance ModelConfig with new residual training parameters
ryancinsight Jan 8, 2026
4a3efdd
Refactor and enhance various components
ryancinsight Jan 12, 2026
7e2f314
feat(model): add Titan memory mechanism and checkpointing support
ryancinsight Jan 13, 2026
6e6a1e2
feat(ssm): add MoH variants for Mamba and Mamba2 layers
ryancinsight Jan 14, 2026
817c9fa
refactor: optimize performance and memory usage across multiple compo…
ryancinsight Jan 15, 2026
9cd0e4b
perf: Reuse layer_inputs vector to reduce allocator pressure
google-labs-jules[bot] Jan 15, 2026
6b21990
Merge pull request #1 from ryancinsight/perf-reuse-layer-inputs-vec-1…
ryancinsight Jan 15, 2026
528d102
Hoist RichardsCurve construction out of closure in llm.rs
google-labs-jules[bot] Jan 15, 2026
ac798f8
Optimize CSV string concatenation in dataset_loader
google-labs-jules[bot] Jan 15, 2026
da65961
Remove redundant tensor cloning in inference loop
google-labs-jules[bot] Jan 15, 2026
c755461
Decouple MoH training from attention gradients
google-labs-jules[bot] Jan 15, 2026
f2023c9
Implement hierarchical forward pass in AdaptiveSoftmax
google-labs-jules[bot] Jan 15, 2026
4d54145
Merge pull request #2 from ryancinsight/optimize-richards-closure-100…
ryancinsight Jan 15, 2026
837c3de
Optimize dataset loading to use streaming parsing
google-labs-jules[bot] Jan 15, 2026
c3a5f2c
Implement Adaptive Softmax forward pass and strategy handling
google-labs-jules[bot] Jan 15, 2026
229166e
Merge pull request #3 from ryancinsight/perf/optimize-csv-concat-9122…
ryancinsight Jan 15, 2026
902e6c4
Merge pull request #5 from ryancinsight/perf/reduce-clones-llm-668193…
ryancinsight Jan 15, 2026
c7825ad
Optimize backprop loop to avoid input cloning
google-labs-jules[bot] Jan 15, 2026
45cc0ce
Merge pull request #6 from ryancinsight/moh-training-decoupling-82370…
ryancinsight Jan 15, 2026
eca120b
Merge pull request #7 from ryancinsight/hierarchical-softmax-impl-329…
ryancinsight Jan 15, 2026
7e9c949
Merge pull request #8 from ryancinsight/dataset-loader-perf-165367342…
ryancinsight Jan 15, 2026
31b198f
Merge branch 'main' into adaptive-softmax-impl-4350403344050072083
ryancinsight Jan 15, 2026
755ff05
Merge pull request #9 from ryancinsight/adaptive-softmax-impl-4350403…
ryancinsight Jan 15, 2026
e2734b1
Merge pull request #10 from ryancinsight/perf/optimize-backprop-cloni…
ryancinsight Jan 15, 2026
26ce3c0
Decouple Richards Curve training and enable independent learning from…
google-labs-jules[bot] Jan 16, 2026
2d278cd
feat: Refactor to deep hierarchical structure and add Titans architec…
google-labs-jules[bot] Jan 16, 2026
7552924
Fix RichardsGate gradient packing and decouple training
google-labs-jules[bot] Jan 16, 2026
309c4bb
Merge pull request #11 from ryancinsight/decouple-richards-training-2…
ryancinsight Jan 16, 2026
9dd8274
Merge pull request #12 from ryancinsight/titans-arch-refactor-7625585…
ryancinsight Jan 16, 2026
4b24fe1
Implement NeuralMemory for Titans architecture
google-labs-jules[bot] Jan 16, 2026
50872a0
Merge pull request #13 from ryancinsight/titans-neural-memory-impl-42…
ryancinsight Jan 16, 2026
5ed49c9
Implement NeuralMemory meta-gradients and TitansMAC architecture
google-labs-jules[bot] Jan 16, 2026
b1b6c3d
Merge pull request #14 from ryancinsight/titans-memory-mac-impl-14553…
ryancinsight Jan 16, 2026
5ed021b
Reintegrate eprop module with feature flag
google-labs-jules[bot] Jan 16, 2026
5424c18
Fix in-place modification of gradient accumulator in NeuralMemory
google-labs-jules[bot] Jan 16, 2026
91677b9
Merge pull request #16 from ryancinsight/fix-neural-memory-gradient-b…
ryancinsight Jan 16, 2026
db7fd23
Reintegrate eprop module without feature flag
google-labs-jules[bot] Jan 16, 2026
00962d7
Merge pull request #15 from ryancinsight/reintegrate-eprop-flag-10392…
ryancinsight Jan 16, 2026
4c9c77e
Enable Titans Memory architecture and implement gradients
google-labs-jules[bot] Jan 16, 2026
9529d74
Fix TitansMAC backward pass input
google-labs-jules[bot] Jan 16, 2026
123383a
Merge pull request #17 from ryancinsight/titans-memory-enable-6527332…
ryancinsight Jan 16, 2026
d403e39
feat(training): add e-prop training pipeline and integrate with main CLI
ryancinsight Jan 16, 2026
ab4e471
Merge branch 'main' of https://github.com/ryancinsight/RustGPT
ryancinsight Jan 16, 2026
281b99a
refactor(titans): improve code formatting and memory gradient calcula…
ryancinsight Jan 16, 2026
8bf8c60
feat: Define TitansMAL struct with NeuralMemory and SlidingWindowAtte…
google-labs-jules[bot] Jan 21, 2026
ed2ae23
feat(eprop): add spiking neuron layers and numeric utilities
ryancinsight Jan 21, 2026
790a923
feat(eprop): implement e-prop adaptor for transformer blocks
ryancinsight Jan 21, 2026
a7ebb10
feat: Implement `TitansMAL` with backpropagation
google-labs-jules[bot] Jan 21, 2026
98339e1
Merge pull request #18 from ryancinsight/feat-titans-mal-struct-31150…
ryancinsight Jan 21, 2026
2635b1f
perf: Reuse gradient accumulation buffers in training loops
google-labs-jules[bot] Jan 21, 2026
d55847c
Merge pull request #20 from ryancinsight/perf/reuse-grad-buffers-1403…
ryancinsight Jan 21, 2026
12a6369
Optimize weight updates with scaled_add and fix build errors
google-labs-jules[bot] Jan 22, 2026
8c8312a
feat(eprop): implement e-prop training support in LLM model
ryancinsight Jan 22, 2026
3211057
Merge branch 'main' of https://github.com/ryancinsight/RustGPT
ryancinsight Jan 22, 2026
a9af6a9
Merge pull request #21 from ryancinsight/perf-optimize-weight-updates…
ryancinsight Jan 22, 2026
9f7780a
Implement TitansMAG architecture with SWA, NeuralMemory, and Gating
google-labs-jules[bot] Jan 22, 2026
1bee005
Merge pull request #22 from ryancinsight/titans-mag-implementation-38…
ryancinsight Jan 22, 2026
f1d73a0
chore: ignore build artifacts in CI target directories
ryancinsight Jan 22, 2026
9d3fdb7
Merge branch 'main' of https://github.com/ryancinsight/RustGPT
ryancinsight Jan 22, 2026
ef96948
style: reformat code and add development guidelines
ryancinsight Jan 22, 2026
faa35e9
perf: reduce log noise by downgrading info to debug and optimize tens…
ryancinsight Jan 22, 2026
aec4b71
feat(models): add titans module and restructure memory system
ryancinsight Jan 23, 2026
1bfad37
chore: ignore target_ci build artifacts
ryancinsight Jan 23, 2026
0850e70
refactor: eliminate unnecessary mutable variables and allocations
ryancinsight Jan 23, 2026
51f9de5
feat(training): add adaptive hyperparameter scheduling via Richards c…
ryancinsight Jan 24, 2026
9524cb5
Implement TitansMAL forward pass and add verification test.
google-labs-jules[bot] Jan 24, 2026
a880b8a
Merge pull request #23 from ryancinsight/titans-mal-forward-167646896…
ryancinsight Jan 24, 2026
a25c64f
feat(richards): add adaptive scalar for MoH threshold modulation
ryancinsight Jan 25, 2026
a24dda2
Replace hardcoded config constants
ryancinsight Jan 25, 2026
1f3cbe4
perf: optimize memory usage and inference speed across modules
ryancinsight Jan 26, 2026
6797a6d
Optimize RichardsCurve::update_scaling_from_max_abs to avoid expensiv…
google-labs-jules[bot] Jan 26, 2026
9e9475b
Merge pull request #24 from ryancinsight/richards-curve-optimization-…
ryancinsight Jan 26, 2026
5531d11
Optimize LLM generation loop with KV caching (O(N^2) -> O(N))
google-labs-jules[bot] Jan 26, 2026
2f07c31
Merge pull request #25 from ryancinsight/perf/generation-loop-optimiz…
ryancinsight Jan 26, 2026
34ffc48
Optimize RichardsGlu::compute_gradients by removing unnecessary clones
google-labs-jules[bot] Jan 26, 2026
3 changes: 3 additions & 0 deletions .config/nextest.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[profile.default]
slow-timeout = { period = "60s", terminate-after = 3 }
global-timeout = "20m"
89 changes: 89 additions & 0 deletions .gemini/richards_gradient_derivation.md
@@ -0,0 +1,89 @@
# Extended Richards Curve Gradient Derivations

## Forward Pass
```
σ = [1 + β * exp(-k(input-m))]^(-1/ν)
```

Where (for defaults):
- β = 1.0
- input = x (after all transformations with defaults)

## Gradients

### ∂σ/∂ν (nu gradient)
Using logarithmic differentiation:
```
ln(σ) = (-1/ν) * ln(base)
(1/σ) * ∂σ/∂ν = (1/ν²) * ln(base)
∂σ/∂ν = σ * ln(base) / ν²
```

### ∂σ/∂k (k gradient)
Chain rule through base and exponent:
```
∂σ/∂base = (-1/ν) * base^(-1/ν - 1) = (-1/ν) * σ / base
∂base/∂exponent = β * exp(exponent) = β * exp_term
∂exponent/∂k = -(input - m)

∂σ/∂k = (∂σ/∂base) * (∂base/∂exponent) * (∂exponent/∂k)
= [(-1/ν) * σ / base] * [β * exp_term] * [-(input - m)]
= (1/ν) * σ * β * exp_term * (input - m) / base
```

For β=1: exp_term/base = 1 - σ^ν (approximately 1 - σ when ν is close to 1)

More accurately, for Richards curve:
```
∂σ/∂k = (1/ν) * σ * exp_term * (input - m) / base
```

### ∂σ/∂m (m gradient)
```
∂exponent/∂m = k

∂σ/∂m = (∂σ/∂base) * (∂base/∂exponent) * (∂exponent/∂m)
= [(-1/ν) * σ / base] * [β * exp_term] * [k]
= (-k/ν) * σ * β * exp_term / base
```

### ∂σ/∂β (beta gradient)
```
∂base/∂β = exp(exponent) = exp_term

∂σ/∂β = (∂σ/∂base) * (∂base/∂β)
= [(-1/ν) * σ / base] * exp_term
= (-1/ν) * σ * exp_term / base
```

### ∂σ/∂temp (temperature gradient)
Chain through input:
```
∂input/∂temp = -input_scale * scale * adaptive_normalized / temp²
= -input_scale * scale * temp_scaled / temp

∂σ/∂input = (∂σ/∂base) * (∂base/∂exponent) * (∂exponent/∂input)
= [(-1/ν) * σ / base] * [β * exp_term] * [-k]
= (k/ν) * σ * β * exp_term / base

∂σ/∂temp = (∂σ/∂input) * (∂input/∂temp)
```

## Final Formulas (β=1 default)

```rust
// Nu gradient
d_sigma_d_nu = sigma * base.ln() / (nu * nu)

// K gradient
d_sigma_d_k = (1.0 / nu) * sigma * exp_term * (input - m) / base

// M gradient
d_sigma_d_m = (-k / nu) * sigma * exp_term / base

// Beta gradient (for β learnable)
d_sigma_d_beta = (-1.0 / nu) * sigma * exp_term / base

// Temperature gradient
d_sigma_d_temp = (k / nu) * sigma * exp_term / base * (-temp_scaled / temp)
```
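The closed forms above can be spot-checked with central finite differences. A minimal standalone sketch (not the crate's actual `RichardsCurve` code; defaults β=1, no input scaling):

```rust
// Richards curve: σ = (1 + β·exp(-k(x-m)))^(-1/ν)
fn sigma(x: f64, k: f64, m: f64, nu: f64, beta: f64) -> f64 {
    let exp_term = (-k * (x - m)).exp();
    (1.0 + beta * exp_term).powf(-1.0 / nu)
}

// Analytic gradients (∂σ/∂ν, ∂σ/∂k, ∂σ/∂m) from the formulas above.
fn analytic_grads(x: f64, k: f64, m: f64, nu: f64, beta: f64) -> (f64, f64, f64) {
    let exp_term = (-k * (x - m)).exp();
    let base = 1.0 + beta * exp_term;
    let s = base.powf(-1.0 / nu);
    let d_nu = s * base.ln() / (nu * nu);
    let d_k = (1.0 / nu) * s * beta * exp_term * (x - m) / base;
    let d_m = (-k / nu) * s * beta * exp_term / base;
    (d_nu, d_k, d_m)
}

fn main() {
    let (x, k, m, nu, beta) = (0.7, 2.0, 0.1, 1.5, 1.0);
    let h = 1e-6;
    let (d_nu, d_k, d_m) = analytic_grads(x, k, m, nu, beta);
    // Central differences should agree with the closed forms to ~1e-7.
    let fd_nu = (sigma(x, k, m, nu + h, beta) - sigma(x, k, m, nu - h, beta)) / (2.0 * h);
    let fd_k = (sigma(x, k + h, m, nu, beta) - sigma(x, k - h, m, nu, beta)) / (2.0 * h);
    let fd_m = (sigma(x, k, m + h, nu, beta) - sigma(x, k, m - h, nu, beta)) / (2.0 * h);
    assert!((d_nu - fd_nu).abs() < 1e-6);
    assert!((d_k - fd_k).abs() < 1e-6);
    assert!((d_m - fd_m).abs() < 1e-6);
    println!("analytic gradients match finite differences");
}
```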
22 changes: 22 additions & 0 deletions .github/codecov.yml
@@ -0,0 +1,22 @@
# # ref: https://docs.codecov.com/docs/codecovyml-reference
# comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174
# coverage:
# # Hold ourselves to a high bar
# range: 55..100
# round: down
# precision: 1
# status:
# # ref: https://docs.codecov.com/docs/commit-status
# project:
# default:
# # Avoid false negatives
# threshold: 1%

# # Test files aren't important for coverage
# ignore:
# - "tests"

# # Make comments less noisy
# comment:
# layout: "files"
# require_changes: yes
73 changes: 73 additions & 0 deletions .github/workflows/check.yml
@@ -0,0 +1,73 @@
permissions:
  contents: read
on:
  push:
    branches: [main, master]
  pull_request:
  merge_group:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

env:
  RUST_TOOLCHAIN: stable

name: Check
jobs:
  fmt:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
    name: fmt
    permissions:
      # Give the default GITHUB_TOKEN write permission to commit and push the
      # added or changed files to the repository.
      contents: write
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Install rust
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: nightly #${{ env.RUST_TOOLCHAIN }}
          components: rustfmt
      - run: cargo fmt --check

  clippy:
    runs-on: ubuntu-latest
    name: clippy
    permissions:
      contents: read
      checks: write
    strategy:
      fail-fast: false
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Install ${{ env.RUST_TOOLCHAIN }}
        uses: dtolnay/rust-toolchain@master # master
        with:
          toolchain: ${{ env.RUST_TOOLCHAIN }}
          components: clippy
      - name: Rust Cache
        uses: Swatinem/rust-cache@v2
      - run: cargo clippy --workspace --all-features --all-targets -- -D warnings

  typos:
    runs-on: ubuntu-latest
    name: typos
    permissions:
      contents: read
    strategy:
      fail-fast: false
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Check spelling
        uses: crate-ci/typos@master

68 changes: 68 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,68 @@
permissions:
  contents: read
on:
  push:
    branches: [main, master]
  pull_request:
  merge_group:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

env:
  RUST_TOOLCHAIN: stable

name: Test
jobs:
  required:
    runs-on: ubuntu-latest
    name: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Install ${{ env.RUST_TOOLCHAIN }}
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: ${{ env.RUST_TOOLCHAIN }}
      - name: cargo generate-lockfile
        if: hashFiles('Cargo.lock') == ''
        run: cargo generate-lockfile
      # https://twitter.com/jonhoo/status/1571290371124260865
      - name: Rust Cache
        uses: Swatinem/rust-cache@v2
      - name: Install nextest
        uses: taiki-e/install-action@nextest
      - name: cargo nextest --locked
        run: cargo nextest run --locked --workspace --all-features --all-targets

  # comment out coverage job for now, https://github.com/tekaratzas/RustGPT/pull/11#issuecomment-3361854174
  # coverage:
  #   runs-on: ubuntu-latest
  #   name: coverage
  #   steps:
  #     - uses: actions/checkout@v4
  #       with:
  #         submodules: true
  #     - name: Install rust
  #       uses: dtolnay/rust-toolchain@master
  #       with:
  #         toolchain: ${{ env.RUST_TOOLCHAIN }}
  #         components: llvm-tools-preview
  #     - name: cargo install cargo-llvm-cov
  #       uses: taiki-e/install-action@cargo-llvm-cov
  #     - name: cargo generate-lockfile
  #       if: hashFiles('Cargo.lock') == ''
  #       run: cargo generate-lockfile
  #     - name: Rust Cache
  #       uses: Swatinem/rust-cache@v2
  #     - name: Install nextest
  #       uses: taiki-e/install-action@nextest
  #     - name: cargo llvm-cov
  #       run: cargo llvm-cov nextest --locked --workspace --all-features --all-targets --lcov --output-path lcov.info
  #     - name: Upload to codecov.io
  #       uses: codecov/codecov-action@v5
  #       with:
  #         fail_ci_if_error: true
  #         token: ${{ secrets.CODECOV_TOKEN }} # required
14 changes: 14 additions & 0 deletions .gitignore
@@ -1 +1,15 @@
/target
/target_ci/

# Model files
models/*.bin
models/*.ckpt
models/*.pth
models/*.h5
models/*.pb
models/*.onnx
*.bin
*.csv

# Local logs / run artifacts
/logs/
40 changes: 40 additions & 0 deletions .trae/documents/Add Denoising Cross-Entropy for Diffusion.md
@@ -0,0 +1,40 @@
## Goals
- Implement a denoising cross-entropy (DCE) training variant for diffusion to match CE-style logs/metrics, enabling apples-to-apples comparison with Transformer/TRM.
- Combine denoising MSE with CE over the output projection of recovered x0 (configurable weights), or run CE-only.

## CLI Additions
- `--diffusion_ce` (bool): use DCE pipeline for diffusion pretraining.
- `--diffusion_ce_weight <f32>` (default: 0.5): CE loss weight.
- `--diffusion_mse_weight <f32>` (default: 0.5): MSE loss weight. If `--diffusion_ce` and `diffusion_mse_weight=0`, runs CE-only.

## Training Pipeline (LLM::train_diffusion_ce)
1) Tokenize batch sequences; slice `input_ids = seq[..len-1]` and `target_ids = seq[1..]`.
2) Embed: `x0 = TokenEmbeddings.forward([input_ids])` → shape `[seq_len, embed_dim]`.
3) Sample noise ε and timestep t; compute `x_t = NoiseScheduler.q_sample(x0, t, ε)`.
4) Predict noise: forward `x_t` through all DiffusionBlocks with `set_timestep(t)` to get `ε_θ`.
5) Recover x0_hat: `x0_hat = (x_t - sqrt(1-ᾱ_t) * ε_θ) / sqrt(ᾱ_t)` using scheduler’s `sqrt_alpha_cumprod(t)` and `sqrt_one_minus_alpha_cumprod(t)`.
6) Logits: pass `x0_hat` through final `DynamicTanhNorm` (if present) and `OutputProjection` to get `[seq_len, vocab_size]`.
7) Loss:
- MSE: `mse = mean((ε_θ - ε)^2)`.
- CE: standard token-level CE on logits vs `target_ids`.
- Total: `loss = mse_weight*mse + ce_weight*ce`.
8) Gradients:
- CE grads: `dL/dlogits` → OutputProjection.backward → `grad_hidden` (shape `[seq_len, embed_dim]`), then through final norm (if present) to get `grad_x0_hat`.
- Chain rule to predicted noise: `grad_eps = grad_x0_hat * (-sqrt(1-ᾱ_t)/sqrt(ᾱ_t))` (broadcast scalar).
- MSE grads: `grad_eps += 2*(ε_θ - ε)/N`.
- Backprop `grad_eps` through DiffusionBlocks with `compute_gradients(input=x_t, grads=grad_eps)` and `apply_gradients`.
- Optionally backprop into TokenEmbeddings via `grad_x0` if desired; default: leave embeddings updated via CE path only when explicitly enabled (keep simple: no embedding update for DCE unless requested; can add `--diffusion_update_embeddings` flag).
9) Logging: print per-epoch `loss`, `mse`, `ce`, and `grad_norm` formatted like `train_with_warmup` for consistency.
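Steps 3, 5, and 7 above can be sketched on flat buffers. This is illustrative only: the real pipeline works on `[seq_len, embed_dim]` tensors and uses the scheduler's own `q_sample` and `sqrt_alpha_cumprod` accessors; the scalar names here are assumptions.

```rust
// x_t = sqrt(ᾱ_t) * x0 + sqrt(1-ᾱ_t) * ε   (forward noising, step 3)
fn q_sample(x0: &[f32], eps: &[f32], sa: f32, s1m: f32) -> Vec<f32> {
    x0.iter().zip(eps).map(|(x, e)| sa * x + s1m * e).collect()
}

// x0_hat = (x_t - sqrt(1-ᾱ_t) * ε_θ) / sqrt(ᾱ_t)   (recovery, step 5)
fn recover_x0(x_t: &[f32], eps_pred: &[f32], sa: f32, s1m: f32) -> Vec<f32> {
    x_t.iter().zip(eps_pred).map(|(x, e)| (x - s1m * e) / sa).collect()
}

fn mse(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(x, y)| (x - y).powi(2)).sum::<f32>() / a.len() as f32
}

fn main() {
    let x0 = vec![0.5_f32, -1.0, 2.0];
    let eps = vec![0.1_f32, 0.2, -0.3];
    let (sa, s1m) = (0.8_f32, 0.6_f32); // sa² + s1m² = 1 for a valid schedule
    let x_t = q_sample(&x0, &eps, sa, s1m);
    // With a perfect noise prediction, recovery is exact: the round-trip
    // unit test described under "Tests" below.
    let x0_hat = recover_x0(&x_t, &eps, sa, s1m);
    assert!(mse(&x0, &x0_hat) < 1e-9);
    // Weighted total from step 7 (ce is a placeholder value here).
    let (mse_weight, ce_weight, ce) = (0.5_f32, 0.5_f32, 1.234_f32);
    let loss = mse_weight * mse(&eps, &eps) + ce_weight * ce;
    assert!(loss > 0.0);
}
```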

## Integration
- In `main.rs`, if `--diffusion_ce` present during diffusion pretraining, call `train_diffusion_ce(pretraining_examples, epochs, lr, batch_size, ce_weight, mse_weight)` instead of `train_diffusion`.
- Instruction tuning stays with CE (`train_with_warmup`).

## Tests
- Unit: verify `x0_hat` recovery formula correctness by round-trip (`q_sample` then recover) on synthetic data.
- Integration: small dataset run prints CE and MSE (when both enabled), and losses decrease.
- Gradient shapes: ensure DiffusionBlock `compute_gradients` receives correct shapes and param gradients non-empty.

## Notes
- Keeps backward compatibility; no changes to existing CE training for Transformer/TRM.
- Defaults provide balanced MSE+CE; adjust via flags for experiments.
@@ -0,0 +1,25 @@
## Goal
- Restore explicit TRM architecture selection in CLI.
- Ensure three architectures are available: Transformer, Diffusion, TRM.
- When both `--trm` and `--diffusion` are set, select Diffusion (TRM can use either; diffusion takes precedence when requested).
- Keep existing training flows intact; TRM uses standard training, Diffusion uses denoising training.

## Changes
- `src/main.rs`:
- Add `--trm` flag in `Args`.
- Set `architecture` as:
- If `--diffusion`: `ArchitectureType::Diffusion`
- Else if `--trm`: `ArchitectureType::TRM`
- Else: `ArchitectureType::Transformer`
- Logging for TRM stages analogous to others.
- No changes to `model_builder.rs` (already supports TRM).
- No changes to `LLM` training logic; TRM paths are handled by `train_with_warmup` which toggles TRM training mode internally.
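The precedence rule above reduces to a small selection function. A minimal sketch (the real enum and flag parsing live in `src/main.rs` and may differ in detail):

```rust
// Mirrors the doc's ArchitectureType names; the allow silences the
// naming-convention warning for the all-caps TRM variant.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum ArchitectureType {
    Transformer,
    Diffusion,
    TRM,
}

fn select_architecture(diffusion: bool, trm: bool) -> ArchitectureType {
    if diffusion {
        ArchitectureType::Diffusion // diffusion wins even when --trm is also set
    } else if trm {
        ArchitectureType::TRM
    } else {
        ArchitectureType::Transformer
    }
}

fn main() {
    assert_eq!(select_architecture(false, false), ArchitectureType::Transformer);
    assert_eq!(select_architecture(false, true), ArchitectureType::TRM);
    assert_eq!(select_architecture(true, true), ArchitectureType::Diffusion);
    println!("precedence: Diffusion > TRM > Transformer");
}
```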

## Verification
- Build and run:
- `cargo run --release --bin main -- --trm` → TRM architecture logs and training/tuning.
- `cargo run --release --bin main -- --trm --diffusion` → Diffusion selected (precedence), denoising training.
- `cargo run --release --bin main` → Transformer.

## Scope
- Minimal CLI and selection updates only; no structural refactors required for TRM support.