From aa82abf37457c0c6dfe7a9ef2a89aefcc5c050a2 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Tue, 19 Aug 2025 15:31:37 +0200 Subject: [PATCH 01/24] Add accelerated demeaning with Irons-Tuck algorithm Implement fixest's Irons-Tuck-Grand acceleration algorithm for high-dimensional fixed effects demeaning in Rust. This is a coefficient-space iterative method that provides significant speedups over naive alternating projections. Key features: - Irons-Tuck acceleration with grand acceleration steps - Support for 2-FE and 3+ FE cases with optimized projectors - Algorithm aligned with R fixest implementation - Auto-vectorized loops (no explicit SIMD dependencies) Reference: https://github.com/lrberge/fixest (CCC_demean.cpp) --- .gitignore | 1 + benchmarks/bench_demean_r.R | 71 ++ benchmarks/bench_native_comparison.py | 216 +++++ benchmarks/demean_benchmark.py | 456 ++++++++++ docs/specs/demean_accelerated_optimization.md | 370 +++++++++ pyfixest/core/_core_impl.pyi | 7 + pyfixest/core/demean_accelerated.py | 73 ++ src/demean.rs | 27 +- src/demean_accelerated/coef_space.rs | 785 ++++++++++++++++++ src/demean_accelerated/mod.rs | 127 +++ src/lib.rs | 2 + 11 files changed, 2133 insertions(+), 2 deletions(-) create mode 100644 benchmarks/bench_demean_r.R create mode 100644 benchmarks/bench_native_comparison.py create mode 100644 benchmarks/demean_benchmark.py create mode 100644 docs/specs/demean_accelerated_optimization.md create mode 100644 pyfixest/core/demean_accelerated.py create mode 100644 src/demean_accelerated/coef_space.rs create mode 100644 src/demean_accelerated/mod.rs diff --git a/.gitignore b/.gitignore index f5378e980..899602ad4 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ coverage.xml # pixi environments .pixi/* !.pixi/config.toml +benchmarks/results/ diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R new file mode 100644 index 000000000..fb9a55620 --- /dev/null +++ b/benchmarks/bench_demean_r.R @@ -0,0 +1,71 @@ 
+#!/usr/bin/env Rscript +# Benchmark fixest demeaning directly in R +# Usage: Rscript bench_demean_r.R [n_obs] [dgp_type] [n_fe] + +library(fixest) + +args <- commandArgs(trailingOnly = TRUE) +n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L +dgp_type <- if (length(args) >= 2) args[2] else "difficult" +n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L + +# Set single thread for fair comparison +setFixest_nthreads(1) + +# Generate data matching Python benchmark DGP +set.seed(42) +n_year <- 10L +n_indiv_per_firm <- 23L +n_indiv <- max(1L, round(n_obs / n_year)) +n_firm <- max(1L, round(n_indiv / n_indiv_per_firm)) + +indiv_id <- rep(1:n_indiv, each = n_year)[1:n_obs] +year <- rep(1:n_year, times = n_indiv)[1:n_obs] + +if (dgp_type == "simple") { + firm_id <- sample(1:n_firm, n_obs, replace = TRUE) +} else { + # difficult: sequential assignment + firm_id <- rep(1:n_firm, length.out = n_obs) +} + +# Generate outcome +x1 <- rnorm(n_obs) +firm_fe <- rnorm(n_firm)[firm_id] +unit_fe <- rnorm(n_indiv)[indiv_id] +year_fe <- rnorm(n_year)[year] +y <- x1 + firm_fe + unit_fe + year_fe + rnorm(n_obs) + +df <- data.frame( + y = y, + x1 = x1, + indiv_id = indiv_id, + year = year, + firm_id = firm_id +) + +# Build formula based on n_fe +if (n_fe == 2) { + fml <- y ~ 1 | indiv_id + year +} else { + fml <- y ~ 1 | indiv_id + year + firm_id +} + +# Warm up +invisible(feols(fml, data = df)) + +# Benchmark +n_runs <- 5L +times <- numeric(n_runs) + +for (i in 1:n_runs) { + start <- Sys.time() + fit <- feols(fml, data = df) + end <- Sys.time() + times[i] <- as.numeric(end - start, units = "secs") * 1000 # ms +} + +cat(sprintf("fixest (R native) - n=%d, type=%s, %dFE\n", n_obs, dgp_type, n_fe)) +cat(sprintf(" Times (ms): %s\n", paste(round(times, 2), collapse = ", "))) +cat(sprintf(" Median: %.2f ms\n", median(times))) +cat(sprintf(" Min: %.2f ms\n", min(times))) diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py new file 
mode 100644 index 000000000..5782a1e65 --- /dev/null +++ b/benchmarks/bench_native_comparison.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +""" +Benchmark comparing pyfixest demean vs native fixest (via R subprocess). + +Runs fixest directly in R to avoid rpy2 overhead, then compares with pyfixest. +""" + +from __future__ import annotations + +import json +import subprocess +import time +from pathlib import Path +from statistics import median + +import numpy as np + + +def generate_dgp( + n: int, + dgp_type: str = "simple", + n_years: int = 10, + n_indiv_per_firm: int = 23, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Generate test data matching fixest benchmark DGP.""" + np.random.seed(42) + + n_indiv = max(1, round(n / n_years)) + n_firm = max(1, round(n_indiv / n_indiv_per_firm)) + + indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] + year = np.tile(np.arange(n_years), n_indiv)[:n] + + if dgp_type == "simple": + firm_id = np.random.randint(0, n_firm, size=n) + else: # difficult + firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] + + x1 = np.random.randn(n) + firm_fe = np.random.randn(n_firm)[firm_id] + unit_fe = np.random.randn(n_indiv)[indiv_id] + year_fe = np.random.randn(n_years)[year] + y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) + + x = np.column_stack([y, x1]) + weights = np.ones(n) + + return x, indiv_id, year, firm_id, weights + + +def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> dict: + """Run fixest benchmark in R subprocess.""" + r_script = Path(__file__).parent / "bench_demean_r.R" + + try: + result = subprocess.run( + ["Rscript", str(r_script), str(n_obs), dgp_type, str(n_fe)], + capture_output=True, + text=True, + timeout=300, + ) + + if result.returncode != 0: + return {"error": result.stderr, "times": [], "median": float("inf")} + + # Parse output + lines = result.stdout.strip().split("\n") + median_ms = None + for line in lines: + if "Median:" in line: + 
median_ms = float(line.split(":")[1].strip().replace(" ms", "")) + + return { + "median": median_ms if median_ms else float("inf"), + "output": result.stdout, + } + except subprocess.TimeoutExpired: + return {"error": "timeout", "median": float("inf")} + except FileNotFoundError: + return {"error": "R not found", "median": float("inf")} + + +def run_rust_benchmark( + x: np.ndarray, + flist: np.ndarray, + weights: np.ndarray, + n_runs: int = 5, + use_simple: bool = False, +) -> dict: + """Run pyfixest Rust demean benchmark.""" + import os + + if use_simple: + os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" + elif "PYFIXEST_DEMEAN_SIMPLE" in os.environ: + del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + + try: + from pyfixest.core.demean import demean + + times = [] + for _ in range(n_runs): + x_copy = x.copy() + start = time.perf_counter() + _result, converged = demean(x_copy, flist, weights) + elapsed = (time.perf_counter() - start) * 1000 # ms + times.append(elapsed) + + return { + "median": median(times), + "times": times, + "converged": converged, + } + except Exception as e: + return {"error": str(e), "median": float("inf")} + finally: + if "PYFIXEST_DEMEAN_SIMPLE" in os.environ: + del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + + +def main(): + """Run benchmark comparing pyfixest demean vs native fixest.""" + configs = [ + (10_000, "simple", 2), + (10_000, "difficult", 2), + (10_000, "simple", 3), + (10_000, "difficult", 3), + (100_000, "simple", 2), + (100_000, "difficult", 2), + (100_000, "simple", 3), + (100_000, "difficult", 3), + ] + + results = [] + + print("=" * 70) + print("PyFixest vs Fixest Native Benchmark") + print("=" * 70) + + for n_obs, dgp_type, n_fe in configs: + print(f"\nConfig: n={n_obs:,}, type={dgp_type}, fe={n_fe}") + print("-" * 50) + + # Generate data + x, indiv_id, year, firm_id, weights = generate_dgp(n_obs, dgp_type) + + if n_fe == 2: + flist = np.column_stack([indiv_id, year]).astype(np.uint64) + else: + flist = np.column_stack([indiv_id, year, 
firm_id]).astype(np.uint64) + + # Run R benchmark + r_result = run_r_benchmark(n_obs, dgp_type, n_fe) + r_time = r_result.get("median", float("inf")) + print(f" fixest (R native): {r_time:8.2f} ms") + + # Run Rust accelerated benchmark + rust_result = run_rust_benchmark(x, flist, weights) + rust_time = rust_result.get("median", float("inf")) + + if r_time > 0 and rust_time < float("inf"): + ratio = rust_time / r_time + print(f" pyfixest (Rust): {rust_time:8.2f} ms ({ratio:.2f}x)") + else: + print(f" pyfixest (Rust): {rust_time:8.2f} ms") + + # Run Rust simple benchmark + rust_simple = run_rust_benchmark(x, flist, weights, use_simple=True) + rust_simple_time = rust_simple.get("median", float("inf")) + + if r_time > 0 and rust_simple_time < float("inf"): + ratio = rust_simple_time / r_time + print(f" pyfixest (simple): {rust_simple_time:8.2f} ms ({ratio:.2f}x)") + else: + print(f" pyfixest (simple): {rust_simple_time:8.2f} ms") + + results.append( + { + "n_obs": n_obs, + "dgp_type": dgp_type, + "n_fe": n_fe, + "fixest_r_ms": r_time, + "pyfixest_rust_ms": rust_time, + "pyfixest_simple_ms": rust_simple_time, + } + ) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY (pyfixest accelerated vs fixest)") + print("=" * 70) + + print(f"{'Config':<35} {'fixest':>10} {'pyfixest':>10} {'ratio':>8}") + print("-" * 65) + + for r in results: + config = f"n={r['n_obs']:,} {r['dgp_type']:9} {r['n_fe']}FE" + fixest = r["fixest_r_ms"] + pyfixest = r["pyfixest_rust_ms"] + + if fixest > 0 and fixest < float("inf") and pyfixest < float("inf"): + ratio = pyfixest / fixest + print(f"{config:<35} {fixest:>8.1f}ms {pyfixest:>8.1f}ms {ratio:>7.2f}x") + else: + print(f"{config:<35} {'N/A':>10} {'N/A':>10}") + + # Save results + output_path = Path(__file__).parent / "results" / "native_comparison.json" + output_path.parent.mkdir(exist_ok=True) + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {output_path}") + + +if __name__ == "__main__": + 
main() diff --git a/benchmarks/demean_benchmark.py b/benchmarks/demean_benchmark.py new file mode 100644 index 000000000..6a587b75f --- /dev/null +++ b/benchmarks/demean_benchmark.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python3 +""" +Benchmark script for comparing demeaning implementations. + +Oriented on fixest_benchmarks/bench_ols.R but focused on demeaning only +and optimized for fast iteration. + +Usage: + python benchmarks/demean_benchmark.py # Fast mode (~30s) + python benchmarks/demean_benchmark.py --full # Full mode (~5min) + python benchmarks/demean_benchmark.py --save # Save results to JSON +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +from dataclasses import dataclass +from pathlib import Path +from statistics import median +from typing import Callable + +import numpy as np + + +@dataclass +class BenchmarkConfig: + """Configuration for a single benchmark run.""" + + n_obs: int + dgp_type: str # "simple" or "difficult" + n_fe: int + n_iters: int + + +@dataclass +class BenchmarkResult: + """Result of a benchmark run.""" + + config: BenchmarkConfig + backend: str + times: list[float] + median_time: float + available: bool + error: str | None = None + + +def generate_dgp( + n: int, + dgp_type: str = "simple", + n_years: int = 10, + n_indiv_per_firm: int = 23, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Generate data matching fixest_benchmarks DGP. 
+ + Parameters + ---------- + n : int + Number of observations + dgp_type : str + "simple" (random firm assignment) or "difficult" (sequential) + n_years : int + Number of years + n_indiv_per_firm : int + Average individuals per firm + + Returns + ------- + x : np.ndarray + Feature matrix (n, 1) + flist : np.ndarray + Fixed effect IDs (n, 2 or 3) - [indiv_id, year] or [indiv_id, year, firm_id] + weights : np.ndarray + Sample weights (n,) + """ + n_indiv = max(1, round(n / n_years)) + n_firm = max(1, round(n_indiv / n_indiv_per_firm)) + + # Create FE IDs + indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] + year = np.tile(np.arange(n_years), n_indiv)[:n] + + if dgp_type == "simple": + # Random firm assignment - easier convergence + firm_id = np.random.randint(0, n_firm, size=n) + elif dgp_type == "difficult": + # Sequential firm assignment - harder convergence (messy data) + firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] + else: + raise ValueError(f"Unknown dgp_type: {dgp_type}") + + # Generate features + x1 = np.random.randn(n) + + # Generate y with FE structure + firm_fe = np.random.randn(n_firm)[firm_id] + unit_fe = np.random.randn(n_indiv)[indiv_id] + year_fe = np.random.randn(n_years)[year] + y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) + + # Stack into matrices + x = np.column_stack([y, x1]) # Demean both y and x1 + weights = np.ones(n) + + return x, indiv_id, year, firm_id, weights + + +def get_demean_backends() -> dict[str, Callable | None]: + """Get available demeaning backends with graceful fallbacks.""" + backends: dict[str, Callable | None] = {} + + # Rust accelerated (default) + try: + from pyfixest.core.demean import demean as demean_rust + + backends["rust-accelerated"] = demean_rust + except ImportError: + backends["rust-accelerated"] = None + + # Rust simple (via env var) + def demean_rust_simple(x, flist, weights, tol=1e-8, maxiter=100_000): + os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" + try: + from 
pyfixest.core.demean import demean as demean_rust + + return demean_rust(x, flist, weights, tol, maxiter) + finally: + del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + + backends["rust-simple"] = ( + demean_rust_simple if backends["rust-accelerated"] else None + ) + + # Numba + try: + from pyfixest.estimation.demean_ import demean as demean_numba + + backends["numba"] = demean_numba + except ImportError: + backends["numba"] = None + + # CuPy 32-bit + try: + from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32 + + backends["cupy32"] = demean_cupy32 + except ImportError: + backends["cupy32"] = None + + # CuPy 64-bit + try: + from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy64 + + backends["cupy64"] = demean_cupy64 + except ImportError: + backends["cupy64"] = None + + # R fixest via rpy2 - use feols with only FE (no covariates) to measure demean time + try: + import pandas as pd + import rpy2.robjects as ro + from rpy2.robjects import numpy2ri, pandas2ri + from rpy2.robjects.packages import importr + + numpy2ri.activate() + pandas2ri.activate() + importr("fixest") # Load fixest package + + def demean_fixest(x, flist, weights, tol=1e-8, maxiter=100_000): + # Create a minimal regression problem that exercises the demeaning + _n, k = x.shape + n_fe = flist.shape[1] if flist.ndim > 1 else 1 + + # Build a dataframe with y and FE columns + data = {"y": x[:, 0]} + fe_names = [] + for j in range(n_fe): + fe_col = f"fe{j + 1}" + fe_names.append(fe_col) + if flist.ndim > 1: + data[fe_col] = flist[:, j].astype(int) + else: + data[fe_col] = flist.astype(int) + + df = pd.DataFrame(data) + r_df = pandas2ri.py2rpy(df) + + # Build formula: y ~ 1 | fe1 + fe2 + ... 
+ fe_formula = " + ".join(fe_names) + formula = f"y ~ 1 | {fe_formula}" + + # Call feols (this includes demeaning time) + ro.r.assign("df", r_df) + ro.r(f"result <- fixest::feols({formula}, data=df, nthreads=1)") + + # Return the residuals as "demeaned" values + resid = np.array(ro.r("residuals(result)")) + result = np.column_stack([resid] + [x[:, j] for j in range(1, k)]) + return result, True + + backends["fixest"] = demean_fixest + except (ImportError, Exception): + backends["fixest"] = None + + return backends + + +def run_single_benchmark( + demean_func: Callable, + x: np.ndarray, + flist: np.ndarray, + weights: np.ndarray, + n_iters: int, +) -> list[float]: + """Run a single benchmark configuration multiple times.""" + times = [] + + for _ in range(n_iters): + # Copy arrays to avoid caching effects + x_copy = x.copy() + + start = time.perf_counter() + demean_func(x_copy, flist, weights) + elapsed = time.perf_counter() - start + + times.append(elapsed) + + return times + + +def run_benchmarks( + configs: list[BenchmarkConfig], + backends: dict[str, Callable | None], +) -> list[BenchmarkResult]: + """Run all benchmark configurations across all backends.""" + results = [] + + for config in configs: + print(f"\n{'=' * 60}") + print(f"Config: n={config.n_obs:,}, type={config.dgp_type}, fe={config.n_fe}") + print("=" * 60) + + # Generate data + x, indiv_id, year, firm_id, weights = generate_dgp( + config.n_obs, config.dgp_type + ) + + # Build flist based on n_fe + if config.n_fe == 2: + flist = np.column_stack([indiv_id, year]).astype(np.uint64) + else: # n_fe == 3 + flist = np.column_stack([indiv_id, year, firm_id]).astype(np.uint64) + + for backend_name, demean_func in backends.items(): + if demean_func is None: + result = BenchmarkResult( + config=config, + backend=backend_name, + times=[], + median_time=float("inf"), + available=False, + error="Not installed", + ) + results.append(result) + print(f" {backend_name:20s}: not available") + continue + + try: + 
times = run_single_benchmark( + demean_func, x, flist, weights, config.n_iters + ) + med_time = median(times) + result = BenchmarkResult( + config=config, + backend=backend_name, + times=times, + median_time=med_time, + available=True, + ) + results.append(result) + print( + f" {backend_name:20s}: {med_time * 1000:8.2f} ms (median of {len(times)})" + ) + except Exception as e: + result = BenchmarkResult( + config=config, + backend=backend_name, + times=[], + median_time=float("inf"), + available=False, + error=str(e), + ) + results.append(result) + print(f" {backend_name:20s}: ERROR - {e}") + + return results + + +def print_summary(results: list[BenchmarkResult]) -> None: + """Print a summary table of results.""" + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + # Group by config + configs = sorted( + set((r.config.n_obs, r.config.dgp_type, r.config.n_fe) for r in results) + ) + + backends = sorted(set(r.backend for r in results)) + + # Header + header = f"{'Config':30s}" + for backend in backends: + header += f" {backend:>12s}" + print(header) + print("-" * len(header)) + + # Find fixest baseline for relative comparison + fixest_times = {} + for r in results: + if r.backend == "fixest" and r.available: + key = (r.config.n_obs, r.config.dgp_type, r.config.n_fe) + fixest_times[key] = r.median_time + + # Rows + for n_obs, dgp_type, n_fe in configs: + config_str = f"n={n_obs:,} {dgp_type:9s} {n_fe}FE" + row = f"{config_str:30s}" + + key = (n_obs, dgp_type, n_fe) + baseline = fixest_times.get(key) + + for backend in backends: + matching = [ + r + for r in results + if r.config.n_obs == n_obs + and r.config.dgp_type == dgp_type + and r.config.n_fe == n_fe + and r.backend == backend + ] + if matching and matching[0].available: + time_ms = matching[0].median_time * 1000 + if baseline and backend != "fixest": + ratio = matching[0].median_time / baseline + row += f" {time_ms:7.1f}ms({ratio:.1f}x)" + else: + row += f" {time_ms:12.1f}ms" + else: + row += f" 
{'N/A':>12s}" + + print(row) + + +def save_results(results: list[BenchmarkResult], path: Path) -> None: + """Save results to JSON.""" + data = [] + for r in results: + data.append( + { + "n_obs": r.config.n_obs, + "dgp_type": r.config.dgp_type, + "n_fe": r.config.n_fe, + "n_iters": r.config.n_iters, + "backend": r.backend, + "times": r.times, + "median_time": r.median_time if r.median_time != float("inf") else None, + "available": r.available, + "error": r.error, + } + ) + + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + json.dump(data, f, indent=2) + print(f"\nResults saved to {path}") + + +def main(): + """Run demeaning benchmarks.""" + parser = argparse.ArgumentParser(description="Benchmark demeaning implementations") + parser.add_argument( + "--full", action="store_true", help="Run full benchmark (slower)" + ) + parser.add_argument("--save", action="store_true", help="Save results to JSON") + parser.add_argument( + "--output", + type=Path, + default=Path("benchmarks/results/benchmark.json"), + help="Output path for results", + ) + args = parser.parse_args() + + # Define configurations + if args.full: + configs = [ + # Small (fast) + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), + # Medium + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), + # Large + BenchmarkConfig(n_obs=500_000, dgp_type="simple", n_fe=2, n_iters=2), + BenchmarkConfig(n_obs=500_000, dgp_type="difficult", n_fe=2, n_iters=2), + BenchmarkConfig(n_obs=1_000_000, dgp_type="simple", n_fe=2, 
n_iters=1), + BenchmarkConfig(n_obs=1_000_000, dgp_type="difficult", n_fe=2, n_iters=1), + ] + else: + # Fast mode - minimal configs for quick iteration + configs = [ + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), + ] + + print("Demeaning Benchmark") + print("=" * 60) + print(f"Mode: {'full' if args.full else 'fast'}") + print(f"Configurations: {len(configs)}") + + # Get available backends + backends = get_demean_backends() + available = [name for name, func in backends.items() if func is not None] + unavailable = [name for name, func in backends.items() if func is None] + + print(f"Available backends: {', '.join(available)}") + if unavailable: + print(f"Unavailable backends: {', '.join(unavailable)}") + + # Run benchmarks + results = run_benchmarks(configs, backends) + + # Print summary + print_summary(results) + + # Save if requested + if args.save: + save_results(results, args.output) + + +if __name__ == "__main__": + main() diff --git a/docs/specs/demean_accelerated_optimization.md b/docs/specs/demean_accelerated_optimization.md new file mode 100644 index 000000000..89cb4b2c3 --- /dev/null +++ b/docs/specs/demean_accelerated_optimization.md @@ -0,0 +1,370 @@ +# Optimization Specification: demean_accelerated.rs + +## 1. 
Current Implementation Analysis + +### 1.1 Overview of demean_accelerated.rs + +The current implementation in `src/demean_accelerated.rs` (336 lines) provides: + +- **Irons-Tuck acceleration**: Applied every 3rd iteration +- **Struct abstractions**: `FactorDemeaner`, `MultiFactorDemeaner`, `AccelerationBuffers`, `IronTucksAcceleration` +- **Parallelization**: rayon for column-level parallelism +- **Memory**: Heap-allocated `Vec` buffers + +### 1.2 Comparison: demean.rs vs demean_accelerated.rs + +| Aspect | demean.rs | demean_accelerated.rs | +|--------|-----------|----------------------| +| Algorithm | Simple alternating projection | Irons-Tuck acceleration | +| Iteration | One projection per iter | 2 projections + acceleration step | +| Memory | Minimal buffers | 6 buffers × n_samples | +| Convergence | Element-wise SAD | Element-wise SAD | + +### 1.3 Reference: fixest C++ (demeaning.cpp) + +Key features in fixest not present in current Rust implementation: + +| Feature | fixest | demean_accelerated.rs | +|---------|--------|----------------------| +| Grand acceleration | ✓ (3-point history) | ✗ | +| 2-FE optimization | ✓ (no N-length temps) | ✗ | +| SSR convergence | ✓ (every 40 iters) | ✗ | +| Coefficient-based | ✓ (iterates on FE coeffs) | ✗ (observation-based) | + +--- + +## 2. Missing Parts (vs fixest) + +### 2.1 Grand Acceleration (Priority: HIGH) + +fixest implements a **two-tier acceleration scheme**: + +``` +Standard iterations: Apply Irons-Tuck every 3 iterations +Grand acceleration: Every `iter_grandAcc` iterations, apply Irons-Tuck + on a 3-point history (Y, GY, GGY) of coefficient vectors +``` + +The grand acceleration operates on a coarser timescale, accelerating convergence on slow-moving modes. This can significantly reduce iteration count for hard-to-converge problems. 
+ +**Implementation sketch:** +```rust +struct GrandAccelerationState { + y: Vec, // First history point + gy: Vec, // Second history point + ggy: Vec, // Third history point + counter: usize, // Cycles 0-2 + interval: usize, // Apply every N iterations (default ~15) +} +``` + +### 2.2 Specialized 2-FE Path (Priority: MEDIUM) + +When `n_factors == 2`, fixest uses a specialized routine that: +- Stores second FE coefficients in a `nb_coef_Q[1]`-length buffer instead of `n_obs` +- Avoids materializing full N-length residual vectors +- Alternates between updating both effects without intermediate storage + +Current implementation always allocates `n_samples`-length buffers regardless of factor count. + +### 2.3 SSR-Based Convergence (Priority: MEDIUM) + +fixest checks residual sum-of-squares every 40 iterations: + +```cpp +ssr = Σ(input[i] - mu_current[i])² +if (stopping_crit(ssr_old, ssr, diffMax)) break; +``` + +This complements the element-wise convergence check and can detect convergence earlier in some cases. + +### 2.4 Coefficient-Based Iteration (Priority: LOW) + +fixest iterates on FE **coefficients** rather than demeaned **observations**: +- Coefficient vector length: `Σ n_groups[j]` (often << n_samples) +- More cache-friendly for problems with many observations but few groups +- Requires restructuring the core algorithm + +--- + +## 3. Potential Speedup Opportunities + +### 3.1 SIMD Vectorization (Priority: HIGH) + +Current inner loops rely on compiler autovectorization: + +```rust +// Current: relies on autovectorization +for i in 0..n { + self.buffers.delta_gx[i] = self.buffers.ggx_curr[i] - gx_tmp; + // ... 
+} +``` + +**Opportunity**: Use explicit SIMD via `std::simd` (nightly) or `wide` crate: + +```rust +use wide::f64x4; + +// Process 4 elements at a time +for chunk in buffers.chunks_exact_mut(4) { + let a = f64x4::from_slice(a_slice); + let b = f64x4::from_slice(b_slice); + (a - b).store(chunk); +} +``` + +Potential gains: +- **2-4x** for memory-bound operations (likely scenario) +- Requires careful handling of non-aligned tails + +### 3.2 Memory Layout Optimization (Priority: HIGH) + +Current: Separate `Vec` for each buffer (AoS pattern) + +```rust +struct AccelerationBuffers { + x_curr: Vec, + gx_curr: Vec, + ggx_curr: Vec, + // ... 6 separate allocations +} +``` + +**Opportunity**: Interleaved SoA layout for better cache locality: + +```rust +struct InterleavedBuffers { + // All data in single allocation, interleaved for spatial locality + data: Vec, // [x0, gx0, ggx0, x1, gx1, ggx1, ...] +} +``` + +Or single contiguous allocation with computed offsets: + +```rust +struct AccelerationBuffers { + data: Vec, // Single allocation: 6 * n_samples + n_samples: usize, +} +impl AccelerationBuffers { + fn x_curr(&mut self) -> &mut [f64] { &mut self.data[0..self.n_samples] } + // ... +} +``` + +### 3.3 Reduce Per-Column Allocations (Priority: HIGH) + +Current implementation allocates `MultiFactorDemeaner` per column: + +```rust +// src/demean_accelerated.rs:274 +let process_column = |(k, mut col): (...)| { + let demeaner = MultiFactorDemeaner::new(...); // Allocation per column! + let mut acceleration = IronTucksAcceleration::new(...); + // ... +}; +``` + +**Opportunity**: Pre-allocate demeaners and reuse via thread-local storage: + +```rust +use rayon::prelude::*; +use std::cell::RefCell; + +thread_local! 
{ + static DEMEANER: RefCell> = RefCell::new(None); +} + +// Or use rayon's broadcast for pre-allocation +``` + +### 3.4 Convergence Check Optimization (Priority: MEDIUM) + +Current: Full pass over all elements every iteration: + +```rust +fn sad_converged(a: &[f64], b: &[f64], tol: f64) -> bool { + a.iter().zip(b).all(|(&x, &y)| (x - y).abs() < tol) +} +``` + +**Opportunity**: Early exit with SIMD max-reduction: + +```rust +fn sad_converged_simd(a: &[f64], b: &[f64], tol: f64) -> bool { + // SIMD: compute max |a-b| in chunks, early exit if any chunk exceeds tol + let tol_vec = f64x4::splat(tol); + for (a_chunk, b_chunk) in a.chunks_exact(4).zip(b.chunks_exact(4)) { + let diff = (f64x4::from_slice(a_chunk) - f64x4::from_slice(b_chunk)).abs(); + if diff.reduce_max() >= tol { + return false; + } + } + // Handle remainder... + true +} +``` + +### 3.5 Group Mean Computation (Priority: MEDIUM) + +Current scatter-gather pattern: + +```rust +// Scatter: accumulate weighted sums +input.iter().zip(&self.sample_weights).zip(&self.group_ids) + .for_each(|((&xi, &wi), &gid)| { + self.group_weighted_sums[gid] += wi * xi; // Random access + }); +``` + +**Opportunity**: +- Sort observations by group ID for sequential access (one-time cost) +- Use sparse matrix representation for very large groups +- Consider prefix sums for sorted data + +### 3.6 Use ndarray-linalg for BLAS (Priority: LOW) + +Add `ndarray-linalg` for optimized linear algebra: + +```toml +[dependencies] +ndarray-linalg = { version = "0.16", features = ["openblas-system"] } +``` + +Could accelerate matrix operations if algorithm is restructured. + +--- + +## 4. 
Benchmark Strategy + +### 4.1 Minimal Benchmark Fixture + +Add to `tests/test_demean.py`: + +```python +import pytest +import numpy as np +from pyfixest.core.demean import demean +from pyfixest.core.demean_accelerated import demean_accelerated + +@pytest.fixture +def benchmark_data_small(): + """Small dataset for quick iteration.""" + rng = np.random.default_rng(42) + n, k = 10_000, 5 + return { + 'x': rng.normal(0, 1, (n, k)), + 'flist': np.column_stack([ + rng.integers(0, 100, n), + rng.integers(0, 50, n), + ]).astype(np.uint64), + 'weights': np.ones(n), + } + +@pytest.fixture +def benchmark_data_complex(): + """Complex FE structure from fixest benchmarks.""" + # Use generate_complex_fixed_effects_data() from test_demean.py + X, flist, weights = generate_complex_fixed_effects_data() + return {'x': X, 'flist': flist, 'weights': weights} + +@pytest.mark.benchmark(group="demean") +def test_bench_demean_simple(benchmark, benchmark_data_small): + data = benchmark_data_small + result, success = benchmark( + demean, data['x'], data['flist'], data['weights'], tol=1e-8 + ) + assert success + +@pytest.mark.benchmark(group="demean") +def test_bench_demean_accelerated(benchmark, benchmark_data_small): + data = benchmark_data_small + result, success = benchmark( + demean_accelerated, data['x'], data['flist'], data['weights'], tol=1e-8 + ) + assert success +``` + +### 4.2 Run Benchmarks + +```bash +# Quick benchmark during iteration +pytest tests/test_demean.py -k "bench" --benchmark-only --benchmark-compare + +# Full benchmark with stats +pytest tests/test_demean.py -k "bench" --benchmark-only \ + --benchmark-columns=mean,stddev,rounds \ + --benchmark-save=baseline +``` + +### 4.3 Benchmark Scenarios + +| Scenario | n_samples | n_features | n_factors | n_groups_per_factor | +|----------|-----------|------------|-----------|---------------------| +| Small-simple | 10K | 5 | 2 | 100, 50 | +| Medium-2FE | 100K | 10 | 2 | 1000, 500 | +| Large-3FE | 1M | 5 | 3 | 5000, 2500, 100 | 
+| Complex | 100K | 3 | 3 | (per fixest) | + +--- + +## 5. Implementation Roadmap + +### Phase 1: Low-Hanging Fruit (Quick Wins) +1. [ ] Reduce per-column allocations (thread-local reuse) +2. [ ] Single contiguous buffer allocation +3. [ ] Add SIMD convergence check + +### Phase 2: Algorithm Improvements +4. [ ] Implement grand acceleration +5. [ ] Add SSR-based convergence check +6. [ ] Specialized 2-FE path + +### Phase 3: Advanced Optimization +7. [ ] Explicit SIMD for inner loops (wide crate) +8. [ ] Sort-by-group optimization +9. [ ] Coefficient-based iteration (major refactor) + +--- + +## 6. Testing Requirements (Minimal) + +Keep tests minimal for fast iteration: + +```python +# Correctness: compare against pyhdfe (already in test_demean.py) +def test_accelerated_correctness(): + """Verify accelerated matches reference implementation.""" + X, flist, weights = generate_data() + res_simple, _ = demean(X, flist, weights, tol=1e-10) + res_accel, _ = demean_accelerated(X, flist, weights, tol=1e-10) + assert np.allclose(res_simple, res_accel, rtol=1e-6, atol=1e-8) + +# Benchmark: already covered above +``` + +--- + +## 7. Expected Performance Gains + +| Optimization | Expected Gain | Effort | +|--------------|---------------|--------| +| Reduce allocations | 10-20% | Low | +| SIMD convergence | 5-10% | Low | +| Grand acceleration | 20-50% (hard problems) | Medium | +| 2-FE specialization | 10-30% (2-FE cases) | Medium | +| Full SIMD loops | 2-4x (compute-bound) | High | +| Coefficient-based | Variable | Very High | + +**Realistic target**: 2-3x speedup over current `demean_accelerated.rs` for typical workloads, approaching fixest C++ performance. + +--- + +## 8. 
Files to Modify + +- `src/demean_accelerated.rs` - Main implementation +- `src/lib.rs` - Expose new functions if needed +- `pyfixest/core/demean_accelerated.py` - Python wrapper +- `tests/test_demean.py` - Add benchmarks +- `Cargo.toml` - Add `wide` crate for SIMD (optional) diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index 415793a47..ac714e33a 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -20,3 +20,10 @@ def _count_fixef_fully_nested_all_rs( cluster_data: NDArray[np.uint64], fe_data: NDArray[np.uint64], ) -> tuple[np.ndarray, int]: ... +def _demean_accelerated_rs( + x: NDArray[np.float64], + flist: NDArray[np.uint64], + weights: NDArray[np.float64], + tol: float = 1e-08, + maxiter: int = 100_000, +) -> tuple[np.ndarray, bool]: ... diff --git a/pyfixest/core/demean_accelerated.py b/pyfixest/core/demean_accelerated.py new file mode 100644 index 000000000..1121463e3 --- /dev/null +++ b/pyfixest/core/demean_accelerated.py @@ -0,0 +1,73 @@ +import numpy as np +from numpy.typing import NDArray + +from ._core_impl import _demean_accelerated_rs + + +def demean_accelerated( + x: NDArray[np.float64], + flist: NDArray[np.uint64], + weights: NDArray[np.float64], + tol: float = 1e-08, + maxiter: int = 100_000, +) -> tuple[NDArray, bool]: + """ + Demean an array. + + Workhorse for demeaning an input array `x` based on the specified fixed + effects and weights via the alternating projections algorithm. + + Parameters + ---------- + x : numpy.ndarray + Input array of shape (n_samples, n_features). Needs to be of type float. + flist : numpy.ndarray + Array of shape (n_samples, n_factors) specifying the fixed effects. + Needs to already be converted to integers. + weights : numpy.ndarray + Array of shape (n_samples,) specifying the weights. + tol : float, optional + Tolerance criterion for convergence. Defaults to 1e-08. + maxiter : int, optional + Maximum number of iterations. Defaults to 100_000. 
+ + Returns + ------- + tuple[numpy.ndarray, bool] + A tuple containing the demeaned array of shape (n_samples, n_features) + and a boolean indicating whether the algorithm converged successfully. + + Examples + -------- + ```{python} + import numpy as np + import pyfixest as pf + from pyfixest.utils.dgps import get_blw + from pyfixest.estimation.demean_ import demean + from formulaic import model_matrix + + fml = "y ~ treat | state + year" + + data = get_blw() + data.head() + + Y, rhs = model_matrix(fml, data) + X = rhs[0].drop(columns="Intercept") + fe = rhs[1].drop(columns="Intercept") + YX = np.concatenate([Y, X], axis=1) + + # to numpy + Y = Y.to_numpy() + X = X.to_numpy() + YX = np.concatenate([Y, X], axis=1) + fe = fe.to_numpy().astype(int) # demean requires fixed effects as ints! + + YX_demeaned, success = demean(YX, fe, weights = np.ones(YX.shape[0])) + Y_demeaned = YX_demeaned[:, 0] + X_demeaned = YX_demeaned[:, 1:] + + print(np.linalg.lstsq(X_demeaned, Y_demeaned, rcond=None)[0]) + print(pf.feols(fml, data).coef()) + ``` + """ + return _demean_accelerated_rs(x, flist.astype(np.uint64), weights, tol, maxiter) diff --git a/src/demean.rs b/src/demean.rs index 418bc68d1..8d04414db 100644 --- a/src/demean.rs +++ b/src/demean.rs @@ -2,6 +2,7 @@ use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; use rayon::prelude::*; +use std::env; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -65,6 +66,29 @@ fn demean_impl( weights: &ArrayView1, tol: f64, maxiter: usize, +) -> (Array2, bool) { + // Allow benchmarks to force the simple implementation for apples-to-apples comparisons. + if env::var("PYFIXEST_DEMEAN_SIMPLE").is_ok() { + return demean_simple_impl(x, flist, weights, tol, maxiter); + } + + // Use the accelerated Rust implementation by default. If it fails to converge, + // fall back to the reference implementation to guarantee correctness. 
+ let (accel, success) = + crate::demean_accelerated::demean_accelerated(x, flist, weights, tol, maxiter); + if success { + return (accel, true); + } + + demean_simple_impl(x, flist, weights, tol, maxiter) +} + +fn demean_simple_impl( + x: &ArrayView2, + flist: &ArrayView2, + weights: &ArrayView1, + tol: f64, + maxiter: usize, ) -> (Array2, bool) { let (n_samples, n_features) = x.dim(); let n_factors = flist.ncols(); @@ -211,8 +235,7 @@ pub fn _demean_rs( let flist_arr = flist.as_array(); let weights_arr = weights.as_array(); - let (out, success) = - py.allow_threads(|| demean_impl(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + let (out, success) = py.detach(|| demean_impl(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); let pyarray = PyArray2::from_owned_array(py, out); Ok((pyarray.into(), success)) diff --git a/src/demean_accelerated/coef_space.rs b/src/demean_accelerated/coef_space.rs new file mode 100644 index 000000000..e510eb581 --- /dev/null +++ b/src/demean_accelerated/coef_space.rs @@ -0,0 +1,785 @@ +//! Coefficient-space demeaning matching fixest's algorithm exactly. +//! +//! This is a direct port of fixest's demeaning.cpp, using coefficient-space +//! iteration rather than residual-space iteration. + +/// Pre-computed FE information for coefficient-space iteration. 
+pub struct FEInfo { + pub n_obs: usize, + pub n_fe: usize, + /// Group IDs for each FE: fe_ids[q][i] = group ID for observation i in FE q + pub fe_ids: Vec>, + /// Number of groups per FE + pub n_groups: Vec, + /// Starting index of each FE's coefficients + pub coef_start: Vec, + /// Total number of coefficients + pub n_coef_total: usize, + /// Sum of weights per group: sum_weights[q][g] + pub sum_weights: Vec>, + /// Sample weights + pub weights: Vec, + /// Whether all weights are 1.0 (optimization) + pub is_unweighted: bool, +} + +impl FEInfo { + pub fn new( + n_obs: usize, + n_fe: usize, + group_ids: &[usize], // flat [n_obs * n_fe], row-major + n_groups: &[usize], + weights: &[f64], + ) -> Self { + // Check if unweighted + let is_unweighted = weights.iter().all(|&w| (w - 1.0).abs() < 1e-10); + + // Extract per-FE group IDs + let mut fe_ids = vec![vec![0usize; n_obs]; n_fe]; + for i in 0..n_obs { + for q in 0..n_fe { + fe_ids[q][i] = group_ids[i * n_fe + q]; + } + } + + // Coefficient starting indices + let mut coef_start = vec![0usize; n_fe]; + for q in 1..n_fe { + coef_start[q] = coef_start[q - 1] + n_groups[q - 1]; + } + let n_coef_total: usize = n_groups.iter().sum(); + + // Sum of weights per group + let mut sum_weights = Vec::with_capacity(n_fe); + for q in 0..n_fe { + let mut sw = vec![0.0; n_groups[q]]; + for i in 0..n_obs { + sw[fe_ids[q][i]] += weights[i]; + } + // Avoid division by zero + for s in &mut sw { + if *s == 0.0 { + *s = 1.0; + } + } + sum_weights.push(sw); + } + + Self { + n_obs, + n_fe, + fe_ids, + n_groups: n_groups.to_vec(), + coef_start, + n_coef_total, + sum_weights, + weights: weights.to_vec(), + is_unweighted, + } + } + + /// Compute sum of weighted (input - output) for each coefficient. + /// This is computed ONCE at the start and never changes. 
+ pub fn compute_in_out(&self, input: &[f64], output: &[f64]) -> Vec { + let mut in_out = vec![0.0; self.n_coef_total]; + + if self.is_unweighted { + for q in 0..self.n_fe { + let start = self.coef_start[q]; + let fe_q = &self.fe_ids[q]; + for i in 0..self.n_obs { + in_out[start + fe_q[i]] += input[i] - output[i]; + } + } + } else { + for q in 0..self.n_fe { + let start = self.coef_start[q]; + let fe_q = &self.fe_ids[q]; + for i in 0..self.n_obs { + in_out[start + fe_q[i]] += (input[i] - output[i]) * self.weights[i]; + } + } + } + + in_out + } + + /// Compute output from coefficients: output[i] = input[i] - sum_q(coef[fe_q[i]]) + pub fn compute_output(&self, coef: &[f64], input: &[f64], output: &mut [f64]) { + output.copy_from_slice(input); + for q in 0..self.n_fe { + let start = self.coef_start[q]; + let fe_q = &self.fe_ids[q]; + for i in 0..self.n_obs { + output[i] -= coef[start + fe_q[i]]; + } + } + } +} + +/// Fixest's continue_crit: returns true if should CONTINUE (not converged). +#[inline] +fn continue_crit(a: f64, b: f64, diff_max: f64) -> bool { + let diff = (a - b).abs(); + (diff > diff_max) && (diff / (0.1 + a.abs()) > diff_max) +} + +/// Check if should continue on coefficient slice. +fn should_continue(x: &[f64], gx: &[f64], tol: f64) -> bool { + for i in 0..x.len() { + if continue_crit(x[i], gx[i], tol) { + return true; + } + } + false +} + +/// Fixest's stopping_crit for SSR. 
+#[inline] +fn stopping_crit(a: f64, b: f64, diff_max: f64) -> bool { + let diff = (a - b).abs(); + (diff < diff_max) || (diff / (0.1 + a.abs()) < diff_max) +} + +/// Irons-Tuck acceleration: X = GGX - coef * (GGX - GX) +#[inline(always)] +fn irons_tuck_update(x: &mut [f64], gx: &[f64], ggx: &[f64]) -> bool { + let n = x.len(); + let mut vprod = 0.0; + let mut ssq = 0.0; + + // SAFETY: x, gx, ggx all have the same length n + for i in 0..n { + unsafe { + let gx_i = *gx.get_unchecked(i); + let ggx_i = *ggx.get_unchecked(i); + let x_i = *x.get_unchecked(i); + let delta_gx = ggx_i - gx_i; + let delta2_x = delta_gx - gx_i + x_i; + vprod += delta_gx * delta2_x; + ssq += delta2_x * delta2_x; + } + } + + if ssq == 0.0 { + return true; + } + + let coef = vprod / ssq; + for i in 0..n { + unsafe { + let gx_i = *gx.get_unchecked(i); + let ggx_i = *ggx.get_unchecked(i); + *x.get_unchecked_mut(i) = ggx_i - coef * (ggx_i - gx_i); + } + } + + false +} + +/// Configuration matching fixest defaults. +#[derive(Clone, Copy)] +pub struct FixestConfig { + pub tol: f64, + pub maxiter: usize, + pub iter_warmup: usize, + pub iter_proj_after_acc: usize, + pub iter_grand_acc: usize, +} + +impl Default for FixestConfig { + fn default() -> Self { + Self { + tol: 1e-8, + maxiter: 100_000, + iter_warmup: 15, + iter_proj_after_acc: 40, + iter_grand_acc: 4, + } + } +} + +// ============================================================================= +// 2-FE Coefficient-Space Implementation (matching compute_fe_coef_2) +// ============================================================================= + +/// 2-FE projection: Given alpha coefficients, compute new alpha via beta. +/// This matches fixest's compute_fe_coef_2 which avoids N-length intermediates. 
+#[inline(always)] +fn project_2fe( + fe_info: &FEInfo, + in_out: &[f64], + alpha_in: &[f64], + alpha_out: &mut [f64], + beta: &mut [f64], +) { + let n0 = fe_info.n_groups[0]; + let n1 = fe_info.n_groups[1]; + let n_obs = fe_info.n_obs; + let fe0 = &fe_info.fe_ids[0]; + let fe1 = &fe_info.fe_ids[1]; + let sw0 = &fe_info.sum_weights[0]; + let sw1 = &fe_info.sum_weights[1]; + let weights = &fe_info.weights; + + // Step 1: Compute beta from alpha_in + // beta[g] = (in_out[n0+g] - sum_{i:fe1[i]=g} alpha[fe0[i]] * w[i]) / sw1[g] + beta[..n1].copy_from_slice(&in_out[n0..n0 + n1]); + + // SAFETY: fe0[i] < n0 (alpha_in.len()), fe1[i] < n1 (beta.len()) by construction + if fe_info.is_unweighted { + for i in 0..n_obs { + unsafe { + let g1 = *fe1.get_unchecked(i); + let g0 = *fe0.get_unchecked(i); + *beta.get_unchecked_mut(g1) -= *alpha_in.get_unchecked(g0); + } + } + } else { + for i in 0..n_obs { + unsafe { + let g1 = *fe1.get_unchecked(i); + let g0 = *fe0.get_unchecked(i); + *beta.get_unchecked_mut(g1) -= *alpha_in.get_unchecked(g0) * *weights.get_unchecked(i); + } + } + } + + for g in 0..n1 { + unsafe { *beta.get_unchecked_mut(g) /= *sw1.get_unchecked(g) }; + } + + // Step 2: Compute alpha_out from beta + // alpha[g] = (in_out[g] - sum_{i:fe0[i]=g} beta[fe1[i]] * w[i]) / sw0[g] + alpha_out[..n0].copy_from_slice(&in_out[..n0]); + + // SAFETY: fe0[i] < n0 (alpha_out.len()), fe1[i] < n1 (beta.len()) by construction + if fe_info.is_unweighted { + for i in 0..n_obs { + unsafe { + let g0 = *fe0.get_unchecked(i); + let g1 = *fe1.get_unchecked(i); + *alpha_out.get_unchecked_mut(g0) -= *beta.get_unchecked(g1); + } + } + } else { + for i in 0..n_obs { + unsafe { + let g0 = *fe0.get_unchecked(i); + let g1 = *fe1.get_unchecked(i); + *alpha_out.get_unchecked_mut(g0) -= *beta.get_unchecked(g1) * *weights.get_unchecked(i); + } + } + } + + for g in 0..n0 { + unsafe { *alpha_out.get_unchecked_mut(g) /= *sw0.get_unchecked(g) }; + } +} + +/// Run 2-FE acceleration loop (demean_acc_gnl with 
two_fe=true). +fn run_2fe_acceleration( + fe_info: &FEInfo, + in_out: &[f64], + alpha: &mut [f64], // Current coefficients, modified in place + beta: &mut [f64], // Temporary buffer + config: &FixestConfig, + max_iter: usize, +) -> (usize, bool) { + let n0 = fe_info.n_groups[0]; + + // Working buffers + let mut gx = vec![0.0; n0]; + let mut ggx = vec![0.0; n0]; + let mut temp = vec![0.0; n0]; + let mut beta_tmp = vec![0.0; fe_info.n_groups[1]]; + + // Grand acceleration buffers + let mut y = vec![0.0; n0]; + let mut gy = vec![0.0; n0]; + let mut ggy = vec![0.0; n0]; + let mut grand_counter = 0usize; + + // First iteration: G(alpha) + project_2fe(fe_info, in_out, alpha, &mut gx, beta); + + let mut keep_going = should_continue(alpha, &gx, config.tol); + let mut iter = 0; + + while keep_going && iter < max_iter { + iter += 1; + + // G(G(alpha)) + project_2fe(fe_info, in_out, &gx, &mut ggx, &mut beta_tmp); + + // Irons-Tuck + if irons_tuck_update(alpha, &gx, &ggx) { + break; + } + + // Project after acceleration + if iter >= config.iter_proj_after_acc { + temp.copy_from_slice(alpha); + project_2fe(fe_info, in_out, &temp, alpha, &mut beta_tmp); + } + + // G(alpha) + project_2fe(fe_info, in_out, alpha, &mut gx, beta); + + // Convergence check + keep_going = should_continue(alpha, &gx, config.tol); + + // Grand acceleration + if iter % config.iter_grand_acc == 0 { + grand_counter += 1; + match grand_counter { + 1 => y.copy_from_slice(&gx), + 2 => gy.copy_from_slice(&gx), + _ => { + ggy.copy_from_slice(&gx); + if irons_tuck_update(&mut y, &gy, &ggy) { + break; + } + project_2fe(fe_info, in_out, &y, &mut gx, beta); + grand_counter = 0; + } + } + } + } + + (iter, !keep_going) +} + +// ============================================================================= +// General Q-FE Coefficient-Space Implementation (matching compute_fe_gnl) +// ============================================================================= + +/// Q-FE projection: Compute G(coef_in) -> coef_out. 
+/// Updates FEs in reverse order (Q-1 down to 0) matching fixest. +#[inline(always)] +fn project_qfe( + fe_info: &FEInfo, + in_out: &[f64], + coef_in: &[f64], + coef_out: &mut [f64], + sum_other_means: &mut [f64], // N-length buffer +) { + let n_fe = fe_info.n_fe; + let n_obs = fe_info.n_obs; + let weights = &fe_info.weights; + + // Process in reverse order + for q in (0..n_fe).rev() { + // Step 1: Compute sum of other FE contributions (NO weights here - this is just + // expanding coefficients to observation space) + sum_other_means.fill(0.0); + + // Add contributions from FEs with h < q (use coef_in) + for h in 0..q { + let start_h = fe_info.coef_start[h]; + let fe_h = &fe_info.fe_ids[h]; + // SAFETY: fe_h[i] < n_groups[h], start_h + fe_h[i] < coef_in.len() + for i in 0..n_obs { + unsafe { + let g = *fe_h.get_unchecked(i); + *sum_other_means.get_unchecked_mut(i) += *coef_in.get_unchecked(start_h + g); + } + } + } + + // Add contributions from FEs with h > q (use coef_out, already computed) + for h in (q + 1)..n_fe { + let start_h = fe_info.coef_start[h]; + let fe_h = &fe_info.fe_ids[h]; + // SAFETY: fe_h[i] < n_groups[h], start_h + fe_h[i] < coef_out.len() + for i in 0..n_obs { + unsafe { + let g = *fe_h.get_unchecked(i); + *sum_other_means.get_unchecked_mut(i) += *coef_out.get_unchecked(start_h + g); + } + } + } + + // Step 2: Compute new coefficients for FE q + let start_q = fe_info.coef_start[q]; + let n_groups_q = fe_info.n_groups[q]; + let fe_q = &fe_info.fe_ids[q]; + let sw_q = &fe_info.sum_weights[q]; + + // Initialize to in_out (pre-aggregated weighted (input-output)) + coef_out[start_q..start_q + n_groups_q] + .copy_from_slice(&in_out[start_q..start_q + n_groups_q]); + + // Subtract weighted other FE contributions (weights applied when aggregating back) + // SAFETY: fe_q[i] < n_groups_q, start_q + fe_q[i] < coef_out.len() + if fe_info.is_unweighted { + for i in 0..n_obs { + unsafe { + let g = *fe_q.get_unchecked(i); + *coef_out.get_unchecked_mut(start_q 
+ g) -= *sum_other_means.get_unchecked(i); + } + } + } else { + for i in 0..n_obs { + unsafe { + let g = *fe_q.get_unchecked(i); + *coef_out.get_unchecked_mut(start_q + g) -= + *sum_other_means.get_unchecked(i) * *weights.get_unchecked(i); + } + } + } + + // Divide by sum of weights + for g in 0..n_groups_q { + unsafe { + *coef_out.get_unchecked_mut(start_q + g) /= *sw_q.get_unchecked(g); + } + } + } +} + +/// Run Q-FE acceleration loop (demean_acc_gnl). +#[allow(dead_code)] +fn run_qfe_acceleration( + fe_info: &FEInfo, + in_out: &[f64], + coef: &mut [f64], // Current coefficients, modified in place + config: &FixestConfig, + max_iter: usize, + input: &[f64], // Original input for SSR +) -> (usize, bool) { + let n_coef = fe_info.n_coef_total; + let n_obs = fe_info.n_obs; + + // nb_coef_no_Q: all except last FE (what fixest uses for acceleration) + let nb_coef_no_q = n_coef - fe_info.n_groups[fe_info.n_fe - 1]; + + // Working buffers + let mut gx = vec![0.0; n_coef]; + let mut ggx = vec![0.0; n_coef]; + let mut temp = vec![0.0; n_coef]; + let mut sum_other_means = vec![0.0; n_obs]; + + // Grand acceleration buffers (only nb_coef_no_q needed) + let mut y = vec![0.0; n_coef]; + let mut gy = vec![0.0; n_coef]; + let mut ggy = vec![0.0; n_coef]; + let mut grand_counter = 0usize; + + // SSR buffer + let mut output_buf = vec![0.0; n_obs]; + let mut ssr = 0.0; + + // First iteration: G(coef) + project_qfe(fe_info, in_out, coef, &mut gx, &mut sum_other_means); + + let mut keep_going = should_continue(&coef[..nb_coef_no_q], &gx[..nb_coef_no_q], config.tol); + let mut iter = 0; + + while keep_going && iter < max_iter { + iter += 1; + + // G(G(coef)) + project_qfe(fe_info, in_out, &gx, &mut ggx, &mut sum_other_means); + + // Irons-Tuck on nb_coef_no_q + if irons_tuck_update(&mut coef[..nb_coef_no_q], &gx[..nb_coef_no_q], &ggx[..nb_coef_no_q]) { + break; + } + + // Project after acceleration + if iter >= config.iter_proj_after_acc { + temp.copy_from_slice(coef); + 
project_qfe(fe_info, in_out, &temp, coef, &mut sum_other_means); + } + + // G(coef) + project_qfe(fe_info, in_out, coef, &mut gx, &mut sum_other_means); + + // Convergence check on nb_coef_no_q + keep_going = should_continue(&coef[..nb_coef_no_q], &gx[..nb_coef_no_q], config.tol); + + // Grand acceleration on nb_coef_no_q + if iter % config.iter_grand_acc == 0 { + grand_counter += 1; + match grand_counter { + 1 => y[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]), + 2 => gy[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]), + _ => { + ggy[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]); + if irons_tuck_update(&mut y[..nb_coef_no_q], &gy[..nb_coef_no_q], &ggy[..nb_coef_no_q]) { + break; + } + project_qfe(fe_info, in_out, &y, &mut gx, &mut sum_other_means); + grand_counter = 0; + } + } + } + + // SSR stopping every 40 iterations + if iter % 40 == 0 { + let ssr_old = ssr; + fe_info.compute_output(&gx, input, &mut output_buf); + ssr = output_buf.iter().map(|&r| r * r).sum(); + + if iter > 40 && stopping_crit(ssr_old, ssr, config.tol) { + break; + } + } + } + + // Copy final gx to coef + coef.copy_from_slice(&gx); + + (iter, !keep_going) +} + +// ============================================================================= +// Public API: demean_single matching fixest's demean_single_gnl +// ============================================================================= + +/// Demean a single variable using coefficient-space iteration. +/// Matches fixest's demean_single_gnl exactly. 
+pub fn demean_single( + fe_info: &FEInfo, + input: &[f64], + config: &FixestConfig, +) -> (Vec, usize, bool) { + let n_obs = fe_info.n_obs; + let n_fe = fe_info.n_fe; + + // Output initialized to 0 + let mut output = vec![0.0; n_obs]; + + // Compute initial in_out + let in_out = fe_info.compute_in_out(input, &output); + + if n_fe == 1 { + // Single FE: closed-form solution + let mut result = vec![0.0; n_obs]; + let fe0 = &fe_info.fe_ids[0]; + let sw0 = &fe_info.sum_weights[0]; + + // coef[g] = in_out[g] / sw[g] + let coef: Vec = in_out.iter().zip(sw0.iter()).map(|(&io, &sw)| io / sw).collect(); + + // output[i] = input[i] - coef[fe0[i]] + for i in 0..n_obs { + result[i] = input[i] - coef[fe0[i]]; + } + + return (result, 0, true); + } + + if n_fe == 2 { + // 2-FE: Use specialized 2-FE algorithm + let n0 = fe_info.n_groups[0]; + let n1 = fe_info.n_groups[1]; + + let mut alpha = vec![0.0; n0]; + let mut beta = vec![0.0; n1]; + + let (iter, converged) = run_2fe_acceleration( + fe_info, + &in_out, + &mut alpha, + &mut beta, + config, + config.maxiter, + ); + + // Compute output + let mut result = vec![0.0; n_obs]; + let fe0 = &fe_info.fe_ids[0]; + let fe1 = &fe_info.fe_ids[1]; + + for i in 0..n_obs { + result[i] = input[i] - alpha[fe0[i]] - beta[fe1[i]]; + } + + return (result, iter, converged); + } + + // 3+ FE: Use fixest's multi-phase strategy + // Key insight: fixest's output stores SUM OF FE COEFFICIENTS, not residual. + // in_out = agg(input - output) = agg(input - sum_of_coefs) = agg(residual) + // We'll use mu to store sum of FE coefs, then convert to residual at the end. + // + // 1. Warmup iterations on all FEs + // 2. 2-FE sub-convergence on first 2 FEs + // 3. 
Re-acceleration on all FEs + + let n_coef = fe_info.n_coef_total; + let n0 = fe_info.n_groups[0]; + let n1 = fe_info.n_groups[1]; + let mut total_iter = 0usize; + + // mu = sum of FE contributions per observation (fixest's "output") + // Starts at 0, accumulates FE coefficients across phases + let mut mu = vec![0.0; n_obs]; + + // Helper to compute in_out = agg(input - mu) per FE group + let compute_in_out_from_mu = |mu: &[f64]| -> Vec { + let mut in_out = vec![0.0; fe_info.n_coef_total]; + for q in 0..fe_info.n_fe { + let start = fe_info.coef_start[q]; + let fe_q = &fe_info.fe_ids[q]; + if fe_info.is_unweighted { + for i in 0..n_obs { + in_out[start + fe_q[i]] += input[i] - mu[i]; + } + } else { + for i in 0..n_obs { + in_out[start + fe_q[i]] += (input[i] - mu[i]) * fe_info.weights[i]; + } + } + } + in_out + }; + + // Helper to add coefficients to mu + let add_coef_to_mu = |coef: &[f64], mu: &mut [f64]| { + for q in 0..fe_info.n_fe { + let start = fe_info.coef_start[q]; + let fe_q = &fe_info.fe_ids[q]; + for i in 0..n_obs { + mu[i] += coef[start + fe_q[i]]; + } + } + }; + + // Phase 1: Warmup with all FEs + let mut coef = vec![0.0; n_coef]; + let in_out_phase1 = compute_in_out_from_mu(&mu); + + let (iter1, converged1) = run_qfe_acceleration( + fe_info, + &in_out_phase1, + &mut coef, + config, + config.iter_warmup, + input, + ); + total_iter += iter1; + + // Add Phase 1 coefficients to mu + add_coef_to_mu(&coef, &mut mu); + + if !converged1 { + // Phase 2: 2-FE sub-convergence on first 2 FEs + let in_out_phase2 = compute_in_out_from_mu(&mu); + + // Start with fresh alpha, beta + let mut alpha = vec![0.0; n0]; + let mut beta = vec![0.0; n1]; + + // Extract only the first 2 FE portions of in_out + let in_out_2fe: Vec = in_out_phase2[..n0 + n1].to_vec(); + + let iter_max_2fe = config.maxiter / 2; + let (iter2, _) = run_2fe_acceleration( + fe_info, + &in_out_2fe, + &mut alpha, + &mut beta, + config, + iter_max_2fe, + ); + total_iter += iter2; + + // Add Phase 2's 
alpha/beta to mu (only FE0 and FE1) + let fe0 = &fe_info.fe_ids[0]; + let fe1 = &fe_info.fe_ids[1]; + for i in 0..n_obs { + mu[i] += alpha[fe0[i]] + beta[fe1[i]]; + } + + // Phase 3: Re-acceleration on all FEs + let remaining = config.maxiter.saturating_sub(total_iter); + if remaining > 0 { + let in_out_phase3 = compute_in_out_from_mu(&mu); + + // Start with fresh coefficients + coef.fill(0.0); + + let (iter3, _) = run_qfe_acceleration( + fe_info, + &in_out_phase3, + &mut coef, + config, + remaining, + input, + ); + total_iter += iter3; + + // Add Phase 3 coefficients to mu + add_coef_to_mu(&coef, &mut mu); + } + } + + // Convert mu (sum of FE coefs) to output (residual = input - mu) + for i in 0..n_obs { + output[i] = input[i] - mu[i]; + } + + let converged = total_iter < config.maxiter; + (output, total_iter, converged) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_2fe_convergence() { + let n_obs = 100; + let n_fe = 2; + + // Create simple FE structure + let mut group_ids = Vec::with_capacity(n_obs * n_fe); + for i in 0..n_obs { + group_ids.push(i % 10); // FE1: 10 groups + group_ids.push(i % 5); // FE2: 5 groups + } + + let n_groups = vec![10, 5]; + let weights = vec![1.0; n_obs]; + + let fe_info = FEInfo::new(n_obs, n_fe, &group_ids, &n_groups, &weights); + + // Random input + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let (result, iter, converged) = demean_single(&fe_info, &input, &config); + + assert!(converged, "Should converge"); + assert!(iter < 100, "Should converge quickly"); + assert!(result.iter().all(|&v| v.is_finite())); + } + + #[test] + fn test_3fe_convergence() { + let n_obs = 100; + let n_fe = 3; + + let mut group_ids = Vec::with_capacity(n_obs * n_fe); + for i in 0..n_obs { + group_ids.push(i % 10); // FE1 + group_ids.push(i % 5); // FE2 + group_ids.push(i % 3); // FE3 + } + + let n_groups = vec![10, 5, 3]; + let weights = vec![1.0; n_obs]; + + let fe_info 
= FEInfo::new(n_obs, n_fe, &group_ids, &n_groups, &weights); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let (result, _iter, converged) = demean_single(&fe_info, &input, &config); + + assert!(converged); + assert!(result.iter().all(|&v| v.is_finite())); + } +} diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs new file mode 100644 index 000000000..c1e17b6a1 --- /dev/null +++ b/src/demean_accelerated/mod.rs @@ -0,0 +1,127 @@ +//! Accelerated alternating-projections demeaning with Irons-Tuck/Grand speedups. +//! +//! This module is a Rust port of fixest's original C++ demeaning implementation +//! (`https://github.com/lrberge/fixest/blob/master/src/demeaning.cpp`), +//! using coefficient-space iteration for efficiency. +//! +//! Dispatches based on number of fixed effects: +//! - 1 FE: O(n) closed-form solution (single pass, no iteration) +//! - 2 FE: Coefficient-space iteration with Irons-Tuck + Grand acceleration +//! 
- 3+ FE: Coefficient-space iteration with Irons-Tuck + Grand acceleration + +mod coef_space; + +use coef_space::{demean_single, FEInfo, FixestConfig}; +use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; +use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; +use pyo3::prelude::*; +use rayon::prelude::*; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +pub(crate) fn demean_accelerated( + x: &ArrayView2, + flist: &ArrayView2, + weights: &ArrayView1, + tol: f64, + maxiter: usize, +) -> (Array2, bool) { + let (n_samples, n_features) = x.dim(); + let n_factors = flist.ncols(); + + let sample_weights: Vec = weights.iter().cloned().collect(); + let group_ids: Vec = flist.iter().cloned().collect(); + + // Compute n_groups per factor + let n_groups_per_factor: Vec = (0..n_factors) + .map(|j| { + (0..n_samples) + .map(|i| group_ids[i * n_factors + j]) + .max() + .unwrap_or(0) + + 1 + }) + .collect(); + + let config = FixestConfig { + tol, + maxiter, + ..FixestConfig::default() + }; + + // Use the unified coefficient-space implementation for all FE counts + demean_coef_space( + x, + &sample_weights, + &group_ids, + n_samples, + n_features, + n_factors, + &n_groups_per_factor, + &config, + ) +} + +/// Demean using coefficient-space iteration (unified for all FE counts). 
+fn demean_coef_space( + x: &ArrayView2, + sample_weights: &[f64], + group_ids: &[usize], + n_samples: usize, + n_features: usize, + n_factors: usize, + n_groups_per_factor: &[usize], + config: &FixestConfig, +) -> (Array2, bool) { + let not_converged = Arc::new(AtomicUsize::new(0)); + let mut res = Array2::::zeros((n_samples, n_features)); + + res.axis_iter_mut(ndarray::Axis(1)) + .into_par_iter() + .enumerate() + .for_each(|(k, mut col)| { + let xk: Vec = (0..n_samples).map(|i| x[[i, k]]).collect(); + + let fe_info = FEInfo::new( + n_samples, + n_factors, + group_ids, + n_groups_per_factor, + sample_weights, + ); + + let (result, _iter, converged) = demean_single(&fe_info, &xk, config); + + if !converged { + not_converged.fetch_add(1, Ordering::SeqCst); + } + + Zip::from(&mut col).and(&result).for_each(|col_elm, &val| { + *col_elm = val; + }); + }); + + let success = not_converged.load(Ordering::SeqCst) == 0; + (res, success) +} + +#[pyfunction] +#[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] +pub fn _demean_accelerated_rs( + py: Python<'_>, + x: PyReadonlyArray2, + flist: PyReadonlyArray2, + weights: PyReadonlyArray1, + tol: f64, + maxiter: usize, +) -> PyResult<(Py>, bool)> { + let x_arr = x.as_array(); + let flist_arr = flist.as_array(); + let weights_arr = weights.as_array(); + + let (out, success) = + py.detach(|| demean_accelerated(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + + let pyarray = PyArray2::from_owned_array(py, out); + Ok((pyarray.into(), success)) +} diff --git a/src/lib.rs b/src/lib.rs index b428b07b5..0a5df7878 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod collinear; mod crv1; mod demean; mod nested_fixed_effects; +mod demean_accelerated; #[pymodule] fn _core_impl(m: &Bound<'_, PyModule>) -> PyResult<()> { @@ -13,5 +14,6 @@ fn _core_impl(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!( nested_fixed_effects::_count_fixef_fully_nested_all_rs ))?; + 
m.add_wrapped(wrap_pyfunction!(demean_accelerated::_demean_accelerated_rs))?; Ok(()) } From 006ad5fe59d3e043bc1d9ad6bdf8169f809dc5ad Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 26 Dec 2025 15:44:41 +0100 Subject: [PATCH 02/24] Optimize demean_accelerated performance Performance improvements to the accelerated demeaning implementation: - Optimize memory layout and share FEInfo across columns - Add SSR (sum of squared residuals) stopping criterion for 2-FE - Loop unrolling for 3-FE projection hot paths - Align tolerance default with fixest (1e-6 instead of 1e-8) --- .cargo/config.toml | 7 + Cargo.toml | 1 + benchmarks/bench_demean_r.R | 8 +- benchmarks/bench_native_comparison.py | 118 +++--- pyfixest/core/demean.py | 4 +- pyfixest/estimation/feols_.py | 4 +- src/demean_accelerated/coef_space.rs | 553 ++++++++++++++++++++++---- src/demean_accelerated/mod.rs | 17 +- 8 files changed, 547 insertions(+), 165 deletions(-) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..f5833703c --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,7 @@ +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "target-cpu=native", + "-C", "target-feature=+neon,+fp-armv8,+aes,+sha2", + "-C", "llvm-args=-enable-unsafe-fp-math", + "-C", "llvm-args=-fast-isel=false", +] diff --git a/Cargo.toml b/Cargo.toml index a952ace3b..81eeb3b5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,3 +21,4 @@ lto = "fat" # Full link-time optimization codegen-units = 1 # Whole-program optimization panic = "abort" # Smaller binary, no unwind support strip = true # Remove symbol table +debug = false # No debug info in release diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R index fb9a55620..fb894078d 100644 --- a/benchmarks/bench_demean_r.R +++ b/benchmarks/bench_demean_r.R @@ -9,8 +9,8 @@ n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L dgp_type <- if (length(args) >= 2) args[2] else 
"difficult" n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L -# Set single thread for fair comparison -setFixest_nthreads(1) +# Use all available threads for fair comparison (pyfixest also uses all threads) +setFixest_nthreads(0) # 0 = use all available # Generate data matching Python benchmark DGP set.seed(42) @@ -46,9 +46,9 @@ df <- data.frame( # Build formula based on n_fe if (n_fe == 2) { - fml <- y ~ 1 | indiv_id + year + fml <- y ~ x1 | indiv_id + year } else { - fml <- y ~ 1 | indiv_id + year + firm_id + fml <- y ~ x1 | indiv_id + year + firm_id } # Warm up diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py index 5782a1e65..1af0ee2ab 100644 --- a/benchmarks/bench_native_comparison.py +++ b/benchmarks/bench_native_comparison.py @@ -1,8 +1,9 @@ #!/usr/bin/env python3 """ -Benchmark comparing pyfixest demean vs native fixest (via R subprocess). +Benchmark comparing pyfixest feols vs native fixest feols. Runs fixest directly in R to avoid rpy2 overhead, then compares with pyfixest. +This is a fair apples-to-apples comparison of full feols() routines. 
""" from __future__ import annotations @@ -14,6 +15,7 @@ from statistics import median import numpy as np +import pandas as pd def generate_dgp( @@ -21,7 +23,7 @@ def generate_dgp( dgp_type: str = "simple", n_years: int = 10, n_indiv_per_firm: int = 23, -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +) -> pd.DataFrame: """Generate test data matching fixest benchmark DGP.""" np.random.seed(42) @@ -42,10 +44,15 @@ def generate_dgp( year_fe = np.random.randn(n_years)[year] y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) - x = np.column_stack([y, x1]) - weights = np.ones(n) - - return x, indiv_id, year, firm_id, weights + return pd.DataFrame( + { + "y": y, + "x1": x1, + "indiv_id": indiv_id, + "year": year, + "firm_id": firm_id, + } + ) def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> dict: @@ -80,46 +87,39 @@ def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> di return {"error": "R not found", "median": float("inf")} -def run_rust_benchmark( - x: np.ndarray, - flist: np.ndarray, - weights: np.ndarray, +def run_pyfixest_benchmark( + df: pd.DataFrame, + n_fe: int, n_runs: int = 5, - use_simple: bool = False, ) -> dict: - """Run pyfixest Rust demean benchmark.""" - import os + """Run pyfixest feols benchmark.""" + import pyfixest as pf - if use_simple: - os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" - elif "PYFIXEST_DEMEAN_SIMPLE" in os.environ: - del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + # Build formula matching R benchmark + if n_fe == 2: + fml = "y ~ x1 | indiv_id + year" + else: + fml = "y ~ x1 | indiv_id + year + firm_id" - try: - from pyfixest.core.demean import demean + # Warmup - use rust backend for accelerated demeaning + pf.feols(fml, data=df, demeaner_backend="rust") - times = [] - for _ in range(n_runs): - x_copy = x.copy() - start = time.perf_counter() - _result, converged = demean(x_copy, flist, weights) - elapsed = (time.perf_counter() - start) * 1000 # ms - 
times.append(elapsed) + times = [] + for _ in range(n_runs): + start = time.perf_counter() + fit = pf.feols(fml, data=df, demeaner_backend="rust") + elapsed = (time.perf_counter() - start) * 1000 # ms + times.append(elapsed) - return { - "median": median(times), - "times": times, - "converged": converged, - } - except Exception as e: - return {"error": str(e), "median": float("inf")} - finally: - if "PYFIXEST_DEMEAN_SIMPLE" in os.environ: - del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + return { + "median": median(times), + "times": times, + "coef": float(fit.coef().iloc[0]), + } def main(): - """Run benchmark comparing pyfixest demean vs native fixest.""" + """Run benchmark comparing pyfixest feols vs native fixest feols.""" configs = [ (10_000, "simple", 2), (10_000, "difficult", 2), @@ -134,7 +134,7 @@ def main(): results = [] print("=" * 70) - print("PyFixest vs Fixest Native Benchmark") + print("PyFixest feols() vs Fixest feols() Benchmark") print("=" * 70) for n_obs, dgp_type, n_fe in configs: @@ -142,37 +142,22 @@ def main(): print("-" * 50) # Generate data - x, indiv_id, year, firm_id, weights = generate_dgp(n_obs, dgp_type) + df = generate_dgp(n_obs, dgp_type) - if n_fe == 2: - flist = np.column_stack([indiv_id, year]).astype(np.uint64) - else: - flist = np.column_stack([indiv_id, year, firm_id]).astype(np.uint64) - - # Run R benchmark + # Run R benchmark (feols) r_result = run_r_benchmark(n_obs, dgp_type, n_fe) r_time = r_result.get("median", float("inf")) - print(f" fixest (R native): {r_time:8.2f} ms") - - # Run Rust accelerated benchmark - rust_result = run_rust_benchmark(x, flist, weights) - rust_time = rust_result.get("median", float("inf")) - - if r_time > 0 and rust_time < float("inf"): - ratio = rust_time / r_time - print(f" pyfixest (Rust): {rust_time:8.2f} ms ({ratio:.2f}x)") - else: - print(f" pyfixest (Rust): {rust_time:8.2f} ms") + print(f" fixest (R): {r_time:8.2f} ms") - # Run Rust simple benchmark - rust_simple = run_rust_benchmark(x, flist, 
weights, use_simple=True) - rust_simple_time = rust_simple.get("median", float("inf")) + # Run pyfixest benchmark (feols) + py_result = run_pyfixest_benchmark(df, n_fe) + py_time = py_result.get("median", float("inf")) - if r_time > 0 and rust_simple_time < float("inf"): - ratio = rust_simple_time / r_time - print(f" pyfixest (simple): {rust_simple_time:8.2f} ms ({ratio:.2f}x)") + if r_time > 0 and py_time < float("inf"): + ratio = py_time / r_time + print(f" pyfixest: {py_time:8.2f} ms ({ratio:.2f}x)") else: - print(f" pyfixest (simple): {rust_simple_time:8.2f} ms") + print(f" pyfixest: {py_time:8.2f} ms") results.append( { @@ -180,14 +165,13 @@ def main(): "dgp_type": dgp_type, "n_fe": n_fe, "fixest_r_ms": r_time, - "pyfixest_rust_ms": rust_time, - "pyfixest_simple_ms": rust_simple_time, + "pyfixest_ms": py_time, } ) # Summary print("\n" + "=" * 70) - print("SUMMARY (pyfixest accelerated vs fixest)") + print("SUMMARY (pyfixest feols vs fixest feols)") print("=" * 70) print(f"{'Config':<35} {'fixest':>10} {'pyfixest':>10} {'ratio':>8}") @@ -196,7 +180,7 @@ def main(): for r in results: config = f"n={r['n_obs']:,} {r['dgp_type']:9} {r['n_fe']}FE" fixest = r["fixest_r_ms"] - pyfixest = r["pyfixest_rust_ms"] + pyfixest = r["pyfixest_ms"] if fixest > 0 and fixest < float("inf") and pyfixest < float("inf"): ratio = pyfixest / fixest diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 8af8c8bbe..95cd97e88 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -8,7 +8,7 @@ def demean( x: NDArray[np.float64], flist: NDArray[np.uint64], weights: NDArray[np.float64], - tol: float = 1e-08, + tol: float = 1e-06, maxiter: int = 100_000, ) -> tuple[NDArray, bool]: """ @@ -27,7 +27,7 @@ def demean( weights : numpy.ndarray Array of shape (n_samples,) specifying the weights. tol : float, optional - Tolerance criterion for convergence. Defaults to 1e-08. + Tolerance criterion for convergence. Defaults to 1e-06 (matching fixest). 
maxiter : int, optional Maximum number of iterations. Defaults to 100_000. diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py index 1885c9422..e6bb4dd3d 100644 --- a/pyfixest/estimation/feols_.py +++ b/pyfixest/estimation/feols_.py @@ -1,5 +1,4 @@ import functools -import gc import re import warnings from collections.abc import Mapping @@ -1140,7 +1139,8 @@ def _clear_attributes(self): for attr in attributes: if hasattr(self, attr): delattr(self, attr) - gc.collect() + # Note: gc.collect() was removed here as it added ~50ms overhead per call + # and Python's automatic GC is sufficient for most use cases def wald_test(self, R=None, q=None, distribution="F"): """ diff --git a/src/demean_accelerated/coef_space.rs b/src/demean_accelerated/coef_space.rs index e510eb581..f6b90e956 100644 --- a/src/demean_accelerated/coef_space.rs +++ b/src/demean_accelerated/coef_space.rs @@ -4,19 +4,21 @@ //! iteration rather than residual-space iteration. /// Pre-computed FE information for coefficient-space iteration. +/// Uses flat memory layout for better cache performance. 
pub struct FEInfo { pub n_obs: usize, pub n_fe: usize, - /// Group IDs for each FE: fe_ids[q][i] = group ID for observation i in FE q - pub fe_ids: Vec<Vec<usize>>, + /// Group IDs flattened: fe_ids[q * n_obs + i] = group ID for observation i in FE q + /// This eliminates pointer indirection compared to Vec<Vec<usize>> + pub fe_ids: Vec<usize>, /// Number of groups per FE pub n_groups: Vec<usize>, - /// Starting index of each FE's coefficients + /// Starting index of each FE's coefficients in coef array pub coef_start: Vec<usize>, /// Total number of coefficients pub n_coef_total: usize, - /// Sum of weights per group: sum_weights[q][g] - pub sum_weights: Vec<Vec<f64>>, + /// Sum of weights per group, flattened: access via coef_start[q] + g + pub sum_weights: Vec<f64>, /// Sample weights pub weights: Vec<f64>, /// Whether all weights are 1.0 (optimization) @@ -34,35 +36,37 @@ impl FEInfo { // Check if unweighted let is_unweighted = weights.iter().all(|&w| (w - 1.0).abs() < 1e-10); - // Extract per-FE group IDs - let mut fe_ids = vec![vec![0usize; n_obs]; n_fe]; - for i in 0..n_obs { - for q in 0..n_fe { - fe_ids[q][i] = group_ids[i * n_fe + q]; - } - } - - // Coefficient starting indices + // Coefficient starting indices (computed first, used for sum_weights layout) let mut coef_start = vec![0usize; n_fe]; for q in 1..n_fe { coef_start[q] = coef_start[q - 1] + n_groups[q - 1]; } let n_coef_total: usize = n_groups.iter().sum(); - // Sum of weights per group - let mut sum_weights = Vec::with_capacity(n_fe); + // Flatten fe_ids: fe_ids[q * n_obs + i] = group_ids[i * n_fe + q] + // This converts from row-major input to column-major (per-FE) layout + let mut fe_ids = vec![0usize; n_fe * n_obs]; + for i in 0..n_obs { + for q in 0..n_fe { + fe_ids[q * n_obs + i] = group_ids[i * n_fe + q]; + } + } + + // Sum of weights per group, flattened with same layout as coef + let mut sum_weights = vec![0.0; n_coef_total]; for q in 0..n_fe { - let mut sw = vec![0.0; n_groups[q]]; + let start = coef_start[q]; + let fe_offset = q * n_obs; for i
in 0..n_obs { - sw[fe_ids[q][i]] += weights[i]; + let g = fe_ids[fe_offset + i]; + sum_weights[start + g] += weights[i]; } - // Avoid division by zero - for s in &mut sw { - if *s == 0.0 { - *s = 1.0; - } + } + // Avoid division by zero + for s in &mut sum_weights { + if *s == 0.0 { + *s = 1.0; } - sum_weights.push(sw); } Self { @@ -78,25 +82,47 @@ impl FEInfo { } } + /// Get slice of FE group IDs for FE q: &[group_id for obs 0..n_obs] + #[inline(always)] + pub fn fe_ids_slice(&self, q: usize) -> &[usize] { + let start = q * self.n_obs; + &self.fe_ids[start..start + self.n_obs] + } + + /// Get slice of sum_weights for FE q: &[sum_weight for group 0..n_groups[q]] + #[inline(always)] + pub fn sum_weights_slice(&self, q: usize) -> &[f64] { + let start = self.coef_start[q]; + let end = if q + 1 < self.n_fe { + self.coef_start[q + 1] + } else { + self.n_coef_total + }; + &self.sum_weights[start..end] + } + /// Compute sum of weighted (input - output) for each coefficient. /// This is computed ONCE at the start and never changes. 
pub fn compute_in_out(&self, input: &[f64], output: &[f64]) -> Vec<f64> { let mut in_out = vec![0.0; self.n_coef_total]; + let n_obs = self.n_obs; if self.is_unweighted { for q in 0..self.n_fe { let start = self.coef_start[q]; - let fe_q = &self.fe_ids[q]; - for i in 0..self.n_obs { - in_out[start + fe_q[i]] += input[i] - output[i]; + let fe_offset = q * n_obs; + for i in 0..n_obs { + let g = self.fe_ids[fe_offset + i]; + in_out[start + g] += input[i] - output[i]; } } } else { for q in 0..self.n_fe { let start = self.coef_start[q]; - let fe_q = &self.fe_ids[q]; - for i in 0..self.n_obs { - in_out[start + fe_q[i]] += (input[i] - output[i]) * self.weights[i]; + let fe_offset = q * n_obs; + for i in 0..n_obs { + let g = self.fe_ids[fe_offset + i]; + in_out[start + g] += (input[i] - output[i]) * self.weights[i]; } } } @@ -107,11 +133,13 @@ impl FEInfo { /// Compute output from coefficients: output[i] = input[i] - sum_q(coef[fe_q[i]]) pub fn compute_output(&self, coef: &[f64], input: &[f64], output: &mut [f64]) { output.copy_from_slice(input); + let n_obs = self.n_obs; for q in 0..self.n_fe { let start = self.coef_start[q]; - let fe_q = &self.fe_ids[q]; - for i in 0..self.n_obs { - output[i] -= coef[start + fe_q[i]]; + let fe_offset = q * n_obs; + for i in 0..n_obs { + let g = self.fe_ids[fe_offset + i]; + output[i] -= coef[start + g]; } } } @@ -190,7 +218,7 @@ pub struct FixestConfig { impl Default for FixestConfig { fn default() -> Self { Self { - tol: 1e-8, + tol: 1e-6, // Match fixest's default maxiter: 100_000, iter_warmup: 15, iter_proj_after_acc: 40, @@ -216,10 +244,10 @@ fn project_2fe( let n0 = fe_info.n_groups[0]; let n1 = fe_info.n_groups[1]; let n_obs = fe_info.n_obs; - let fe0 = &fe_info.fe_ids[0]; - let fe1 = &fe_info.fe_ids[1]; - let sw0 = &fe_info.sum_weights[0]; - let sw1 = &fe_info.sum_weights[1]; + let fe0 = fe_info.fe_ids_slice(0); + let fe1 = fe_info.fe_ids_slice(1); + let sw0 = fe_info.sum_weights_slice(0); + let sw1 = fe_info.sum_weights_slice(1); let
weights = &fe_info.weights; // Step 1: Compute beta from alpha_in @@ -277,6 +305,49 @@ fn project_2fe( } } +/// Compute beta from alpha (half of project_2fe, for SSR computation). +/// This matches fixest's compute_fe_coef_2_internal with step_2=false. +#[inline(always)] +fn compute_beta_from_alpha( + fe_info: &FEInfo, + in_out: &[f64], + alpha: &[f64], + beta: &mut [f64], +) { + let n1 = fe_info.n_groups[1]; + let n_obs = fe_info.n_obs; + let n0 = fe_info.n_groups[0]; + let fe0 = fe_info.fe_ids_slice(0); + let fe1 = fe_info.fe_ids_slice(1); + let sw1 = fe_info.sum_weights_slice(1); + let weights = &fe_info.weights; + + // beta[g] = (in_out[n0+g] - sum_{i:fe1[i]=g} alpha[fe0[i]] * w[i]) / sw1[g] + beta[..n1].copy_from_slice(&in_out[n0..n0 + n1]); + + if fe_info.is_unweighted { + for i in 0..n_obs { + unsafe { + let g1 = *fe1.get_unchecked(i); + let g0 = *fe0.get_unchecked(i); + *beta.get_unchecked_mut(g1) -= *alpha.get_unchecked(g0); + } + } + } else { + for i in 0..n_obs { + unsafe { + let g1 = *fe1.get_unchecked(i); + let g0 = *fe0.get_unchecked(i); + *beta.get_unchecked_mut(g1) -= *alpha.get_unchecked(g0) * *weights.get_unchecked(i); + } + } + } + + for g in 0..n1 { + unsafe { *beta.get_unchecked_mut(g) /= *sw1.get_unchecked(g) }; + } +} + /// Run 2-FE acceleration loop (demean_acc_gnl with two_fe=true). 
fn run_2fe_acceleration( fe_info: &FEInfo, @@ -285,14 +356,17 @@ fn run_2fe_acceleration( beta: &mut [f64], // Temporary buffer config: &FixestConfig, max_iter: usize, + input: &[f64], // Original input for SSR stopping criterion ) -> (usize, bool) { let n0 = fe_info.n_groups[0]; + let n1 = fe_info.n_groups[1]; + let n_obs = fe_info.n_obs; // Working buffers let mut gx = vec![0.0; n0]; let mut ggx = vec![0.0; n0]; let mut temp = vec![0.0; n0]; - let mut beta_tmp = vec![0.0; fe_info.n_groups[1]]; + let mut beta_tmp = vec![0.0; n1]; // Grand acceleration buffers let mut y = vec![0.0; n0]; @@ -300,12 +374,25 @@ fn run_2fe_acceleration( let mut ggy = vec![0.0; n0]; let mut grand_counter = 0usize; + // SSR tracking + let mut ssr = 0.0; + let fe0 = fe_info.fe_ids_slice(0); + let fe1 = fe_info.fe_ids_slice(1); + // First iteration: G(alpha) project_2fe(fe_info, in_out, alpha, &mut gx, beta); let mut keep_going = should_continue(alpha, &gx, config.tol); let mut iter = 0; + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + let alpha_norm: f64 = alpha.iter().map(|x| x * x).sum(); + let gx_norm: f64 = gx.iter().map(|x| x * x).sum(); + let diff_norm: f64 = alpha.iter().zip(gx.iter()).map(|(a, g)| (a - g).powi(2)).sum(); + eprintln!("[run_2fe_acc] Initial: alpha_norm={:.6e}, gx_norm={:.6e}, diff_norm={:.6e}, keep_going={}", + alpha_norm, gx_norm, diff_norm, keep_going); + } + while keep_going && iter < max_iter { iter += 1; @@ -345,6 +432,26 @@ fn run_2fe_acceleration( } } } + + // SSR stopping criterion every 40 iterations (matching fixest) + if iter % 40 == 0 { + let ssr_old = ssr; + + // Compute beta from gx (current alpha) for SSR computation + // Only need to compute beta, not full projection (matches fixest) + compute_beta_from_alpha(fe_info, in_out, &gx, &mut beta_tmp); + + // Compute SSR = sum((input - alpha[fe0] - beta[fe1])^2) + ssr = 0.0; + for i in 0..n_obs { + let resid = input[i] - gx[fe0[i]] - beta_tmp[fe1[i]]; + ssr += resid * resid; + } + + if iter > 40 && 
stopping_crit(ssr_old, ssr, config.tol) { + break; + } + } } (iter, !keep_going) @@ -356,6 +463,7 @@ fn run_2fe_acceleration( /// Q-FE projection: Compute G(coef_in) -> coef_out. /// Updates FEs in reverse order (Q-1 down to 0) matching fixest. +/// Specialized for 3 FEs (most common case) with loop unrolling. #[inline(always)] fn project_qfe( fe_info: &FEInfo, @@ -366,65 +474,301 @@ fn project_qfe( ) { let n_fe = fe_info.n_fe; let n_obs = fe_info.n_obs; - let weights = &fe_info.weights; - // Process in reverse order + // Pre-compute raw pointers for hot loops + let fe_ids_ptr = fe_info.fe_ids.as_ptr(); + let coef_start = &fe_info.coef_start; + let sum_other_ptr = sum_other_means.as_mut_ptr(); + let coef_in_ptr = coef_in.as_ptr(); + let coef_out_ptr = coef_out.as_mut_ptr(); + let weights_ptr = fe_info.weights.as_ptr(); + + // Specialized fast path for 3 FEs (common case) + if n_fe == 3 && fe_info.is_unweighted { + project_qfe_3fe_unweighted( + n_obs, + fe_ids_ptr, + coef_start, + sum_other_ptr, + coef_in_ptr, + coef_out_ptr, + in_out, + &fe_info.n_groups, + &fe_info.sum_weights, + ); + return; + } + + // General case for any number of FEs + project_qfe_general( + fe_info, + in_out, + coef_in, + coef_out, + sum_other_means, + n_fe, + n_obs, + fe_ids_ptr, + coef_start, + sum_other_ptr, + coef_in_ptr, + coef_out_ptr, + weights_ptr, + ); +} + +/// Specialized 3-FE projection for unweighted case. 
+#[inline(always)] +fn project_qfe_3fe_unweighted( + n_obs: usize, + fe_ids_ptr: *const usize, + coef_start: &[usize], + sum_other_ptr: *mut f64, + coef_in_ptr: *const f64, + coef_out_ptr: *mut f64, + in_out: &[f64], + n_groups: &[usize], + sum_weights: &[f64], +) { + let (start_0, start_1, start_2) = (coef_start[0], coef_start[1], coef_start[2]); + let fe_0_ptr = fe_ids_ptr; + let fe_1_ptr = unsafe { fe_ids_ptr.add(n_obs) }; + let fe_2_ptr = unsafe { fe_ids_ptr.add(2 * n_obs) }; + let in_out_ptr = in_out.as_ptr(); + + // === q=2: Process FE 2 (add from FE 0, 1 using coef_in) === + // No need to fill with zeros - we directly assign the sum of FE 0 and FE 1 contributions + // Unrolled loop: process 4 observations at a time + let n_chunks = n_obs / 4; + let remainder = n_obs % 4; + + unsafe { + for chunk in 0..n_chunks { + let base = chunk * 4; + let g0_0 = *fe_0_ptr.add(base); + let g0_1 = *fe_0_ptr.add(base + 1); + let g0_2 = *fe_0_ptr.add(base + 2); + let g0_3 = *fe_0_ptr.add(base + 3); + let g1_0 = *fe_1_ptr.add(base); + let g1_1 = *fe_1_ptr.add(base + 1); + let g1_2 = *fe_1_ptr.add(base + 2); + let g1_3 = *fe_1_ptr.add(base + 3); + + *sum_other_ptr.add(base) = + *coef_in_ptr.add(start_0 + g0_0) + *coef_in_ptr.add(start_1 + g1_0); + *sum_other_ptr.add(base + 1) = + *coef_in_ptr.add(start_0 + g0_1) + *coef_in_ptr.add(start_1 + g1_1); + *sum_other_ptr.add(base + 2) = + *coef_in_ptr.add(start_0 + g0_2) + *coef_in_ptr.add(start_1 + g1_2); + *sum_other_ptr.add(base + 3) = + *coef_in_ptr.add(start_0 + g0_3) + *coef_in_ptr.add(start_1 + g1_3); + } + + for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { + let g0 = *fe_0_ptr.add(i); + let g1 = *fe_1_ptr.add(i); + *sum_other_ptr.add(i) = *coef_in_ptr.add(start_0 + g0) + *coef_in_ptr.add(start_1 + g1); + } + } + + // Compute coef_out for FE 2 + let n_groups_2 = n_groups[2]; + unsafe { + std::ptr::copy_nonoverlapping( + in_out_ptr.add(start_2), + coef_out_ptr.add(start_2), + n_groups_2, + ); + } + + unsafe { + for i in 
0..n_obs { + let g = *fe_2_ptr.add(i); + *coef_out_ptr.add(start_2 + g) -= *sum_other_ptr.add(i); + } + for g in 0..n_groups_2 { + *coef_out_ptr.add(start_2 + g) /= *sum_weights.get_unchecked(start_2 + g); + } + } + + // === q=1: Process FE 1 (add from FE 0 using coef_in, FE 2 using coef_out) === + unsafe { + for chunk in 0..n_chunks { + let base = chunk * 4; + let g0_0 = *fe_0_ptr.add(base); + let g0_1 = *fe_0_ptr.add(base + 1); + let g0_2 = *fe_0_ptr.add(base + 2); + let g0_3 = *fe_0_ptr.add(base + 3); + let g2_0 = *fe_2_ptr.add(base); + let g2_1 = *fe_2_ptr.add(base + 1); + let g2_2 = *fe_2_ptr.add(base + 2); + let g2_3 = *fe_2_ptr.add(base + 3); + + *sum_other_ptr.add(base) = + *coef_in_ptr.add(start_0 + g0_0) + *coef_out_ptr.add(start_2 + g2_0); + *sum_other_ptr.add(base + 1) = + *coef_in_ptr.add(start_0 + g0_1) + *coef_out_ptr.add(start_2 + g2_1); + *sum_other_ptr.add(base + 2) = + *coef_in_ptr.add(start_0 + g0_2) + *coef_out_ptr.add(start_2 + g2_2); + *sum_other_ptr.add(base + 3) = + *coef_in_ptr.add(start_0 + g0_3) + *coef_out_ptr.add(start_2 + g2_3); + } + + for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { + let g0 = *fe_0_ptr.add(i); + let g2 = *fe_2_ptr.add(i); + *sum_other_ptr.add(i) = *coef_in_ptr.add(start_0 + g0) + *coef_out_ptr.add(start_2 + g2); + } + } + + // Compute coef_out for FE 1 + let n_groups_1 = n_groups[1]; + unsafe { + std::ptr::copy_nonoverlapping( + in_out_ptr.add(start_1), + coef_out_ptr.add(start_1), + n_groups_1, + ); + } + + unsafe { + for i in 0..n_obs { + let g = *fe_1_ptr.add(i); + *coef_out_ptr.add(start_1 + g) -= *sum_other_ptr.add(i); + } + for g in 0..n_groups_1 { + *coef_out_ptr.add(start_1 + g) /= *sum_weights.get_unchecked(start_1 + g); + } + } + + // === q=0: Process FE 0 (add from FE 1, 2 using coef_out) === + unsafe { + for chunk in 0..n_chunks { + let base = chunk * 4; + let g1_0 = *fe_1_ptr.add(base); + let g1_1 = *fe_1_ptr.add(base + 1); + let g1_2 = *fe_1_ptr.add(base + 2); + let g1_3 = *fe_1_ptr.add(base + 
3); + let g2_0 = *fe_2_ptr.add(base); + let g2_1 = *fe_2_ptr.add(base + 1); + let g2_2 = *fe_2_ptr.add(base + 2); + let g2_3 = *fe_2_ptr.add(base + 3); + + *sum_other_ptr.add(base) = + *coef_out_ptr.add(start_1 + g1_0) + *coef_out_ptr.add(start_2 + g2_0); + *sum_other_ptr.add(base + 1) = + *coef_out_ptr.add(start_1 + g1_1) + *coef_out_ptr.add(start_2 + g2_1); + *sum_other_ptr.add(base + 2) = + *coef_out_ptr.add(start_1 + g1_2) + *coef_out_ptr.add(start_2 + g2_2); + *sum_other_ptr.add(base + 3) = + *coef_out_ptr.add(start_1 + g1_3) + *coef_out_ptr.add(start_2 + g2_3); + } + + for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { + let g1 = *fe_1_ptr.add(i); + let g2 = *fe_2_ptr.add(i); + *sum_other_ptr.add(i) = + *coef_out_ptr.add(start_1 + g1) + *coef_out_ptr.add(start_2 + g2); + } + } + + // Compute coef_out for FE 0 + let n_groups_0 = n_groups[0]; + unsafe { + std::ptr::copy_nonoverlapping(in_out_ptr.add(start_0), coef_out_ptr.add(start_0), n_groups_0); + } + + unsafe { + for i in 0..n_obs { + let g = *fe_0_ptr.add(i); + *coef_out_ptr.add(start_0 + g) -= *sum_other_ptr.add(i); + } + for g in 0..n_groups_0 { + *coef_out_ptr.add(start_0 + g) /= *sum_weights.get_unchecked(start_0 + g); + } + } +} + +/// General Q-FE projection (any number of FEs, weighted or unweighted). 
+#[inline(always)] +#[allow(clippy::too_many_arguments)] +fn project_qfe_general( + fe_info: &FEInfo, + in_out: &[f64], + _coef_in: &[f64], // Used via coef_in_ptr + _coef_out: &mut [f64], // Used via coef_out_ptr + _sum_other_means: &mut [f64], // Used via sum_other_ptr + n_fe: usize, + n_obs: usize, + fe_ids_ptr: *const usize, + coef_start: &[usize], + sum_other_ptr: *mut f64, + coef_in_ptr: *const f64, + coef_out_ptr: *mut f64, + weights_ptr: *const f64, +) { + let in_out_ptr = in_out.as_ptr(); + + // Process in reverse order (Q-1 down to 0, matching fixest) for q in (0..n_fe).rev() { - // Step 1: Compute sum of other FE contributions (NO weights here - this is just - // expanding coefficients to observation space) - sum_other_means.fill(0.0); + // Step 1: Fill sum_other_means with zeros + unsafe { + std::ptr::write_bytes(sum_other_ptr, 0, n_obs); + } // Add contributions from FEs with h < q (use coef_in) for h in 0..q { - let start_h = fe_info.coef_start[h]; - let fe_h = &fe_info.fe_ids[h]; - // SAFETY: fe_h[i] < n_groups[h], start_h + fe_h[i] < coef_in.len() + let start_h = coef_start[h]; + let fe_h_ptr = unsafe { fe_ids_ptr.add(h * n_obs) }; for i in 0..n_obs { unsafe { - let g = *fe_h.get_unchecked(i); - *sum_other_means.get_unchecked_mut(i) += *coef_in.get_unchecked(start_h + g); + let g = *fe_h_ptr.add(i); + *sum_other_ptr.add(i) += *coef_in_ptr.add(start_h + g); } } } - // Add contributions from FEs with h > q (use coef_out, already computed) + // Add contributions from FEs with h > q (use coef_out) for h in (q + 1)..n_fe { - let start_h = fe_info.coef_start[h]; - let fe_h = &fe_info.fe_ids[h]; - // SAFETY: fe_h[i] < n_groups[h], start_h + fe_h[i] < coef_out.len() + let start_h = coef_start[h]; + let fe_h_ptr = unsafe { fe_ids_ptr.add(h * n_obs) }; for i in 0..n_obs { unsafe { - let g = *fe_h.get_unchecked(i); - *sum_other_means.get_unchecked_mut(i) += *coef_out.get_unchecked(start_h + g); + let g = *fe_h_ptr.add(i); + *sum_other_ptr.add(i) += 
*coef_out_ptr.add(start_h + g); } } } // Step 2: Compute new coefficients for FE q - let start_q = fe_info.coef_start[q]; + let start_q = coef_start[q]; let n_groups_q = fe_info.n_groups[q]; - let fe_q = &fe_info.fe_ids[q]; - let sw_q = &fe_info.sum_weights[q]; + let fe_q_ptr = unsafe { fe_ids_ptr.add(q * n_obs) }; + let sw_q = fe_info.sum_weights_slice(q); - // Initialize to in_out (pre-aggregated weighted (input-output)) - coef_out[start_q..start_q + n_groups_q] - .copy_from_slice(&in_out[start_q..start_q + n_groups_q]); + // Initialize to in_out + unsafe { + std::ptr::copy_nonoverlapping( + in_out_ptr.add(start_q), + coef_out_ptr.add(start_q), + n_groups_q, + ); + } - // Subtract weighted other FE contributions (weights applied when aggregating back) - // SAFETY: fe_q[i] < n_groups_q, start_q + fe_q[i] < coef_out.len() + // Subtract weighted other FE contributions if fe_info.is_unweighted { for i in 0..n_obs { unsafe { - let g = *fe_q.get_unchecked(i); - *coef_out.get_unchecked_mut(start_q + g) -= *sum_other_means.get_unchecked(i); + let g = *fe_q_ptr.add(i); + *coef_out_ptr.add(start_q + g) -= *sum_other_ptr.add(i); } } } else { for i in 0..n_obs { unsafe { - let g = *fe_q.get_unchecked(i); - *coef_out.get_unchecked_mut(start_q + g) -= - *sum_other_means.get_unchecked(i) * *weights.get_unchecked(i); + let g = *fe_q_ptr.add(i); + *coef_out_ptr.add(start_q + g) -= + *sum_other_ptr.add(i) * *weights_ptr.add(i); } } } @@ -432,7 +776,7 @@ fn project_qfe( // Divide by sum of weights for g in 0..n_groups_q { unsafe { - *coef_out.get_unchecked_mut(start_q + g) /= *sw_q.get_unchecked(g); + *coef_out_ptr.add(start_q + g) /= *sw_q.get_unchecked(g); } } } @@ -497,7 +841,11 @@ fn run_qfe_acceleration( project_qfe(fe_info, in_out, coef, &mut gx, &mut sum_other_means); // Convergence check on nb_coef_no_q + let prev_keep_going = keep_going; keep_going = should_continue(&coef[..nb_coef_no_q], &gx[..nb_coef_no_q], config.tol); + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() 
&& prev_keep_going && !keep_going { + eprintln!("[run_qfe_acc] Coefficient converged at iter {}", iter); + } // Grand acceleration on nb_coef_no_q if iter % config.iter_grand_acc == 0 { @@ -523,6 +871,11 @@ fn run_qfe_acceleration( ssr = output_buf.iter().map(|&r| r * r).sum(); if iter > 40 && stopping_crit(ssr_old, ssr, config.tol) { + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + eprintln!("[run_qfe_acc] SSR converged at iter {}: ssr_old={:.6e}, ssr={:.6e}", + iter, ssr_old, ssr); + } + keep_going = false; // Mark as converged break; } } @@ -557,8 +910,8 @@ pub fn demean_single( if n_fe == 1 { // Single FE: closed-form solution let mut result = vec![0.0; n_obs]; - let fe0 = &fe_info.fe_ids[0]; - let sw0 = &fe_info.sum_weights[0]; + let fe0 = fe_info.fe_ids_slice(0); + let sw0 = fe_info.sum_weights_slice(0); // coef[g] = in_out[g] / sw[g] let coef: Vec<f64> = in_out.iter().zip(sw0.iter()).map(|(&io, &sw)| io / sw).collect(); @@ -586,12 +939,13 @@ &mut beta, config, config.maxiter, + input, ); // Compute output let mut result = vec![0.0; n_obs]; - let fe0 = &fe_info.fe_ids[0]; - let fe1 = &fe_info.fe_ids[1]; + let fe0 = fe_info.fe_ids_slice(0); + let fe1 = fe_info.fe_ids_slice(1); for i in 0..n_obs { result[i] = input[i] - alpha[fe0[i]] - beta[fe1[i]]; @@ -623,14 +977,16 @@ let mut in_out = vec![0.0; fe_info.n_coef_total]; for q in 0..fe_info.n_fe { let start = fe_info.coef_start[q]; - let fe_q = &fe_info.fe_ids[q]; + let fe_offset = q * n_obs; if fe_info.is_unweighted { for i in 0..n_obs { - in_out[start + fe_q[i]] += input[i] - mu[i]; + let g = fe_info.fe_ids[fe_offset + i]; + in_out[start + g] += input[i] - mu[i]; } } else { for i in 0..n_obs { - in_out[start + fe_q[i]] += (input[i] - mu[i]) * fe_info.weights[i]; + let g = fe_info.fe_ids[fe_offset + i]; + in_out[start + g] += (input[i] - mu[i]) * fe_info.weights[i]; } } } @@ -641,9 +997,10 @@ let add_coef_to_mu = |coef: &[f64], mu: &mut [f64]| {
for q in 0..fe_info.n_fe { let start = fe_info.coef_start[q]; - let fe_q = &fe_info.fe_ids[q]; + let fe_offset = q * n_obs; for i in 0..n_obs { - mu[i] += coef[start + fe_q[i]]; + let g = fe_info.fe_ids[fe_offset + i]; + mu[i] += coef[start + g]; } } }; @@ -652,6 +1009,7 @@ let mut coef = vec![0.0; n_coef]; let in_out_phase1 = compute_in_out_from_mu(&mu); + let t1 = std::time::Instant::now(); let (iter1, converged1) = run_qfe_acceleration( fe_info, &in_out_phase1, @@ -660,8 +1018,15 @@ config.iter_warmup, input, ); + let phase1_time = t1.elapsed(); total_iter += iter1; + // Debug: print iteration counts for 3+ FE case + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + eprintln!("[demean_single] Phase 1 (warmup): {} iters, converged={}, time={:.2}ms", + iter1, converged1, phase1_time.as_secs_f64() * 1000.0); + } + // Add Phase 1 coefficients to mu add_coef_to_mu(&coef, &mut mu); @@ -676,20 +1041,37 @@ // Extract only the first 2 FE portions of in_out let in_out_2fe: Vec<f64> = in_out_phase2[..n0 + n1].to_vec(); + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + let in_out_norm: f64 = in_out_2fe.iter().map(|x| x * x).sum(); + eprintln!("[demean_single] Phase 2: in_out_2fe norm^2={:.6e}, n0={}, n1={}", + in_out_norm, n0, n1); + } + + // Compute effective input for SSR: input - mu (accounts for Phase 1) + let effective_input: Vec<f64> = (0..n_obs).map(|i| input[i] - mu[i]).collect(); + let iter_max_2fe = config.maxiter / 2; - let (iter2, _) = run_2fe_acceleration( + let t2 = std::time::Instant::now(); + let (iter2, conv2) = run_2fe_acceleration( fe_info, &in_out_2fe, &mut alpha, &mut beta, config, iter_max_2fe, + &effective_input, ); + let phase2_time = t2.elapsed(); total_iter += iter2; + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + eprintln!("[demean_single] Phase 2 (2-FE): {} iters, converged={}, time={:.2}ms", + iter2, conv2, phase2_time.as_secs_f64() * 1000.0); + } + // Add Phase 2's alpha/beta
to mu (only FE0 and FE1) - let fe0 = &fe_info.fe_ids[0]; - let fe1 = &fe_info.fe_ids[1]; + let fe0 = fe_info.fe_ids_slice(0); + let fe1 = fe_info.fe_ids_slice(1); for i in 0..n_obs { mu[i] += alpha[fe0[i]] + beta[fe1[i]]; } @@ -702,7 +1084,8 @@ pub fn demean_single( // Start with fresh coefficients coef.fill(0.0); - let (iter3, _) = run_qfe_acceleration( + let t3 = std::time::Instant::now(); + let (iter3, conv3) = run_qfe_acceleration( fe_info, &in_out_phase3, &mut coef, @@ -710,8 +1093,14 @@ remaining, input, ); + let phase3_time = t3.elapsed(); total_iter += iter3; + if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { + eprintln!("[demean_single] Phase 3 (re-acc): {} iters, converged={}, time={:.2}ms", + iter3, conv3, phase3_time.as_secs_f64() * 1000.0); + } + // Add Phase 3 coefficients to mu add_coef_to_mu(&coef, &mut mu); } diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs index c1e17b6a1..72bf6f542 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean_accelerated/mod.rs @@ -76,20 +76,21 @@ fn demean_coef_space( let not_converged = Arc::new(AtomicUsize::new(0)); let mut res = Array2::<f64>::zeros((n_samples, n_features)); + // Create FEInfo once and share across all columns (it only depends on FE structure) + let fe_info = FEInfo::new( + n_samples, + n_factors, + group_ids, + n_groups_per_factor, + sample_weights, + ); + res.axis_iter_mut(ndarray::Axis(1)) .into_par_iter() .enumerate() .for_each(|(k, mut col)| { let xk: Vec<f64> = (0..n_samples).map(|i| x[[i, k]]).collect(); - let fe_info = FEInfo::new( - n_samples, - n_factors, - group_ids, - n_groups_per_factor, - sample_weights, - ); - let (result, _iter, converged) = demean_single(&fe_info, &xk, config); if !converged { From 2ab945d8cd236a856310af228e6a2edb2ad9018e Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Sat, 27 Dec 2025 13:25:11 +0100 Subject: [PATCH 03/24] Refactor demean_accelerated into modular trait-based architecture Restructure the Rust
demeaning code for clarity and maintainability: - Introduce Projector trait for FE-specific projection strategies - Introduce Demeaner trait for high-level solver strategies - Unified DemeanBuffers struct for scratch space management - Replace unsafe pointer code with safe iterator-based implementations - Move related functions into appropriate impl blocks --- .cargo/config.toml | 7 - benchmarks/bench_demean_r.R | 8 +- benchmarks/bench_native_comparison.py | 9 + src/demean_accelerated/accelerator.rs | 307 +++++++ src/demean_accelerated/coef_space.rs | 1174 ------------------------- src/demean_accelerated/demeaner.rs | 264 ++++++ src/demean_accelerated/mod.rs | 141 +-- src/demean_accelerated/projection.rs | 347 ++++++++ src/demean_accelerated/types.rs | 447 ++++++++++ 9 files changed, 1465 insertions(+), 1239 deletions(-) delete mode 100644 .cargo/config.toml create mode 100644 src/demean_accelerated/accelerator.rs delete mode 100644 src/demean_accelerated/coef_space.rs create mode 100644 src/demean_accelerated/demeaner.rs create mode 100644 src/demean_accelerated/projection.rs create mode 100644 src/demean_accelerated/types.rs diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index f5833703c..000000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,7 +0,0 @@ -[target.aarch64-apple-darwin] -rustflags = [ - "-C", "target-cpu=native", - "-C", "target-feature=+neon,+fp-armv8,+aes,+sha2", - "-C", "llvm-args=-enable-unsafe-fp-math", - "-C", "llvm-args=-fast-isel=false", -] diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R index fb894078d..66bdc342a 100644 --- a/benchmarks/bench_demean_r.R +++ b/benchmarks/bench_demean_r.R @@ -9,8 +9,8 @@ n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L dgp_type <- if (length(args) >= 2) args[2] else "difficult" n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L -# Use all available threads for fair comparison (pyfixest also uses all threads) -setFixest_nthreads(0) # 0 
= use all available +# Use 2 threads to match fixest_benchmarks settings +setFixest_nthreads(2) # Generate data matching Python benchmark DGP set.seed(42) @@ -52,7 +52,7 @@ if (n_fe == 2) { } # Warm up -invisible(feols(fml, data = df)) +invisible(feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L)) # Benchmark n_runs <- 5L @@ -60,7 +60,7 @@ times <- numeric(n_runs) for (i in 1:n_runs) { start <- Sys.time() - fit <- feols(fml, data = df) + fit <- feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L) end <- Sys.time() times[i] <- as.numeric(end - start, units = "secs") * 1000 # ms } diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py index 1af0ee2ab..f45ffd08f 100644 --- a/benchmarks/bench_native_comparison.py +++ b/benchmarks/bench_native_comparison.py @@ -8,6 +8,11 @@ from __future__ import annotations +import os + +# Set thread count for Rayon (pyfixest) BEFORE importing pyfixest +os.environ["RAYON_NUM_THREADS"] = "2" + import json import subprocess import time @@ -129,6 +134,10 @@ def main(): (100_000, "difficult", 2), (100_000, "simple", 3), (100_000, "difficult", 3), + (1_000_000, "simple", 2), + (1_000_000, "difficult", 2), + (1_000_000, "simple", 3), + (1_000_000, "difficult", 3), ] results = [] diff --git a/src/demean_accelerated/accelerator.rs b/src/demean_accelerated/accelerator.rs new file mode 100644 index 000000000..9733e6c01 --- /dev/null +++ b/src/demean_accelerated/accelerator.rs @@ -0,0 +1,307 @@ +//! Acceleration strategies for fixed effects demeaning. +//! +//! This module provides the [`Accelerator`] trait for iteration acceleration, +//! with the default implementation [`IronsTuckGrand`] matching fixest's algorithm. 
+ +use crate::demean_accelerated::projection::Projector; +use crate::demean_accelerated::types::FixestConfig; + +// ============================================================================= +// Accelerator Trait +// ============================================================================= + +/// An acceleration strategy for iterative demeaning. +/// +/// Accelerators take a [`Projector`] and repeatedly apply it until convergence, +/// using various techniques to speed up convergence. +/// +/// # Associated Types +/// +/// Each accelerator has its own buffer type, as different strategies require +/// different working memory (e.g., Irons-Tuck needs snapshots for extrapolation). +pub trait Accelerator { + /// Working buffers needed by this acceleration strategy. + type Buffers; + + /// Create buffers for the given coefficient count. + fn create_buffers(n_coef: usize) -> Self::Buffers; + + /// Check if two scalar values have converged within tolerance. + /// + /// Uses both absolute and relative tolerance: converged if + /// `|a - b| <= tol` OR `|a - b| <= tol * (0.1 + |a|)`. + /// + /// The `0.1` denominator offset prevents division by zero and provides + /// a smooth transition between absolute tolerance (when |a| << 0.1) and + /// relative tolerance (when |a| >> 0.1). This matches fixest's convergence check. + /// + /// # Implementation Note + /// + /// The relative tolerance check `|a - b| / (0.1 + |a|) <= tol` is rewritten + /// as `|a - b| <= tol * (0.1 + |a|)` to avoid division, improving performance + /// and SIMD-friendliness. 
+ #[inline] + fn converged(a: f64, b: f64, tol: f64) -> bool { + // 0.1 offset: ensures numerical stability and smooth absolute/relative transition + const RELATIVE_TOL_OFFSET: f64 = 0.1; + let diff = (a - b).abs(); + // Absolute tolerance check (faster, handles small values) + // OR relative tolerance check (multiplication form, avoids division) + (diff <= tol) || (diff <= tol * (RELATIVE_TOL_OFFSET + a.abs())) + } + + /// Check if coefficient arrays have NOT converged (should keep iterating). + /// + /// Returns `true` if ANY pair of coefficients differs by more than tolerance. + /// Uses early-exit: returns as soon as any non-converged pair is found. + #[inline] + fn should_continue(coef_old: &[f64], coef_new: &[f64], tol: f64) -> bool { + coef_old + .iter() + .zip(coef_new.iter()) + .any(|(&a, &b)| !Self::converged(a, b, tol)) + } + + /// Run the acceleration loop to convergence. + /// + /// # Arguments + /// + /// * `projector` - The projection operation to accelerate + /// * `coef` - Initial coefficients (modified in place with final result) + /// * `buffers` - Working buffers for the acceleration + /// * `config` - Algorithm configuration (tolerance, etc.) + /// * `max_iter` - Maximum iterations before giving up + /// + /// # Returns + /// + /// Tuple of (iterations_used, converged_flag) + fn run( + projector: &mut P, + coef: &mut [f64], + buffers: &mut Self::Buffers, + config: &FixestConfig, + max_iter: usize, + ) -> (usize, bool); +} + +// ============================================================================= +// IronsTuckGrand Accelerator +// ============================================================================= + +/// Irons-Tuck acceleration with Grand acceleration. +/// +/// This is the default acceleration strategy, matching fixest's implementation. +/// It combines two techniques: +/// +/// 1. 
**Irons-Tuck**: After computing G(x) and G(G(x)), extrapolates to estimate +/// the fixed point directly using the formula from Irons & Tuck (1969). +/// +/// 2. **Grand acceleration**: Every `iter_grand_acc` iterations, applies Irons-Tuck +/// at a coarser level to accelerate long-range convergence. +/// +/// Additionally, SSR (sum of squared residuals) is checked every 40 iterations +/// as a secondary convergence criterion. The interval of 40 balances overhead +/// (SSR computation is O(n)) against catching convergence that coefficient +/// checks might miss. +pub struct IronsTuckGrand; + +/// Interval for SSR-based convergence checks (every N iterations). +/// Matches fixest's check frequency for secondary convergence criterion. +const SSR_CHECK_INTERVAL: usize = 40; + +/// Buffers for Irons-Tuck + Grand acceleration. +pub struct IronsTuckGrandBuffers { + /// G(x): Result of one projection step. + pub gx: Vec, + /// G(G(x)): Result of two projection steps. + pub ggx: Vec, + /// Temporary buffer for post-acceleration projection. + pub temp: Vec, + /// Grand acceleration: y snapshot. + pub y: Vec, + /// Grand acceleration: G(y) snapshot. + pub gy: Vec, + /// Grand acceleration: G(G(y)) snapshot. + pub ggy: Vec, +} + +impl IronsTuckGrand { + /// Apply Irons-Tuck acceleration to speed up convergence. + /// + /// Given three successive iterates x, G(x), G(G(x)), computes an accelerated + /// update that often converges faster than simple iteration. + /// + /// Returns `true` if already converged (denominator is zero), `false` otherwise. 
+ #[inline(always)] + fn accelerate(x: &mut [f64], gx: &[f64], ggx: &[f64]) -> bool { + let (vprod, ssq) = x + .iter() + .zip(gx.iter()) + .zip(ggx.iter()) + .map(|((&x_i, &gx_i), &ggx_i)| { + let delta_gx = ggx_i - gx_i; + let delta2_x = delta_gx - gx_i + x_i; + (delta_gx * delta2_x, delta2_x * delta2_x) + }) + .fold((0.0, 0.0), |(vp, sq), (dvp, dsq)| (vp + dvp, sq + dsq)); + + if ssq == 0.0 { + return true; + } + + let coef = vprod / ssq; + x.iter_mut() + .zip(gx.iter()) + .zip(ggx.iter()) + .for_each(|((x_i, &gx_i), &ggx_i)| { + *x_i = ggx_i - coef * (ggx_i - gx_i); + }); + + false + } +} + +impl Accelerator for IronsTuckGrand { + type Buffers = IronsTuckGrandBuffers; + + #[inline] + fn create_buffers(n_coef: usize) -> Self::Buffers { + IronsTuckGrandBuffers { + gx: vec![0.0; n_coef], + ggx: vec![0.0; n_coef], + temp: vec![0.0; n_coef], + y: vec![0.0; n_coef], + gy: vec![0.0; n_coef], + ggy: vec![0.0; n_coef], + } + } + + fn run( + projector: &mut P, + coef: &mut [f64], + buffers: &mut Self::Buffers, + config: &FixestConfig, + max_iter: usize, + ) -> (usize, bool) { + let conv_len = projector.convergence_len(); + + // Initial projection + projector.project(coef, &mut buffers.gx); + + let mut keep_going = + Self::should_continue(&coef[..conv_len], &buffers.gx[..conv_len], config.tol); + let mut iter = 0; + let mut grand_counter = 0usize; + let mut ssr = 0.0; + + while keep_going && iter < max_iter { + iter += 1; + + // Double projection for Irons-Tuck: G(G(x)) + projector.project(&buffers.gx, &mut buffers.ggx); + + // Irons-Tuck acceleration + if Self::accelerate( + &mut coef[..conv_len], + &buffers.gx[..conv_len], + &buffers.ggx[..conv_len], + ) { + break; + } + + // Post-acceleration projection (after warmup) + if iter >= config.iter_proj_after_acc { + buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); + projector.project(&buffers.temp, coef); + } + + // Update gx for convergence check + projector.project(coef, &mut buffers.gx); + keep_going = + 
Self::should_continue(&coef[..conv_len], &buffers.gx[..conv_len], config.tol); + + // Grand acceleration (every iter_grand_acc iterations) + if iter % config.iter_grand_acc == 0 { + grand_counter += 1; + match grand_counter { + 1 => { + buffers.y[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); + } + 2 => { + buffers.gy[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); + } + _ => { + buffers.ggy[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); + if Self::accelerate( + &mut buffers.y[..conv_len], + &buffers.gy[..conv_len], + &buffers.ggy[..conv_len], + ) { + break; + } + projector.project(&buffers.y, &mut buffers.gx); + grand_counter = 0; + } + } + } + + // SSR convergence check (every SSR_CHECK_INTERVAL iterations) + if iter % SSR_CHECK_INTERVAL == 0 { + let ssr_old = ssr; + ssr = projector.compute_ssr(&buffers.gx); + + if iter > SSR_CHECK_INTERVAL && Self::converged(ssr_old, ssr, config.tol) { + keep_going = false; + break; + } + } + } + + // Copy final result + coef.copy_from_slice(&buffers.gx); + (iter, !keep_going) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::demean_accelerated::projection::TwoFEProjector; + use crate::demean_accelerated::types::DemeanContext; + use ndarray::{Array1, Array2}; + + /// Create a test problem with 2 fixed effects + fn create_test_problem(n_obs: usize) -> (DemeanContext, Vec) { + let n_fe = 2; + let mut flist = Array2::::zeros((n_obs, n_fe)); + for i in 0..n_obs { + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; + } + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + (ctx, input) + } + + #[test] + fn test_irons_tuck_grand_convergence() { + let (ctx, input) = create_test_problem(100); + let config = FixestConfig::default(); + + let n0 = ctx.index.n_groups[0]; + let n1 = ctx.index.n_groups[1]; + let n_coef = n0 + n1; + + let in_out = ctx.scatter_to_coefficients(&input); + let mut 
coef = vec![0.0; n_coef]; + let mut buffers = IronsTuckGrand::create_buffers(n_coef); + let mut projector = TwoFEProjector::new(&ctx, &in_out, &input); + + let (iter, converged) = + IronsTuckGrand::run(&mut projector, &mut coef, &mut buffers, &config, config.maxiter); + + assert!(converged, "IronsTuckGrand should converge"); + assert!(iter < 100, "Should converge in less than 100 iterations"); + } +} diff --git a/src/demean_accelerated/coef_space.rs b/src/demean_accelerated/coef_space.rs deleted file mode 100644 index f6b90e956..000000000 --- a/src/demean_accelerated/coef_space.rs +++ /dev/null @@ -1,1174 +0,0 @@ -//! Coefficient-space demeaning matching fixest's algorithm exactly. -//! -//! This is a direct port of fixest's demeaning.cpp, using coefficient-space -//! iteration rather than residual-space iteration. - -/// Pre-computed FE information for coefficient-space iteration. -/// Uses flat memory layout for better cache performance. -pub struct FEInfo { - pub n_obs: usize, - pub n_fe: usize, - /// Group IDs flattened: fe_ids[q * n_obs + i] = group ID for observation i in FE q - /// This eliminates pointer indirection compared to Vec> - pub fe_ids: Vec, - /// Number of groups per FE - pub n_groups: Vec, - /// Starting index of each FE's coefficients in coef array - pub coef_start: Vec, - /// Total number of coefficients - pub n_coef_total: usize, - /// Sum of weights per group, flattened: access via coef_start[q] + g - pub sum_weights: Vec, - /// Sample weights - pub weights: Vec, - /// Whether all weights are 1.0 (optimization) - pub is_unweighted: bool, -} - -impl FEInfo { - pub fn new( - n_obs: usize, - n_fe: usize, - group_ids: &[usize], // flat [n_obs * n_fe], row-major - n_groups: &[usize], - weights: &[f64], - ) -> Self { - // Check if unweighted - let is_unweighted = weights.iter().all(|&w| (w - 1.0).abs() < 1e-10); - - // Coefficient starting indices (computed first, used for sum_weights layout) - let mut coef_start = vec![0usize; n_fe]; - for q in 
1..n_fe { - coef_start[q] = coef_start[q - 1] + n_groups[q - 1]; - } - let n_coef_total: usize = n_groups.iter().sum(); - - // Flatten fe_ids: fe_ids[q * n_obs + i] = group_ids[i * n_fe + q] - // This converts from row-major input to column-major (per-FE) layout - let mut fe_ids = vec![0usize; n_fe * n_obs]; - for i in 0..n_obs { - for q in 0..n_fe { - fe_ids[q * n_obs + i] = group_ids[i * n_fe + q]; - } - } - - // Sum of weights per group, flattened with same layout as coef - let mut sum_weights = vec![0.0; n_coef_total]; - for q in 0..n_fe { - let start = coef_start[q]; - let fe_offset = q * n_obs; - for i in 0..n_obs { - let g = fe_ids[fe_offset + i]; - sum_weights[start + g] += weights[i]; - } - } - // Avoid division by zero - for s in &mut sum_weights { - if *s == 0.0 { - *s = 1.0; - } - } - - Self { - n_obs, - n_fe, - fe_ids, - n_groups: n_groups.to_vec(), - coef_start, - n_coef_total, - sum_weights, - weights: weights.to_vec(), - is_unweighted, - } - } - - /// Get slice of FE group IDs for FE q: &[group_id for obs 0..n_obs] - #[inline(always)] - pub fn fe_ids_slice(&self, q: usize) -> &[usize] { - let start = q * self.n_obs; - &self.fe_ids[start..start + self.n_obs] - } - - /// Get slice of sum_weights for FE q: &[sum_weight for group 0..n_groups[q]] - #[inline(always)] - pub fn sum_weights_slice(&self, q: usize) -> &[f64] { - let start = self.coef_start[q]; - let end = if q + 1 < self.n_fe { - self.coef_start[q + 1] - } else { - self.n_coef_total - }; - &self.sum_weights[start..end] - } - - /// Compute sum of weighted (input - output) for each coefficient. - /// This is computed ONCE at the start and never changes. 
- pub fn compute_in_out(&self, input: &[f64], output: &[f64]) -> Vec { - let mut in_out = vec![0.0; self.n_coef_total]; - let n_obs = self.n_obs; - - if self.is_unweighted { - for q in 0..self.n_fe { - let start = self.coef_start[q]; - let fe_offset = q * n_obs; - for i in 0..n_obs { - let g = self.fe_ids[fe_offset + i]; - in_out[start + g] += input[i] - output[i]; - } - } - } else { - for q in 0..self.n_fe { - let start = self.coef_start[q]; - let fe_offset = q * n_obs; - for i in 0..n_obs { - let g = self.fe_ids[fe_offset + i]; - in_out[start + g] += (input[i] - output[i]) * self.weights[i]; - } - } - } - - in_out - } - - /// Compute output from coefficients: output[i] = input[i] - sum_q(coef[fe_q[i]]) - pub fn compute_output(&self, coef: &[f64], input: &[f64], output: &mut [f64]) { - output.copy_from_slice(input); - let n_obs = self.n_obs; - for q in 0..self.n_fe { - let start = self.coef_start[q]; - let fe_offset = q * n_obs; - for i in 0..n_obs { - let g = self.fe_ids[fe_offset + i]; - output[i] -= coef[start + g]; - } - } - } -} - -/// Fixest's continue_crit: returns true if should CONTINUE (not converged). -#[inline] -fn continue_crit(a: f64, b: f64, diff_max: f64) -> bool { - let diff = (a - b).abs(); - (diff > diff_max) && (diff / (0.1 + a.abs()) > diff_max) -} - -/// Check if should continue on coefficient slice. -fn should_continue(x: &[f64], gx: &[f64], tol: f64) -> bool { - for i in 0..x.len() { - if continue_crit(x[i], gx[i], tol) { - return true; - } - } - false -} - -/// Fixest's stopping_crit for SSR. 
-#[inline] -fn stopping_crit(a: f64, b: f64, diff_max: f64) -> bool { - let diff = (a - b).abs(); - (diff < diff_max) || (diff / (0.1 + a.abs()) < diff_max) -} - -/// Irons-Tuck acceleration: X = GGX - coef * (GGX - GX) -#[inline(always)] -fn irons_tuck_update(x: &mut [f64], gx: &[f64], ggx: &[f64]) -> bool { - let n = x.len(); - let mut vprod = 0.0; - let mut ssq = 0.0; - - // SAFETY: x, gx, ggx all have the same length n - for i in 0..n { - unsafe { - let gx_i = *gx.get_unchecked(i); - let ggx_i = *ggx.get_unchecked(i); - let x_i = *x.get_unchecked(i); - let delta_gx = ggx_i - gx_i; - let delta2_x = delta_gx - gx_i + x_i; - vprod += delta_gx * delta2_x; - ssq += delta2_x * delta2_x; - } - } - - if ssq == 0.0 { - return true; - } - - let coef = vprod / ssq; - for i in 0..n { - unsafe { - let gx_i = *gx.get_unchecked(i); - let ggx_i = *ggx.get_unchecked(i); - *x.get_unchecked_mut(i) = ggx_i - coef * (ggx_i - gx_i); - } - } - - false -} - -/// Configuration matching fixest defaults. -#[derive(Clone, Copy)] -pub struct FixestConfig { - pub tol: f64, - pub maxiter: usize, - pub iter_warmup: usize, - pub iter_proj_after_acc: usize, - pub iter_grand_acc: usize, -} - -impl Default for FixestConfig { - fn default() -> Self { - Self { - tol: 1e-6, // Match fixest's default - maxiter: 100_000, - iter_warmup: 15, - iter_proj_after_acc: 40, - iter_grand_acc: 4, - } - } -} - -// ============================================================================= -// 2-FE Coefficient-Space Implementation (matching compute_fe_coef_2) -// ============================================================================= - -/// 2-FE projection: Given alpha coefficients, compute new alpha via beta. -/// This matches fixest's compute_fe_coef_2 which avoids N-length intermediates. 
-#[inline(always)] -fn project_2fe( - fe_info: &FEInfo, - in_out: &[f64], - alpha_in: &[f64], - alpha_out: &mut [f64], - beta: &mut [f64], -) { - let n0 = fe_info.n_groups[0]; - let n1 = fe_info.n_groups[1]; - let n_obs = fe_info.n_obs; - let fe0 = fe_info.fe_ids_slice(0); - let fe1 = fe_info.fe_ids_slice(1); - let sw0 = fe_info.sum_weights_slice(0); - let sw1 = fe_info.sum_weights_slice(1); - let weights = &fe_info.weights; - - // Step 1: Compute beta from alpha_in - // beta[g] = (in_out[n0+g] - sum_{i:fe1[i]=g} alpha[fe0[i]] * w[i]) / sw1[g] - beta[..n1].copy_from_slice(&in_out[n0..n0 + n1]); - - // SAFETY: fe0[i] < n0 (alpha_in.len()), fe1[i] < n1 (beta.len()) by construction - if fe_info.is_unweighted { - for i in 0..n_obs { - unsafe { - let g1 = *fe1.get_unchecked(i); - let g0 = *fe0.get_unchecked(i); - *beta.get_unchecked_mut(g1) -= *alpha_in.get_unchecked(g0); - } - } - } else { - for i in 0..n_obs { - unsafe { - let g1 = *fe1.get_unchecked(i); - let g0 = *fe0.get_unchecked(i); - *beta.get_unchecked_mut(g1) -= *alpha_in.get_unchecked(g0) * *weights.get_unchecked(i); - } - } - } - - for g in 0..n1 { - unsafe { *beta.get_unchecked_mut(g) /= *sw1.get_unchecked(g) }; - } - - // Step 2: Compute alpha_out from beta - // alpha[g] = (in_out[g] - sum_{i:fe0[i]=g} beta[fe1[i]] * w[i]) / sw0[g] - alpha_out[..n0].copy_from_slice(&in_out[..n0]); - - // SAFETY: fe0[i] < n0 (alpha_out.len()), fe1[i] < n1 (beta.len()) by construction - if fe_info.is_unweighted { - for i in 0..n_obs { - unsafe { - let g0 = *fe0.get_unchecked(i); - let g1 = *fe1.get_unchecked(i); - *alpha_out.get_unchecked_mut(g0) -= *beta.get_unchecked(g1); - } - } - } else { - for i in 0..n_obs { - unsafe { - let g0 = *fe0.get_unchecked(i); - let g1 = *fe1.get_unchecked(i); - *alpha_out.get_unchecked_mut(g0) -= *beta.get_unchecked(g1) * *weights.get_unchecked(i); - } - } - } - - for g in 0..n0 { - unsafe { *alpha_out.get_unchecked_mut(g) /= *sw0.get_unchecked(g) }; - } -} - -/// Compute beta from alpha 
(half of project_2fe, for SSR computation). -/// This matches fixest's compute_fe_coef_2_internal with step_2=false. -#[inline(always)] -fn compute_beta_from_alpha( - fe_info: &FEInfo, - in_out: &[f64], - alpha: &[f64], - beta: &mut [f64], -) { - let n1 = fe_info.n_groups[1]; - let n_obs = fe_info.n_obs; - let n0 = fe_info.n_groups[0]; - let fe0 = fe_info.fe_ids_slice(0); - let fe1 = fe_info.fe_ids_slice(1); - let sw1 = fe_info.sum_weights_slice(1); - let weights = &fe_info.weights; - - // beta[g] = (in_out[n0+g] - sum_{i:fe1[i]=g} alpha[fe0[i]] * w[i]) / sw1[g] - beta[..n1].copy_from_slice(&in_out[n0..n0 + n1]); - - if fe_info.is_unweighted { - for i in 0..n_obs { - unsafe { - let g1 = *fe1.get_unchecked(i); - let g0 = *fe0.get_unchecked(i); - *beta.get_unchecked_mut(g1) -= *alpha.get_unchecked(g0); - } - } - } else { - for i in 0..n_obs { - unsafe { - let g1 = *fe1.get_unchecked(i); - let g0 = *fe0.get_unchecked(i); - *beta.get_unchecked_mut(g1) -= *alpha.get_unchecked(g0) * *weights.get_unchecked(i); - } - } - } - - for g in 0..n1 { - unsafe { *beta.get_unchecked_mut(g) /= *sw1.get_unchecked(g) }; - } -} - -/// Run 2-FE acceleration loop (demean_acc_gnl with two_fe=true). 
-fn run_2fe_acceleration( - fe_info: &FEInfo, - in_out: &[f64], - alpha: &mut [f64], // Current coefficients, modified in place - beta: &mut [f64], // Temporary buffer - config: &FixestConfig, - max_iter: usize, - input: &[f64], // Original input for SSR stopping criterion -) -> (usize, bool) { - let n0 = fe_info.n_groups[0]; - let n1 = fe_info.n_groups[1]; - let n_obs = fe_info.n_obs; - - // Working buffers - let mut gx = vec![0.0; n0]; - let mut ggx = vec![0.0; n0]; - let mut temp = vec![0.0; n0]; - let mut beta_tmp = vec![0.0; n1]; - - // Grand acceleration buffers - let mut y = vec![0.0; n0]; - let mut gy = vec![0.0; n0]; - let mut ggy = vec![0.0; n0]; - let mut grand_counter = 0usize; - - // SSR tracking - let mut ssr = 0.0; - let fe0 = fe_info.fe_ids_slice(0); - let fe1 = fe_info.fe_ids_slice(1); - - // First iteration: G(alpha) - project_2fe(fe_info, in_out, alpha, &mut gx, beta); - - let mut keep_going = should_continue(alpha, &gx, config.tol); - let mut iter = 0; - - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - let alpha_norm: f64 = alpha.iter().map(|x| x * x).sum(); - let gx_norm: f64 = gx.iter().map(|x| x * x).sum(); - let diff_norm: f64 = alpha.iter().zip(gx.iter()).map(|(a, g)| (a - g).powi(2)).sum(); - eprintln!("[run_2fe_acc] Initial: alpha_norm={:.6e}, gx_norm={:.6e}, diff_norm={:.6e}, keep_going={}", - alpha_norm, gx_norm, diff_norm, keep_going); - } - - while keep_going && iter < max_iter { - iter += 1; - - // G(G(alpha)) - project_2fe(fe_info, in_out, &gx, &mut ggx, &mut beta_tmp); - - // Irons-Tuck - if irons_tuck_update(alpha, &gx, &ggx) { - break; - } - - // Project after acceleration - if iter >= config.iter_proj_after_acc { - temp.copy_from_slice(alpha); - project_2fe(fe_info, in_out, &temp, alpha, &mut beta_tmp); - } - - // G(alpha) - project_2fe(fe_info, in_out, alpha, &mut gx, beta); - - // Convergence check - keep_going = should_continue(alpha, &gx, config.tol); - - // Grand acceleration - if iter % config.iter_grand_acc == 0 { - 
grand_counter += 1; - match grand_counter { - 1 => y.copy_from_slice(&gx), - 2 => gy.copy_from_slice(&gx), - _ => { - ggy.copy_from_slice(&gx); - if irons_tuck_update(&mut y, &gy, &ggy) { - break; - } - project_2fe(fe_info, in_out, &y, &mut gx, beta); - grand_counter = 0; - } - } - } - - // SSR stopping criterion every 40 iterations (matching fixest) - if iter % 40 == 0 { - let ssr_old = ssr; - - // Compute beta from gx (current alpha) for SSR computation - // Only need to compute beta, not full projection (matches fixest) - compute_beta_from_alpha(fe_info, in_out, &gx, &mut beta_tmp); - - // Compute SSR = sum((input - alpha[fe0] - beta[fe1])^2) - ssr = 0.0; - for i in 0..n_obs { - let resid = input[i] - gx[fe0[i]] - beta_tmp[fe1[i]]; - ssr += resid * resid; - } - - if iter > 40 && stopping_crit(ssr_old, ssr, config.tol) { - break; - } - } - } - - (iter, !keep_going) -} - -// ============================================================================= -// General Q-FE Coefficient-Space Implementation (matching compute_fe_gnl) -// ============================================================================= - -/// Q-FE projection: Compute G(coef_in) -> coef_out. -/// Updates FEs in reverse order (Q-1 down to 0) matching fixest. -/// Specialized for 3 FEs (most common case) with loop unrolling. 
-#[inline(always)] -fn project_qfe( - fe_info: &FEInfo, - in_out: &[f64], - coef_in: &[f64], - coef_out: &mut [f64], - sum_other_means: &mut [f64], // N-length buffer -) { - let n_fe = fe_info.n_fe; - let n_obs = fe_info.n_obs; - - // Pre-compute raw pointers for hot loops - let fe_ids_ptr = fe_info.fe_ids.as_ptr(); - let coef_start = &fe_info.coef_start; - let sum_other_ptr = sum_other_means.as_mut_ptr(); - let coef_in_ptr = coef_in.as_ptr(); - let coef_out_ptr = coef_out.as_mut_ptr(); - let weights_ptr = fe_info.weights.as_ptr(); - - // Specialized fast path for 3 FEs (common case) - if n_fe == 3 && fe_info.is_unweighted { - project_qfe_3fe_unweighted( - n_obs, - fe_ids_ptr, - coef_start, - sum_other_ptr, - coef_in_ptr, - coef_out_ptr, - in_out, - &fe_info.n_groups, - &fe_info.sum_weights, - ); - return; - } - - // General case for any number of FEs - project_qfe_general( - fe_info, - in_out, - coef_in, - coef_out, - sum_other_means, - n_fe, - n_obs, - fe_ids_ptr, - coef_start, - sum_other_ptr, - coef_in_ptr, - coef_out_ptr, - weights_ptr, - ); -} - -/// Specialized 3-FE projection for unweighted case. 
-#[inline(always)] -fn project_qfe_3fe_unweighted( - n_obs: usize, - fe_ids_ptr: *const usize, - coef_start: &[usize], - sum_other_ptr: *mut f64, - coef_in_ptr: *const f64, - coef_out_ptr: *mut f64, - in_out: &[f64], - n_groups: &[usize], - sum_weights: &[f64], -) { - let (start_0, start_1, start_2) = (coef_start[0], coef_start[1], coef_start[2]); - let fe_0_ptr = fe_ids_ptr; - let fe_1_ptr = unsafe { fe_ids_ptr.add(n_obs) }; - let fe_2_ptr = unsafe { fe_ids_ptr.add(2 * n_obs) }; - let in_out_ptr = in_out.as_ptr(); - - // === q=2: Process FE 2 (add from FE 0, 1 using coef_in) === - // No need to fill with zeros - we directly assign the sum of FE 0 and FE 1 contributions - // Unrolled loop: process 4 observations at a time - let n_chunks = n_obs / 4; - let remainder = n_obs % 4; - - unsafe { - for chunk in 0..n_chunks { - let base = chunk * 4; - let g0_0 = *fe_0_ptr.add(base); - let g0_1 = *fe_0_ptr.add(base + 1); - let g0_2 = *fe_0_ptr.add(base + 2); - let g0_3 = *fe_0_ptr.add(base + 3); - let g1_0 = *fe_1_ptr.add(base); - let g1_1 = *fe_1_ptr.add(base + 1); - let g1_2 = *fe_1_ptr.add(base + 2); - let g1_3 = *fe_1_ptr.add(base + 3); - - *sum_other_ptr.add(base) = - *coef_in_ptr.add(start_0 + g0_0) + *coef_in_ptr.add(start_1 + g1_0); - *sum_other_ptr.add(base + 1) = - *coef_in_ptr.add(start_0 + g0_1) + *coef_in_ptr.add(start_1 + g1_1); - *sum_other_ptr.add(base + 2) = - *coef_in_ptr.add(start_0 + g0_2) + *coef_in_ptr.add(start_1 + g1_2); - *sum_other_ptr.add(base + 3) = - *coef_in_ptr.add(start_0 + g0_3) + *coef_in_ptr.add(start_1 + g1_3); - } - - for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { - let g0 = *fe_0_ptr.add(i); - let g1 = *fe_1_ptr.add(i); - *sum_other_ptr.add(i) = *coef_in_ptr.add(start_0 + g0) + *coef_in_ptr.add(start_1 + g1); - } - } - - // Compute coef_out for FE 2 - let n_groups_2 = n_groups[2]; - unsafe { - std::ptr::copy_nonoverlapping( - in_out_ptr.add(start_2), - coef_out_ptr.add(start_2), - n_groups_2, - ); - } - - unsafe { - for i in 
0..n_obs { - let g = *fe_2_ptr.add(i); - *coef_out_ptr.add(start_2 + g) -= *sum_other_ptr.add(i); - } - for g in 0..n_groups_2 { - *coef_out_ptr.add(start_2 + g) /= *sum_weights.get_unchecked(start_2 + g); - } - } - - // === q=1: Process FE 1 (add from FE 0 using coef_in, FE 2 using coef_out) === - unsafe { - for chunk in 0..n_chunks { - let base = chunk * 4; - let g0_0 = *fe_0_ptr.add(base); - let g0_1 = *fe_0_ptr.add(base + 1); - let g0_2 = *fe_0_ptr.add(base + 2); - let g0_3 = *fe_0_ptr.add(base + 3); - let g2_0 = *fe_2_ptr.add(base); - let g2_1 = *fe_2_ptr.add(base + 1); - let g2_2 = *fe_2_ptr.add(base + 2); - let g2_3 = *fe_2_ptr.add(base + 3); - - *sum_other_ptr.add(base) = - *coef_in_ptr.add(start_0 + g0_0) + *coef_out_ptr.add(start_2 + g2_0); - *sum_other_ptr.add(base + 1) = - *coef_in_ptr.add(start_0 + g0_1) + *coef_out_ptr.add(start_2 + g2_1); - *sum_other_ptr.add(base + 2) = - *coef_in_ptr.add(start_0 + g0_2) + *coef_out_ptr.add(start_2 + g2_2); - *sum_other_ptr.add(base + 3) = - *coef_in_ptr.add(start_0 + g0_3) + *coef_out_ptr.add(start_2 + g2_3); - } - - for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { - let g0 = *fe_0_ptr.add(i); - let g2 = *fe_2_ptr.add(i); - *sum_other_ptr.add(i) = *coef_in_ptr.add(start_0 + g0) + *coef_out_ptr.add(start_2 + g2); - } - } - - // Compute coef_out for FE 1 - let n_groups_1 = n_groups[1]; - unsafe { - std::ptr::copy_nonoverlapping( - in_out_ptr.add(start_1), - coef_out_ptr.add(start_1), - n_groups_1, - ); - } - - unsafe { - for i in 0..n_obs { - let g = *fe_1_ptr.add(i); - *coef_out_ptr.add(start_1 + g) -= *sum_other_ptr.add(i); - } - for g in 0..n_groups_1 { - *coef_out_ptr.add(start_1 + g) /= *sum_weights.get_unchecked(start_1 + g); - } - } - - // === q=0: Process FE 0 (add from FE 1, 2 using coef_out) === - unsafe { - for chunk in 0..n_chunks { - let base = chunk * 4; - let g1_0 = *fe_1_ptr.add(base); - let g1_1 = *fe_1_ptr.add(base + 1); - let g1_2 = *fe_1_ptr.add(base + 2); - let g1_3 = *fe_1_ptr.add(base + 
3); - let g2_0 = *fe_2_ptr.add(base); - let g2_1 = *fe_2_ptr.add(base + 1); - let g2_2 = *fe_2_ptr.add(base + 2); - let g2_3 = *fe_2_ptr.add(base + 3); - - *sum_other_ptr.add(base) = - *coef_out_ptr.add(start_1 + g1_0) + *coef_out_ptr.add(start_2 + g2_0); - *sum_other_ptr.add(base + 1) = - *coef_out_ptr.add(start_1 + g1_1) + *coef_out_ptr.add(start_2 + g2_1); - *sum_other_ptr.add(base + 2) = - *coef_out_ptr.add(start_1 + g1_2) + *coef_out_ptr.add(start_2 + g2_2); - *sum_other_ptr.add(base + 3) = - *coef_out_ptr.add(start_1 + g1_3) + *coef_out_ptr.add(start_2 + g2_3); - } - - for i in (n_chunks * 4)..(n_chunks * 4 + remainder) { - let g1 = *fe_1_ptr.add(i); - let g2 = *fe_2_ptr.add(i); - *sum_other_ptr.add(i) = - *coef_out_ptr.add(start_1 + g1) + *coef_out_ptr.add(start_2 + g2); - } - } - - // Compute coef_out for FE 0 - let n_groups_0 = n_groups[0]; - unsafe { - std::ptr::copy_nonoverlapping(in_out_ptr.add(start_0), coef_out_ptr.add(start_0), n_groups_0); - } - - unsafe { - for i in 0..n_obs { - let g = *fe_0_ptr.add(i); - *coef_out_ptr.add(start_0 + g) -= *sum_other_ptr.add(i); - } - for g in 0..n_groups_0 { - *coef_out_ptr.add(start_0 + g) /= *sum_weights.get_unchecked(start_0 + g); - } - } -} - -/// General Q-FE projection (any number of FEs, weighted or unweighted). 
-#[inline(always)] -#[allow(clippy::too_many_arguments)] -fn project_qfe_general( - fe_info: &FEInfo, - in_out: &[f64], - _coef_in: &[f64], // Used via coef_in_ptr - _coef_out: &mut [f64], // Used via coef_out_ptr - _sum_other_means: &mut [f64], // Used via sum_other_ptr - n_fe: usize, - n_obs: usize, - fe_ids_ptr: *const usize, - coef_start: &[usize], - sum_other_ptr: *mut f64, - coef_in_ptr: *const f64, - coef_out_ptr: *mut f64, - weights_ptr: *const f64, -) { - let in_out_ptr = in_out.as_ptr(); - - // Process in reverse order (Q-1 down to 0, matching fixest) - for q in (0..n_fe).rev() { - // Step 1: Fill sum_other_means with zeros - unsafe { - std::ptr::write_bytes(sum_other_ptr, 0, n_obs); - } - - // Add contributions from FEs with h < q (use coef_in) - for h in 0..q { - let start_h = coef_start[h]; - let fe_h_ptr = unsafe { fe_ids_ptr.add(h * n_obs) }; - for i in 0..n_obs { - unsafe { - let g = *fe_h_ptr.add(i); - *sum_other_ptr.add(i) += *coef_in_ptr.add(start_h + g); - } - } - } - - // Add contributions from FEs with h > q (use coef_out) - for h in (q + 1)..n_fe { - let start_h = coef_start[h]; - let fe_h_ptr = unsafe { fe_ids_ptr.add(h * n_obs) }; - for i in 0..n_obs { - unsafe { - let g = *fe_h_ptr.add(i); - *sum_other_ptr.add(i) += *coef_out_ptr.add(start_h + g); - } - } - } - - // Step 2: Compute new coefficients for FE q - let start_q = coef_start[q]; - let n_groups_q = fe_info.n_groups[q]; - let fe_q_ptr = unsafe { fe_ids_ptr.add(q * n_obs) }; - let sw_q = fe_info.sum_weights_slice(q); - - // Initialize to in_out - unsafe { - std::ptr::copy_nonoverlapping( - in_out_ptr.add(start_q), - coef_out_ptr.add(start_q), - n_groups_q, - ); - } - - // Subtract weighted other FE contributions - if fe_info.is_unweighted { - for i in 0..n_obs { - unsafe { - let g = *fe_q_ptr.add(i); - *coef_out_ptr.add(start_q + g) -= *sum_other_ptr.add(i); - } - } - } else { - for i in 0..n_obs { - unsafe { - let g = *fe_q_ptr.add(i); - *coef_out_ptr.add(start_q + g) -= - 
*sum_other_ptr.add(i) * *weights_ptr.add(i); - } - } - } - - // Divide by sum of weights - for g in 0..n_groups_q { - unsafe { - *coef_out_ptr.add(start_q + g) /= *sw_q.get_unchecked(g); - } - } - } -} - -/// Run Q-FE acceleration loop (demean_acc_gnl). -#[allow(dead_code)] -fn run_qfe_acceleration( - fe_info: &FEInfo, - in_out: &[f64], - coef: &mut [f64], // Current coefficients, modified in place - config: &FixestConfig, - max_iter: usize, - input: &[f64], // Original input for SSR -) -> (usize, bool) { - let n_coef = fe_info.n_coef_total; - let n_obs = fe_info.n_obs; - - // nb_coef_no_Q: all except last FE (what fixest uses for acceleration) - let nb_coef_no_q = n_coef - fe_info.n_groups[fe_info.n_fe - 1]; - - // Working buffers - let mut gx = vec![0.0; n_coef]; - let mut ggx = vec![0.0; n_coef]; - let mut temp = vec![0.0; n_coef]; - let mut sum_other_means = vec![0.0; n_obs]; - - // Grand acceleration buffers (only nb_coef_no_q needed) - let mut y = vec![0.0; n_coef]; - let mut gy = vec![0.0; n_coef]; - let mut ggy = vec![0.0; n_coef]; - let mut grand_counter = 0usize; - - // SSR buffer - let mut output_buf = vec![0.0; n_obs]; - let mut ssr = 0.0; - - // First iteration: G(coef) - project_qfe(fe_info, in_out, coef, &mut gx, &mut sum_other_means); - - let mut keep_going = should_continue(&coef[..nb_coef_no_q], &gx[..nb_coef_no_q], config.tol); - let mut iter = 0; - - while keep_going && iter < max_iter { - iter += 1; - - // G(G(coef)) - project_qfe(fe_info, in_out, &gx, &mut ggx, &mut sum_other_means); - - // Irons-Tuck on nb_coef_no_q - if irons_tuck_update(&mut coef[..nb_coef_no_q], &gx[..nb_coef_no_q], &ggx[..nb_coef_no_q]) { - break; - } - - // Project after acceleration - if iter >= config.iter_proj_after_acc { - temp.copy_from_slice(coef); - project_qfe(fe_info, in_out, &temp, coef, &mut sum_other_means); - } - - // G(coef) - project_qfe(fe_info, in_out, coef, &mut gx, &mut sum_other_means); - - // Convergence check on nb_coef_no_q - let prev_keep_going = 
keep_going; - keep_going = should_continue(&coef[..nb_coef_no_q], &gx[..nb_coef_no_q], config.tol); - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() && prev_keep_going && !keep_going { - eprintln!("[run_qfe_acc] Coefficient converged at iter {}", iter); - } - - // Grand acceleration on nb_coef_no_q - if iter % config.iter_grand_acc == 0 { - grand_counter += 1; - match grand_counter { - 1 => y[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]), - 2 => gy[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]), - _ => { - ggy[..nb_coef_no_q].copy_from_slice(&gx[..nb_coef_no_q]); - if irons_tuck_update(&mut y[..nb_coef_no_q], &gy[..nb_coef_no_q], &ggy[..nb_coef_no_q]) { - break; - } - project_qfe(fe_info, in_out, &y, &mut gx, &mut sum_other_means); - grand_counter = 0; - } - } - } - - // SSR stopping every 40 iterations - if iter % 40 == 0 { - let ssr_old = ssr; - fe_info.compute_output(&gx, input, &mut output_buf); - ssr = output_buf.iter().map(|&r| r * r).sum(); - - if iter > 40 && stopping_crit(ssr_old, ssr, config.tol) { - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - eprintln!("[run_qfe_acc] SSR converged at iter {}: ssr_old={:.6e}, ssr={:.6e}", - iter, ssr_old, ssr); - } - keep_going = false; // Mark as converged - break; - } - } - } - - // Copy final gx to coef - coef.copy_from_slice(&gx); - - (iter, !keep_going) -} - -// ============================================================================= -// Public API: demean_single matching fixest's demean_single_gnl -// ============================================================================= - -/// Demean a single variable using coefficient-space iteration. -/// Matches fixest's demean_single_gnl exactly. 
-pub fn demean_single( - fe_info: &FEInfo, - input: &[f64], - config: &FixestConfig, -) -> (Vec, usize, bool) { - let n_obs = fe_info.n_obs; - let n_fe = fe_info.n_fe; - - // Output initialized to 0 - let mut output = vec![0.0; n_obs]; - - // Compute initial in_out - let in_out = fe_info.compute_in_out(input, &output); - - if n_fe == 1 { - // Single FE: closed-form solution - let mut result = vec![0.0; n_obs]; - let fe0 = fe_info.fe_ids_slice(0); - let sw0 = fe_info.sum_weights_slice(0); - - // coef[g] = in_out[g] / sw[g] - let coef: Vec = in_out.iter().zip(sw0.iter()).map(|(&io, &sw)| io / sw).collect(); - - // output[i] = input[i] - coef[fe0[i]] - for i in 0..n_obs { - result[i] = input[i] - coef[fe0[i]]; - } - - return (result, 0, true); - } - - if n_fe == 2 { - // 2-FE: Use specialized 2-FE algorithm - let n0 = fe_info.n_groups[0]; - let n1 = fe_info.n_groups[1]; - - let mut alpha = vec![0.0; n0]; - let mut beta = vec![0.0; n1]; - - let (iter, converged) = run_2fe_acceleration( - fe_info, - &in_out, - &mut alpha, - &mut beta, - config, - config.maxiter, - input, - ); - - // Compute output - let mut result = vec![0.0; n_obs]; - let fe0 = fe_info.fe_ids_slice(0); - let fe1 = fe_info.fe_ids_slice(1); - - for i in 0..n_obs { - result[i] = input[i] - alpha[fe0[i]] - beta[fe1[i]]; - } - - return (result, iter, converged); - } - - // 3+ FE: Use fixest's multi-phase strategy - // Key insight: fixest's output stores SUM OF FE COEFFICIENTS, not residual. - // in_out = agg(input - output) = agg(input - sum_of_coefs) = agg(residual) - // We'll use mu to store sum of FE coefs, then convert to residual at the end. - // - // 1. Warmup iterations on all FEs - // 2. 2-FE sub-convergence on first 2 FEs - // 3. 
Re-acceleration on all FEs - - let n_coef = fe_info.n_coef_total; - let n0 = fe_info.n_groups[0]; - let n1 = fe_info.n_groups[1]; - let mut total_iter = 0usize; - - // mu = sum of FE contributions per observation (fixest's "output") - // Starts at 0, accumulates FE coefficients across phases - let mut mu = vec![0.0; n_obs]; - - // Helper to compute in_out = agg(input - mu) per FE group - let compute_in_out_from_mu = |mu: &[f64]| -> Vec { - let mut in_out = vec![0.0; fe_info.n_coef_total]; - for q in 0..fe_info.n_fe { - let start = fe_info.coef_start[q]; - let fe_offset = q * n_obs; - if fe_info.is_unweighted { - for i in 0..n_obs { - let g = fe_info.fe_ids[fe_offset + i]; - in_out[start + g] += input[i] - mu[i]; - } - } else { - for i in 0..n_obs { - let g = fe_info.fe_ids[fe_offset + i]; - in_out[start + g] += (input[i] - mu[i]) * fe_info.weights[i]; - } - } - } - in_out - }; - - // Helper to add coefficients to mu - let add_coef_to_mu = |coef: &[f64], mu: &mut [f64]| { - for q in 0..fe_info.n_fe { - let start = fe_info.coef_start[q]; - let fe_offset = q * n_obs; - for i in 0..n_obs { - let g = fe_info.fe_ids[fe_offset + i]; - mu[i] += coef[start + g]; - } - } - }; - - // Phase 1: Warmup with all FEs - let mut coef = vec![0.0; n_coef]; - let in_out_phase1 = compute_in_out_from_mu(&mu); - - let t1 = std::time::Instant::now(); - let (iter1, converged1) = run_qfe_acceleration( - fe_info, - &in_out_phase1, - &mut coef, - config, - config.iter_warmup, - input, - ); - let phase1_time = t1.elapsed(); - total_iter += iter1; - - // Debug: print iteration counts for 3+ FE case - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - eprintln!("[demean_single] Phase 1 (warmup): {} iters, converged={}, time={:.2}ms", - iter1, converged1, phase1_time.as_secs_f64() * 1000.0); - } - - // Add Phase 1 coefficients to mu - add_coef_to_mu(&coef, &mut mu); - - if !converged1 { - // Phase 2: 2-FE sub-convergence on first 2 FEs - let in_out_phase2 = compute_in_out_from_mu(&mu); - - // 
Start with fresh alpha, beta - let mut alpha = vec![0.0; n0]; - let mut beta = vec![0.0; n1]; - - // Extract only the first 2 FE portions of in_out - let in_out_2fe: Vec = in_out_phase2[..n0 + n1].to_vec(); - - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - let in_out_norm: f64 = in_out_2fe.iter().map(|x| x * x).sum(); - eprintln!("[demean_single] Phase 2: in_out_2fe norm^2={:.6e}, n0={}, n1={}", - in_out_norm, n0, n1); - } - - // Compute effective input for SSR: input - mu (accounts for Phase 1) - let effective_input: Vec = (0..n_obs).map(|i| input[i] - mu[i]).collect(); - - let iter_max_2fe = config.maxiter / 2; - let t2 = std::time::Instant::now(); - let (iter2, conv2) = run_2fe_acceleration( - fe_info, - &in_out_2fe, - &mut alpha, - &mut beta, - config, - iter_max_2fe, - &effective_input, - ); - let phase2_time = t2.elapsed(); - total_iter += iter2; - - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - eprintln!("[demean_single] Phase 2 (2-FE): {} iters, converged={}, time={:.2}ms", - iter2, conv2, phase2_time.as_secs_f64() * 1000.0); - } - - // Add Phase 2's alpha/beta to mu (only FE0 and FE1) - let fe0 = fe_info.fe_ids_slice(0); - let fe1 = fe_info.fe_ids_slice(1); - for i in 0..n_obs { - mu[i] += alpha[fe0[i]] + beta[fe1[i]]; - } - - // Phase 3: Re-acceleration on all FEs - let remaining = config.maxiter.saturating_sub(total_iter); - if remaining > 0 { - let in_out_phase3 = compute_in_out_from_mu(&mu); - - // Start with fresh coefficients - coef.fill(0.0); - - let t3 = std::time::Instant::now(); - let (iter3, conv3) = run_qfe_acceleration( - fe_info, - &in_out_phase3, - &mut coef, - config, - remaining, - input, - ); - let phase3_time = t3.elapsed(); - total_iter += iter3; - - if std::env::var("PYFIXEST_DEBUG_ITER").is_ok() { - eprintln!("[demean_single] Phase 3 (re-acc): {} iters, converged={}, time={:.2}ms", - iter3, conv3, phase3_time.as_secs_f64() * 1000.0); - } - - // Add Phase 3 coefficients to mu - add_coef_to_mu(&coef, &mut mu); - } - } - - // 
Convert mu (sum of FE coefs) to output (residual = input - mu) - for i in 0..n_obs { - output[i] = input[i] - mu[i]; - } - - let converged = total_iter < config.maxiter; - (output, total_iter, converged) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_2fe_convergence() { - let n_obs = 100; - let n_fe = 2; - - // Create simple FE structure - let mut group_ids = Vec::with_capacity(n_obs * n_fe); - for i in 0..n_obs { - group_ids.push(i % 10); // FE1: 10 groups - group_ids.push(i % 5); // FE2: 5 groups - } - - let n_groups = vec![10, 5]; - let weights = vec![1.0; n_obs]; - - let fe_info = FEInfo::new(n_obs, n_fe, &group_ids, &n_groups, &weights); - - // Random input - let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - - let config = FixestConfig::default(); - let (result, iter, converged) = demean_single(&fe_info, &input, &config); - - assert!(converged, "Should converge"); - assert!(iter < 100, "Should converge quickly"); - assert!(result.iter().all(|&v| v.is_finite())); - } - - #[test] - fn test_3fe_convergence() { - let n_obs = 100; - let n_fe = 3; - - let mut group_ids = Vec::with_capacity(n_obs * n_fe); - for i in 0..n_obs { - group_ids.push(i % 10); // FE1 - group_ids.push(i % 5); // FE2 - group_ids.push(i % 3); // FE3 - } - - let n_groups = vec![10, 5, 3]; - let weights = vec![1.0; n_obs]; - - let fe_info = FEInfo::new(n_obs, n_fe, &group_ids, &n_groups, &weights); - let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - - let config = FixestConfig::default(); - let (result, _iter, converged) = demean_single(&fe_info, &input, &config); - - assert!(converged); - assert!(result.iter().all(|&v| v.is_finite())); - } -} diff --git a/src/demean_accelerated/demeaner.rs b/src/demean_accelerated/demeaner.rs new file mode 100644 index 000000000..2bf6c6183 --- /dev/null +++ b/src/demean_accelerated/demeaner.rs @@ -0,0 +1,264 @@ +//! High-level demeaning solver strategies. +//! +//! 
This module provides the [`Demeaner`] trait for complete demeaning operations, +//! with specialized implementations for different fixed effect counts: +//! +//! - [`SingleFEDemeaner`]: O(n) closed-form solution (1 FE) +//! - [`TwoFEDemeaner`]: Accelerated iteration (2 FEs) +//! - [`MultiFEDemeaner`]: Multi-phase strategy (3+ FEs) +//! +//! # Scatter/Gather Operations +//! +//! The scatter/gather operations that transform between observation space and +//! coefficient space are provided by [`DemeanContext`] methods, not by this trait. + +use crate::demean_accelerated::accelerator::{Accelerator, IronsTuckGrand}; +use crate::demean_accelerated::projection::{MultiFEProjector, TwoFEProjector}; +use crate::demean_accelerated::types::{DemeanContext, FixestConfig}; + +// ============================================================================= +// Demeaner Trait +// ============================================================================= + +/// A demeaning solver for a specific fixed-effects configuration. +/// +/// This trait represents the complete strategy for solving the demeaning +/// problem with a specific number of fixed effects. Implementations handle +/// setup, iteration (if needed), and output reconstruction. +/// +/// Scatter/gather operations are available via [`DemeanContext`] methods: +/// - [`DemeanContext::scatter_to_coefficients`] +/// - [`DemeanContext::scatter_residuals`] +/// - [`DemeanContext::gather_and_add`] +pub trait Demeaner { + /// Solve the demeaning problem. + /// + /// # Returns + /// + /// Tuple of (demeaned_output, iterations_used, converged_flag) + fn solve( + ctx: &DemeanContext, + input: &[f64], + config: &FixestConfig, + ) -> (Vec, usize, bool); +} + +// ============================================================================= +// SingleFEDemeaner +// ============================================================================= + +/// Demeaner for 1 fixed effect: O(n) closed-form solution. 
+///
+/// No iteration needed - direct computation.
+pub struct SingleFEDemeaner;
+
+impl Demeaner for SingleFEDemeaner {
+    fn solve(
+        ctx: &DemeanContext,
+        input: &[f64],
+        _config: &FixestConfig,
+    ) -> (Vec<f64>, usize, bool) {
+        let n_obs = ctx.index.n_obs;
+
+        // With a zero residual baseline, scatter_residuals reduces to the
+        // per-group (weighted) sums of the raw input.
+        let zeros = vec![0.0; n_obs];
+        let group_sums = ctx.scatter_residuals(input, &zeros);
+
+        let groups = ctx.index.group_ids_for_fe(0);
+        let totals = ctx.group_weights_for_fe(0);
+
+        // Closed form: each group's coefficient is its weighted mean,
+        // coef[g] = group_sums[g] / totals[g].
+        let coef: Vec<f64> = group_sums
+            .iter()
+            .zip(totals)
+            .map(|(&s, &w)| s / w)
+            .collect();
+
+        // Residualize: output[i] = input[i] - coef[group of i].
+        let demeaned: Vec<f64> = input
+            .iter()
+            .zip(groups)
+            .map(|(&x, &g)| x - coef[g])
+            .collect();
+
+        // Exact solution: zero iterations, always converged.
+        (demeaned, 0, true)
+    }
+}
+
+// =============================================================================
+// TwoFEDemeaner
+// =============================================================================
+
+/// Demeaner for 2 fixed effects: accelerated coefficient-space iteration.
+pub struct TwoFEDemeaner; + +impl Demeaner for TwoFEDemeaner { + fn solve( + ctx: &DemeanContext, + input: &[f64], + config: &FixestConfig, + ) -> (Vec, usize, bool) { + let n_obs = ctx.index.n_obs; + let n0 = ctx.index.n_groups[0]; + let n1 = ctx.index.n_groups[1]; + let n_coef = n0 + n1; + + // Scatter input to coefficient space + let in_out = ctx.scatter_to_coefficients(input); + + // Initialize coefficient array (unified: [alpha | beta]) + let mut coef = vec![0.0; n_coef]; + + // Create buffers and projector + let mut buffers = IronsTuckGrand::create_buffers(n_coef); + let mut projector = TwoFEProjector::new(ctx, &in_out, input); + + // Run acceleration loop + let (iter, converged) = + IronsTuckGrand::run(&mut projector, &mut coef, &mut buffers, config, config.maxiter); + + // Reconstruct output: input - alpha - beta + let fe0 = ctx.index.group_ids_for_fe(0); + let fe1 = ctx.index.group_ids_for_fe(1); + + let result: Vec = (0..n_obs) + .map(|i| input[i] - coef[fe0[i]] - coef[n0 + fe1[i]]) + .collect(); + + (result, iter, converged) + } +} + +// ============================================================================= +// MultiFEDemeaner +// ============================================================================= + +/// Demeaner for 3+ fixed effects: multi-phase strategy. +/// +/// # Strategy +/// +/// 1. **Warmup**: Run all-FE iterations to get initial estimates +/// 2. **2-FE sub-convergence**: Converge on first 2 FEs (faster) +/// 3. **Re-acceleration**: Final all-FE iterations to polish +/// +/// # Convergence +/// +/// Returns `converged=true` if any phase converges early (before max iterations). 
+pub struct MultiFEDemeaner; + +impl Demeaner for MultiFEDemeaner { + fn solve( + ctx: &DemeanContext, + input: &[f64], + config: &FixestConfig, + ) -> (Vec, usize, bool) { + let n_obs = ctx.index.n_obs; + let n_coef = ctx.index.n_coef; + let n0 = ctx.index.n_groups[0]; + let n1 = ctx.index.n_groups[1]; + let n_coef_2fe = n0 + n1; + let mut total_iter = 0usize; + + let mut mu = vec![0.0; n_obs]; + let mut coef = vec![0.0; n_coef]; + + // Create buffers (one for multi-FE, one for 2-FE sub-convergence) + let mut multi_buffers = IronsTuckGrand::create_buffers(n_coef); + let mut two_buffers = IronsTuckGrand::create_buffers(n_coef_2fe); + + // Phase 1: Warmup with all FEs (mu is zeros initially) + let in_out_phase1 = ctx.scatter_to_coefficients(input); + let mut projector1 = MultiFEProjector::new(ctx, &in_out_phase1, input); + let (iter1, converged1) = IronsTuckGrand::run( + &mut projector1, + &mut coef, + &mut multi_buffers, + config, + config.iter_warmup, + ); + total_iter += iter1; + ctx.gather_and_add(&coef, &mut mu); + + // Determine final convergence status based on which phase completes the algorithm + let converged = if converged1 { + // Early convergence in warmup phase + true + } else { + // Phase 2: 2-FE sub-convergence + let in_out_phase2 = ctx.scatter_residuals(input, &mu); + let mut coef_2fe = vec![0.0; n_coef_2fe]; + let in_out_2fe: Vec = in_out_phase2[..n_coef_2fe].to_vec(); + let effective_input: Vec = (0..n_obs).map(|i| input[i] - mu[i]).collect(); + + let mut projector2 = TwoFEProjector::new(ctx, &in_out_2fe, &effective_input); + let (iter2, converged2) = IronsTuckGrand::run( + &mut projector2, + &mut coef_2fe, + &mut two_buffers, + config, + config.maxiter / 2, + ); + total_iter += iter2; + + // Add 2-FE coefficients to mu + let fe0 = ctx.index.group_ids_for_fe(0); + let fe1 = ctx.index.group_ids_for_fe(1); + for i in 0..n_obs { + mu[i] += coef_2fe[fe0[i]] + coef_2fe[n0 + fe1[i]]; + } + + // Phase 3: Re-acceleration with all FEs (unless 2-FE 
converged fully) + let remaining = config.maxiter.saturating_sub(total_iter); + if remaining > 0 { + let in_out_phase3 = ctx.scatter_residuals(input, &mu); + coef.fill(0.0); + let mut projector3 = MultiFEProjector::new(ctx, &in_out_phase3, input); + let (iter3, converged3) = IronsTuckGrand::run( + &mut projector3, + &mut coef, + &mut multi_buffers, + config, + remaining, + ); + total_iter += iter3; + ctx.gather_and_add(&coef, &mut mu); + converged3 + } else { + // No remaining iterations, use phase 2 convergence status + converged2 + } + }; + + // Compute output: input - mu + let output: Vec = (0..n_obs).map(|i| input[i] - mu[i]).collect(); + + (output, total_iter, converged) + } +} + +// ============================================================================= +// Entry Point +// ============================================================================= + +/// Demean a single variable using the appropriate solver. +/// +/// Dispatches to the appropriate [`Demeaner`] implementation based on FE count. +/// +/// # Panics +/// +/// Panics in debug builds if `input.len() != ctx.index.n_obs`. +pub fn demean_single( + ctx: &DemeanContext, + input: &[f64], + config: &FixestConfig, +) -> (Vec, usize, bool) { + debug_assert_eq!( + input.len(), + ctx.index.n_obs, + "input length ({}) must match number of observations ({})", + input.len(), + ctx.index.n_obs + ); + + match ctx.index.n_fe { + 1 => SingleFEDemeaner::solve(ctx, input, config), + 2 => TwoFEDemeaner::solve(ctx, input, config), + _ => MultiFEDemeaner::solve(ctx, input, config), + } +} diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs index 72bf6f542..9911f372f 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean_accelerated/mod.rs @@ -4,14 +4,36 @@ //! (`https://github.com/lrberge/fixest/blob/master/src/demeaning.cpp`), //! using coefficient-space iteration for efficiency. //! -//! Dispatches based on number of fixed effects: +//! # Module Structure +//! +//! 
- [`types`]: Core data types +//! - [`FixedEffectsIndex`](types::FixedEffectsIndex): Fixed effects indexing (which obs belongs to which group) +//! - [`ObservationWeights`](types::ObservationWeights): Observation weights and group-level aggregations +//! - [`DemeanContext`](types::DemeanContext): Combines index + weights for demeaning operations +//! - [`FixestConfig`](types::FixestConfig): Algorithm parameters +//! - [`projection`]: Projection operations with [`Projector`](projection::Projector) trait +//! - [`TwoFEProjector`](projection::TwoFEProjector): Specialized 2-FE projection +//! - [`MultiFEProjector`](projection::MultiFEProjector): General Q-FE projection +//! - [`accelerator`]: Acceleration strategies with [`Accelerator`](accelerator::Accelerator) trait +//! - [`IronsTuckGrand`](accelerator::IronsTuckGrand): Default acceleration (matches fixest) +//! - [`demeaner`]: High-level solver strategies with [`Demeaner`](demeaner::Demeaner) trait +//! - [`SingleFEDemeaner`](demeaner::SingleFEDemeaner): O(n) closed-form (1 FE) +//! - [`TwoFEDemeaner`](demeaner::TwoFEDemeaner): Accelerated iteration (2 FEs) +//! - [`MultiFEDemeaner`](demeaner::MultiFEDemeaner): Multi-phase strategy (3+ FEs) +//! +//! # Dispatching based on number of fixed effects: //! - 1 FE: O(n) closed-form solution (single pass, no iteration) //! - 2 FE: Coefficient-space iteration with Irons-Tuck + Grand acceleration -//! - 3+ FE: Coefficient-space iteration with Irons-Tuck + Grand acceleration +//! 
- 3+ FE: Multi-phase strategy with 2-FE sub-convergence + +pub mod accelerator; +pub mod demeaner; +pub mod projection; +pub mod types; -mod coef_space; +use demeaner::demean_single; +use types::{DemeanContext, FixestConfig}; -use coef_space::{demean_single, FEInfo, FixestConfig}; use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; @@ -19,6 +41,7 @@ use rayon::prelude::*; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; +/// Demean using accelerated coefficient-space iteration. pub(crate) fn demean_accelerated( x: &ArrayView2, flist: &ArrayView2, @@ -27,21 +50,6 @@ pub(crate) fn demean_accelerated( maxiter: usize, ) -> (Array2, bool) { let (n_samples, n_features) = x.dim(); - let n_factors = flist.ncols(); - - let sample_weights: Vec = weights.iter().cloned().collect(); - let group_ids: Vec = flist.iter().cloned().collect(); - - // Compute n_groups per factor - let n_groups_per_factor: Vec = (0..n_factors) - .map(|j| { - (0..n_samples) - .map(|i| group_ids[i * n_factors + j]) - .max() - .unwrap_or(0) - + 1 - }) - .collect(); let config = FixestConfig { tol, @@ -49,49 +57,19 @@ pub(crate) fn demean_accelerated( ..FixestConfig::default() }; - // Use the unified coefficient-space implementation for all FE counts - demean_coef_space( - x, - &sample_weights, - &group_ids, - n_samples, - n_features, - n_factors, - &n_groups_per_factor, - &config, - ) -} - -/// Demean using coefficient-space iteration (unified for all FE counts). 
-fn demean_coef_space( - x: &ArrayView2, - sample_weights: &[f64], - group_ids: &[usize], - n_samples: usize, - n_features: usize, - n_factors: usize, - n_groups_per_factor: &[usize], - config: &FixestConfig, -) -> (Array2, bool) { let not_converged = Arc::new(AtomicUsize::new(0)); let mut res = Array2::::zeros((n_samples, n_features)); - // Create FEInfo once and share across all columns (it only depends on FE structure) - let fe_info = FEInfo::new( - n_samples, - n_factors, - group_ids, - n_groups_per_factor, - sample_weights, - ); + let ctx = DemeanContext::new(flist, weights); res.axis_iter_mut(ndarray::Axis(1)) .into_par_iter() .enumerate() .for_each(|(k, mut col)| { - let xk: Vec = (0..n_samples).map(|i| x[[i, k]]).collect(); - - let (result, _iter, converged) = demean_single(&fe_info, &xk, config); + // Use ndarray's column view and convert to contiguous Vec + // (column() returns a non-contiguous view, to_vec() copies to contiguous) + let xk: Vec = x.column(k).to_vec(); + let (result, _iter, converged) = demean_single(&ctx, &xk, &config); if !converged { not_converged.fetch_add(1, Ordering::SeqCst); @@ -106,6 +84,7 @@ fn demean_coef_space( (res, success) } +/// Python-exposed function for accelerated demeaning. 
#[pyfunction] #[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] pub fn _demean_accelerated_rs( @@ -126,3 +105,57 @@ pub fn _demean_accelerated_rs( let pyarray = PyArray2::from_owned_array(py, out); Ok((pyarray.into(), success)) } + +#[cfg(test)] +mod tests { + use super::*; + use ndarray::{Array1, Array2}; + + #[test] + fn test_2fe_convergence() { + let n_obs = 100; + let n_fe = 2; + + let mut flist = Array2::::zeros((n_obs, n_fe)); + for i in 0..n_obs { + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; + } + + let weights = Array1::::ones(n_obs); + + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let (result, iter, converged) = demean_single(&ctx, &input, &config); + + assert!(converged, "Should converge"); + assert!(iter < 100, "Should converge quickly"); + assert!(result.iter().all(|&v| v.is_finite())); + } + + #[test] + fn test_3fe_convergence() { + let n_obs = 100; + let n_fe = 3; + + let mut flist = Array2::::zeros((n_obs, n_fe)); + for i in 0..n_obs { + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; + flist[[i, 2]] = i % 3; + } + + let weights = Array1::::ones(n_obs); + + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let (result, _iter, converged) = demean_single(&ctx, &input, &config); + + assert!(converged); + assert!(result.iter().all(|&v| v.is_finite())); + } +} diff --git a/src/demean_accelerated/projection.rs b/src/demean_accelerated/projection.rs new file mode 100644 index 000000000..f29eb3ba0 --- /dev/null +++ b/src/demean_accelerated/projection.rs @@ -0,0 +1,347 @@ +//! Projection operations for fixed effects demeaning. +//! +//! # Overview +//! +//! The demeaning algorithm iteratively applies a projection operator G that +//! updates coefficient estimates. 
Different FE counts have different projection +//! implementations, but they all share the same interface defined by [`Projector`]. +//! +//! # Projection Semantics +//! +//! A projection takes current coefficient estimates and produces updated estimates: +//! +//! ```text +//! G: coef_in -> coef_out +//! ``` +//! +//! The projection is defined such that repeated application converges to the +//! fixed effects solution: `G(G(G(...))) -> optimal coefficients`. +//! +//! # Usage with Accelerators +//! +//! Projectors are used with [`Accelerator`](crate::demean_accelerated::accelerator::Accelerator) +//! implementations that handle the iteration strategy (e.g., Irons-Tuck acceleration). + +use crate::demean_accelerated::types::DemeanContext; + +// ============================================================================= +// Projector Trait +// ============================================================================= + +/// A projection operation for fixed-effects demeaning. +/// +/// Projectors hold all context needed for projection: the [`DemeanContext`], +/// scattered input sums, original input values, and scratch buffers. +/// This makes the projection interface simple and clear. +/// +/// Projectors are used with [`Accelerator`](crate::demean_accelerated::accelerator::Accelerator) +/// implementations that handle the iteration strategy. +/// +/// # Performance +/// +/// All methods are called in tight loops and should be marked `#[inline(always)]`. +/// Using static dispatch (`impl Projector` or generics) ensures zero overhead. +pub trait Projector { + /// Project coefficients: coef_in → coef_out. + fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]); + + /// Compute sum of squared residuals for the given coefficients. + fn compute_ssr(&mut self, coef: &[f64]) -> f64; + + /// Length of coefficient slice to use for convergence checking. 
+    fn convergence_len(&self) -> usize;
+}
+
+// =============================================================================
+// TwoFEProjector
+// =============================================================================
+
+/// Projector for 2 fixed effects.
+///
+/// Uses a specialized algorithm that works directly in coefficient space,
+/// avoiding N-length intermediate arrays. This matches fixest's `compute_fe_coef_2`.
+///
+/// # Coefficient Layout
+///
+/// Coefficients are stored as `[alpha_0, ..., alpha_{n0-1}, beta_0, ..., beta_{n1-1}]`
+/// where alpha are the coefficients for FE 0 and beta for FE 1.
+pub struct TwoFEProjector<'a> {
+    // Shared FE index + weight context for the problem being solved.
+    ctx: &'a DemeanContext,
+    // Scattered (per-group) input sums, layout [FE0 groups | FE1 groups].
+    in_out: &'a [f64],
+    // Original observation-level input, needed for SSR computation.
+    input: &'a [f64],
+    // Scratch buffer holding the most recently computed beta (FE 1) coefficients.
+    scratch: Vec<f64>,
+}
+
+impl<'a> TwoFEProjector<'a> {
+    /// Create a new 2-FE projector.
+    #[inline]
+    pub fn new(ctx: &'a DemeanContext, in_out: &'a [f64], input: &'a [f64]) -> Self {
+        // Scratch is sized to FE 1's group count: it only ever stores beta.
+        let n1 = ctx.index.n_groups[1];
+        Self {
+            ctx,
+            in_out,
+            input,
+            scratch: vec![0.0; n1],
+        }
+    }
+
+    /// Compute beta coefficients from alpha, storing result in scratch buffer.
+    ///
+    /// For each group g1 in FE1:
+    ///   beta[g1] = (in_out[g1] - Σ alpha[g0] * w) / group_weight[g1]
+    #[inline]
+    fn compute_beta_from_alpha(&mut self, alpha: &[f64]) {
+        let n0 = self.ctx.index.n_groups[0];
+        let n1 = self.ctx.index.n_groups[1];
+        let fe0 = self.ctx.index.group_ids_for_fe(0);
+        let fe1 = self.ctx.index.group_ids_for_fe(1);
+        let sw1 = self.ctx.group_weights_for_fe(1);
+
+        // Start from the scattered input sums for FE 1 (stored after FE 0's n0 slots).
+        self.scratch[..n1].copy_from_slice(&self.in_out[n0..n0 + n1]);
+
+        // Subtract each observation's alpha contribution from its FE-1 group,
+        // weighting per observation unless weights are uniform.
+        if self.ctx.weights.is_uniform {
+            for (&g0, &g1) in fe0.iter().zip(fe1.iter()) {
+                self.scratch[g1] -= alpha[g0];
+            }
+        } else {
+            for ((&g0, &g1), &w) in fe0.iter().zip(fe1.iter()).zip(self.ctx.weights.per_obs.iter())
+            {
+                self.scratch[g1] -= alpha[g0] * w;
+            }
+        }
+
+        // Normalize by each group's total weight to get group means.
+        for (b, &sw) in self.scratch[..n1].iter_mut().zip(sw1.iter()) {
+            *b /= sw;
+        }
+    }
+
+    /// Compute alpha coefficients from beta (stored in scratch), writing to alpha_out.
+ /// + /// For each group g0 in FE0: + /// alpha[g0] = (in_out[g0] - Σ beta[g1] * w) / group_weight[g0] + #[inline] + fn compute_alpha_from_beta(&self, alpha_out: &mut [f64]) { + let n0 = self.ctx.index.n_groups[0]; + let fe0 = self.ctx.index.group_ids_for_fe(0); + let fe1 = self.ctx.index.group_ids_for_fe(1); + let sw0 = self.ctx.group_weights_for_fe(0); + + alpha_out[..n0].copy_from_slice(&self.in_out[..n0]); + + if self.ctx.weights.is_uniform { + for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { + alpha_out[g0] -= self.scratch[g1]; + } + } else { + for ((&g0, &g1), &w) in fe0.iter().zip(fe1.iter()).zip(self.ctx.weights.per_obs.iter()) + { + alpha_out[g0] -= self.scratch[g1] * w; + } + } + + for (a, &sw) in alpha_out[..n0].iter_mut().zip(sw0.iter()) { + *a /= sw; + } + } +} + +impl Projector for TwoFEProjector<'_> { + #[inline(always)] + fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]) { + let n0 = self.ctx.index.n_groups[0]; + let n1 = self.ctx.index.n_groups[1]; + + // Step 1: alpha_in -> beta + self.compute_beta_from_alpha(&coef_in[..n0]); + + // Step 2: beta -> alpha_out + self.compute_alpha_from_beta(coef_out); + + // Step 3: Copy beta to output + coef_out[n0..n0 + n1].copy_from_slice(&self.scratch[..n1]); + } + + /// Compute sum of squared residuals for the given coefficients. + /// + /// # Side Effects + /// + /// This method recomputes beta from alpha and stores it in `self.scratch`. + /// After this call, `self.scratch[..n1]` contains the beta coefficients + /// derived from `coef[..n0]` (the alpha coefficients). + /// + /// This is intentional: the SSR computation needs consistent alpha/beta pairs, + /// and recomputing beta ensures correctness even if the caller's `coef` array + /// has stale beta values. 
+ #[inline(always)] + fn compute_ssr(&mut self, coef: &[f64]) -> f64 { + let n0 = self.ctx.index.n_groups[0]; + let fe0 = self.ctx.index.group_ids_for_fe(0); + let fe1 = self.ctx.index.group_ids_for_fe(1); + + // Compute beta from alpha (updates self.scratch) + self.compute_beta_from_alpha(&coef[..n0]); + + // Compute SSR: Σ (input[i] - alpha[fe0[i]] - beta[fe1[i]])² + let mut ssr = 0.0; + for ((&g0, &g1), &x) in fe0.iter().zip(fe1.iter()).zip(self.input.iter()) { + let resid = x - coef[g0] - self.scratch[g1]; + ssr += resid * resid; + } + ssr + } + + #[inline(always)] + fn convergence_len(&self) -> usize { + self.ctx.index.n_groups[0] + } +} + +// ============================================================================= +// MultiFEProjector +// ============================================================================= + +/// Projector for 3+ fixed effects. +/// +/// Uses a general Q-FE projection that processes FEs in reverse order, +/// matching fixest's algorithm. +pub struct MultiFEProjector<'a> { + ctx: &'a DemeanContext, + in_out: &'a [f64], + input: &'a [f64], + scratch: Vec, +} + +impl<'a> MultiFEProjector<'a> { + /// Create a new multi-FE projector. + #[inline] + pub fn new(ctx: &'a DemeanContext, in_out: &'a [f64], input: &'a [f64]) -> Self { + let n_obs = ctx.index.n_obs; + Self { + ctx, + in_out, + input, + scratch: vec![0.0; n_obs], + } + } + + /// Accumulate coefficient contributions from one FE into the scratch buffer. + /// + /// For each observation i: scratch[i] += coef[start + fe[i]] + #[inline] + fn accumulate_fe_contributions(&mut self, fe_idx: usize, coef: &[f64]) { + let start = self.ctx.index.coef_start[fe_idx]; + let fe = self.ctx.index.group_ids_for_fe(fe_idx); + + for (sum, &g) in self.scratch.iter_mut().zip(fe.iter()) { + *sum += coef[start + g]; + } + } + + /// Update coefficients for a single FE given the accumulated other-FE sums. 
+ /// + /// For each group g in FE q: + /// coef_out[g] = (in_out[g] - Σ scratch[i] * w) / group_weight[g] + #[inline] + fn update_fe_coefficients(&self, fe_idx: usize, coef_out: &mut [f64]) { + let start = self.ctx.index.coef_start[fe_idx]; + let n_groups = self.ctx.index.n_groups[fe_idx]; + let fe = self.ctx.index.group_ids_for_fe(fe_idx); + let group_weights = self.ctx.group_weights_for_fe(fe_idx); + + // Initialize from in_out + coef_out[start..start + n_groups] + .copy_from_slice(&self.in_out[start..start + n_groups]); + + // Subtract accumulated other-FE contributions + if self.ctx.weights.is_uniform { + for (&g, &sum) in fe.iter().zip(self.scratch.iter()) { + coef_out[start + g] -= sum; + } + } else { + for ((&g, &sum), &w) in fe + .iter() + .zip(self.scratch.iter()) + .zip(self.ctx.weights.per_obs.iter()) + { + coef_out[start + g] -= sum * w; + } + } + + // Normalize by group weights + for (coef, &sw) in coef_out[start..start + n_groups] + .iter_mut() + .zip(group_weights.iter()) + { + *coef /= sw; + } + } +} + +impl Projector for MultiFEProjector<'_> { + /// Project coefficients using reverse-order FE updates. + /// + /// For each FE q from (n_fe-1) down to 0: + /// 1. Accumulate contributions from FEs before q (from coef_in) + /// 2. Accumulate contributions from FEs after q (from coef_out, already computed) + /// 3. 
Update coef_out for FE q + #[inline(always)] + fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]) { + let n_fe = self.ctx.index.n_fe; + + for q in (0..n_fe).rev() { + // Reset scratch buffer + self.scratch.fill(0.0); + + // Accumulate from FEs before q (use coef_in) + for h in 0..q { + self.accumulate_fe_contributions(h, coef_in); + } + + // Accumulate from FEs after q (use coef_out, already computed) + for h in (q + 1)..n_fe { + self.accumulate_fe_contributions(h, coef_out); + } + + // Update coefficients for FE q + self.update_fe_coefficients(q, coef_out); + } + } + + #[inline(always)] + fn compute_ssr(&mut self, coef: &[f64]) -> f64 { + let n_fe = self.ctx.index.n_fe; + + // Compute SSR: Σ (input[i] - Σ_q coef[fe_q[i]])² + // + // We iterate over FEs in the outer loop and observations in the inner loop. + // This improves cache locality because: + // 1. group_ids_for_fe(q) returns a contiguous slice for FE q + // 2. We access the scratch buffer sequentially + // 3. The coefficient array (typically small) stays in cache + + // Accumulate coefficient sums per observation using the scratch buffer + self.scratch.fill(0.0); + for q in 0..n_fe { + let offset = self.ctx.index.coef_start[q]; + let fe_ids = self.ctx.index.group_ids_for_fe(q); + for (sum, &g) in self.scratch.iter_mut().zip(fe_ids.iter()) { + *sum += coef[offset + g]; + } + } + + // Compute SSR from residuals + self.input + .iter() + .zip(self.scratch.iter()) + .map(|(&x, &sum)| { + let resid = x - sum; + resid * resid + }) + .sum() + } + + #[inline(always)] + fn convergence_len(&self) -> usize { + self.ctx.index.n_coef - self.ctx.index.n_groups[self.ctx.index.n_fe - 1] + } +} diff --git a/src/demean_accelerated/types.rs b/src/demean_accelerated/types.rs new file mode 100644 index 000000000..6d70b51e4 --- /dev/null +++ b/src/demean_accelerated/types.rs @@ -0,0 +1,447 @@ +//! Core data types for accelerated fixed effects demeaning. +//! +//! # Overview +//! +//! 
Fixed effects demeaning removes group means from data. For example, with +//! individual and time fixed effects, we remove both individual-specific and +//! time-specific means from each observation. +//! +//! # Two Spaces +//! +//! The algorithm works in two "spaces": +//! +//! - **Observation space**: Length N (number of observations) +//! - Input data, output data, residuals +//! +//! - **Coefficient space**: Length = sum of groups across all FEs +//! - One coefficient per group per FE +//! - Example: 1000 individuals + 10 years = 1010 coefficients +//! - Stored flat: `[individual_0, ..., individual_999, year_0, ..., year_9]` +//! +//! # Core Operations +//! +//! 1. **Scatter** (obs → coef): Aggregate weighted values from observations to group sums +//! 2. **Gather** (coef → obs): Look up each observation's group coefficients and combine +//! +//! These operations are the building blocks of the iterative demeaning algorithm. +//! +//! # Main Types +//! +//! - [`FixedEffectsIndex`]: Maps observations to their group IDs for each FE +//! - [`ObservationWeights`]: Per-observation and per-group weight sums +//! - [`DemeanContext`]: Combines index + weights, provides scatter/gather operations +//! - [`FixestConfig`]: Algorithm parameters (tolerance, max iterations, etc.) + +use ndarray::{ArrayView1, ArrayView2}; +use std::ops::Range; + +// ============================================================================= +// FixedEffectsIndex +// ============================================================================= + +/// Index mapping observations to fixed effect groups. +/// +/// # Purpose +/// +/// Maps each observation to its group ID for each fixed effect. For example, +/// observation 42 might belong to individual 7 and time period 3. +/// +/// # Memory Layout +/// +/// Group IDs are stored in column-major order for cache efficiency during iteration: +/// ```text +/// group_ids = [fe0_obs0, fe0_obs1, ..., fe0_obsN, fe1_obs0, fe1_obs1, ..., fe1_obsN, ...] 
+/// |-------- FE 0 ----------| |-------- FE 1 ----------| +/// ``` +/// +/// Access pattern: `group_ids[fe_index * n_obs + obs_index]` +/// +/// # Example +/// +/// ```text +/// 1000 observations, 2 fixed effects (individual, year): +/// - n_groups = [100, 10] // 100 individuals, 10 years +/// - coef_start = [0, 100] // individuals at 0..100, years at 100..110 +/// - n_coef = 110 // total coefficients +/// ``` +pub struct FixedEffectsIndex { + /// Number of observations (N). + pub n_obs: usize, + + /// Number of fixed effects (e.g., 2 for individual + time). + pub n_fe: usize, + + /// Flat group IDs in column-major order. + /// Index with `fe * n_obs + obs` to get the group ID for observation `obs` in FE `fe`. + pub group_ids: Vec, + + /// Number of groups in each fixed effect. + /// Example: `[100, 10]` means FE 0 has 100 groups, FE 1 has 10 groups. + pub n_groups: Vec, + + /// Starting index in coefficient arrays for each FE. + /// Example: `[0, 100]` means FE 0 coefficients are at indices 0..100, + /// FE 1 coefficients are at indices 100..110. + pub coef_start: Vec, + + /// Total number of coefficients (sum of `n_groups`). + pub n_coef: usize, +} + +impl FixedEffectsIndex { + /// Create a fixed effects index from the input array. + /// + /// # Arguments + /// + /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)`. + /// Each row is one observation, each column is one fixed effect. + /// Values must be 0-indexed group IDs. + /// + /// # Computed Fields + /// + /// - `n_groups`: Computed as `max(group_id) + 1` for each FE + /// - `coef_start`: Cumulative sum of `n_groups` + /// - `group_ids`: Transposed to column-major order for cache efficiency + /// + /// # Panics + /// + /// Panics in debug builds if `n_obs == 0` or `n_fe == 0`. 
+ pub fn new(flist: &ArrayView2) -> Self { + let (n_obs, n_fe) = flist.dim(); + + debug_assert!(n_obs > 0, "Cannot create FixedEffectsIndex with 0 observations"); + debug_assert!(n_fe > 0, "Cannot create FixedEffectsIndex with 0 fixed effects"); + + // Compute n_groups: max group_id + 1 for each FE + let n_groups: Vec = (0..n_fe) + .map(|j| flist.column(j).iter().max().unwrap_or(&0) + 1) + .collect(); + + // Compute coefficient start indices (cumulative sum of n_groups) + let mut coef_start = vec![0usize; n_fe]; + for q in 1..n_fe { + coef_start[q] = coef_start[q - 1] + n_groups[q - 1]; + } + let n_coef: usize = n_groups.iter().sum(); + + // Transpose group_ids from row-major (obs, fe) to column-major (fe, obs) + // This layout is better for the inner loops which iterate over observations + let mut group_ids = vec![0usize; n_fe * n_obs]; + for q in 0..n_fe { + for (i, &g) in flist.column(q).iter().enumerate() { + group_ids[q * n_obs + i] = g; + } + } + + Self { + n_obs, + n_fe, + group_ids, + n_groups, + coef_start, + n_coef, + } + } + + /// Get the group IDs for all observations in fixed effect `fe`. + /// + /// Returns a slice of length `n_obs` where `result[i]` is the group ID + /// for observation `i` in this fixed effect. + /// + /// # Example + /// + /// ```ignore + /// let individual_ids = index.group_ids_for_fe(0); // [7, 3, 7, 12, ...] + /// let year_ids = index.group_ids_for_fe(1); // [0, 1, 0, 2, ...] + /// ``` + #[inline(always)] + pub fn group_ids_for_fe(&self, fe: usize) -> &[usize] { + let start = fe * self.n_obs; + &self.group_ids[start..start + self.n_obs] + } + + /// Get the coefficient index range for fixed effect `fe`. + /// + /// Returns the range of indices in coefficient arrays that correspond + /// to this fixed effect's groups. 
+ #[inline(always)] + pub fn coef_range_for_fe(&self, fe: usize) -> Range { + let start = self.coef_start[fe]; + let end = if fe + 1 < self.n_fe { + self.coef_start[fe + 1] + } else { + self.n_coef + }; + start..end + } +} + +// ============================================================================= +// ObservationWeights +// ============================================================================= + +/// Observation weights and their aggregation to group level. +/// +/// # Purpose +/// +/// In weighted least squares, observations have different weights (e.g., inverse +/// variance weights). To compute weighted group means, we need: +/// +/// 1. Per-observation weights for the numerator: `Σ(weight[i] * value[i])` +/// 2. Per-group weight sums for the denominator: `Σ(weight[i])` for each group +/// +/// # Uniform Weights Fast Path +/// +/// When all weights are 1.0 (unweighted regression), `is_uniform = true` enables +/// optimized code paths that skip multiplication by weights. +pub struct ObservationWeights { + /// Weight for each observation (length: `n_obs`). + /// Used when scattering values to coefficient space. + pub per_obs: Vec, + + /// Sum of observation weights for each group (length: `n_coef`). + /// Used as denominator when computing group means. + /// Layout matches coefficient space: `[fe0_group0, ..., fe0_groupK, fe1_group0, ...]`. + pub per_group: Vec, + + /// True if all observation weights are 1.0 (enables fast path). + pub is_uniform: bool, +} + +impl ObservationWeights { + /// Create observation weights from the input array. 
+ /// + /// # Arguments + /// + /// * `weights` - Per-observation weights (length: `n_obs`) + /// * `index` - Fixed effects index (needed to aggregate weights to groups) + /// + /// # Computed Fields + /// + /// - `is_uniform`: True if all weights are 1.0 (within floating-point tolerance) + /// - `per_group`: Sum of observation weights for each group + pub fn new(weights: &ArrayView1, index: &FixedEffectsIndex) -> Self { + // Tolerance for detecting uniform weights (all 1.0). + // Using 1e-10 to account for floating-point representation errors + // while being strict enough to catch intentionally non-uniform weights. + const UNIFORM_WEIGHT_TOL: f64 = 1e-10; + let is_uniform = weights.iter().all(|&w| (w - 1.0).abs() < UNIFORM_WEIGHT_TOL); + + // Aggregate observation weights to group level + let mut per_group = vec![0.0; index.n_coef]; + for q in 0..index.n_fe { + let offset = index.coef_start[q]; + let fe_offset = q * index.n_obs; + for (i, &w) in weights.iter().enumerate() { + let g = index.group_ids[fe_offset + i]; + per_group[offset + g] += w; + } + } + + // Avoid division by zero for empty groups + for w in &mut per_group { + if *w == 0.0 { + *w = 1.0; + } + } + + Self { + per_obs: weights.to_vec(), + per_group, + is_uniform, + } + } +} + +// ============================================================================= +// DemeanContext +// ============================================================================= + +/// Complete context for fixed effects demeaning operations. +/// +/// # Purpose +/// +/// Combines the fixed effects index (which observation belongs to which groups) +/// with observation weights. Provides the core scatter/gather operations needed +/// by the iterative demeaning algorithm. +/// +/// # Operations +/// +/// The demeaning algorithm repeatedly: +/// +/// 1. **Scatter**: Aggregate residuals from observations to group coefficients +/// 2. 
**Gather**: Subtract group coefficients from observations +/// +/// These operations transform data between observation space (N values) and +/// coefficient space (`n_coef` values). +/// +/// # Example Usage +/// +/// ```ignore +/// let ctx = DemeanContext::new(&flist, &weights); +/// +/// // Scatter input to coefficient space +/// let coef_sums = ctx.scatter_to_coefficients(&input); +/// +/// // Compute group means: coef[g] = coef_sums[g] / group_weight[g] +/// // ... (done in solver) +/// ``` +pub struct DemeanContext { + /// Fixed effects index (observation → group mapping). + pub index: FixedEffectsIndex, + + /// Observation weights and group-level aggregations. + pub weights: ObservationWeights, +} + +impl DemeanContext { + /// Create a demeaning context from input arrays. + /// + /// # Arguments + /// + /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` + /// * `weights` - Per-observation weights (length: `n_obs`) + /// + /// # Panics + /// + /// Panics in debug builds if `weights.len() != flist.nrows()`. + pub fn new(flist: &ArrayView2, weights: &ArrayView1) -> Self { + debug_assert_eq!( + weights.len(), + flist.nrows(), + "weights length ({}) must match number of observations ({})", + weights.len(), + flist.nrows() + ); + + let index = FixedEffectsIndex::new(flist); + let weights = ObservationWeights::new(weights, &index); + Self { index, weights } + } + + /// Get the weight sums for all groups in fixed effect `fe`. + #[inline(always)] + pub fn group_weights_for_fe(&self, fe: usize) -> &[f64] { + &self.weights.per_group[self.index.coef_range_for_fe(fe)] + } + + // ========================================================================= + // Scatter/Gather Operations + // ========================================================================= + + /// Scatter values from observation space to coefficient space. + /// + /// Computes weighted sums of `values` for each group in each FE. 
+ /// Returns a vector of length `n_coef` with the aggregated sums. + #[inline] + pub fn scatter_to_coefficients(&self, values: &[f64]) -> Vec { + let mut result = vec![0.0; self.index.n_coef]; + self.scatter_inner(values, None, &mut result); + result + } + + /// Scatter residuals from observation space to coefficient space. + /// + /// Like [`scatter_to_coefficients`], but first subtracts `baseline` from `values`. + /// Computes: `Σ (values[i] - baseline[i]) * weight[i]` for each group. + #[inline] + pub fn scatter_residuals(&self, values: &[f64], baseline: &[f64]) -> Vec { + let mut result = vec![0.0; self.index.n_coef]; + self.scatter_inner(values, Some(baseline), &mut result); + result + } + + /// Gather coefficients to observation space and add to output. + /// + /// For each observation, looks up its coefficient for each FE and adds to output. + /// Computes: `output[i] += Σ_q coef[offset_q + fe_q[i]]` + #[inline] + pub fn gather_and_add(&self, coef: &[f64], output: &mut [f64]) { + for q in 0..self.index.n_fe { + let offset = self.index.coef_start[q]; + let fe_ids = self.index.group_ids_for_fe(q); + for (i, &g) in fe_ids.iter().enumerate() { + output[i] += coef[offset + g]; + } + } + } + + /// Inner scatter implementation with optional baseline subtraction. + /// + /// Handles both uniform and non-uniform weights with optimized code paths. 
+ #[inline(always)] + fn scatter_inner(&self, values: &[f64], baseline: Option<&[f64]>, result: &mut [f64]) { + for q in 0..self.index.n_fe { + let offset = self.index.coef_start[q]; + let fe_ids = self.index.group_ids_for_fe(q); + + match (self.weights.is_uniform, baseline) { + (true, None) => { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += values[i]; + } + } + (true, Some(base)) => { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += values[i] - base[i]; + } + } + (false, None) => { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += values[i] * self.weights.per_obs[i]; + } + } + (false, Some(base)) => { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += (values[i] - base[i]) * self.weights.per_obs[i]; + } + } + } + } + } +} + +// ============================================================================= +// FixestConfig +// ============================================================================= + +/// Algorithm configuration parameters. +/// +/// These parameters control the convergence behavior of the iterative +/// demeaning algorithm. The defaults match R's fixest package. +#[derive(Clone, Copy)] +pub struct FixestConfig { + /// Convergence tolerance for coefficient changes. + pub tol: f64, + + /// Maximum number of iterations before giving up. + pub maxiter: usize, + + /// Warmup iterations before 2-FE sub-convergence (for 3+ FE). + /// During warmup, all FEs are updated together. + pub iter_warmup: usize, + + /// Iterations before applying projection after acceleration. + pub iter_proj_after_acc: usize, + + /// Iterations between grand acceleration steps. + pub iter_grand_acc: usize, +} + +impl Default for FixestConfig { + /// Default values match R's fixest package for consistency. 
+ fn default() -> Self { + Self { + // Default tolerance matches fixest's `fixest_options("demean_tol")` + tol: 1e-6, + // Generous iteration limit to handle difficult convergence cases + maxiter: 100_000, + // Warmup iterations before 2-FE sub-convergence (fixest default) + iter_warmup: 15, + // Post-acceleration projection starts after this many iterations + iter_proj_after_acc: 40, + // Grand acceleration frequency (every N iterations) + iter_grand_acc: 4, + } + } +} From f7f8bed1c6bb2fdbfef5ff365a17c6b7ce0dcb1f Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 2 Jan 2026 01:59:26 +0100 Subject: [PATCH 04/24] Add Rust singleton detection and Python-side optimizations Eliminate Python/numba overhead in the estimation pipeline: - Implement detect_singletons in Rust to avoid numba JIT compilation - Add Python wrapper maintaining API compatibility - Optimize factorize() using pd.factorize instead of category conversion - Replace slow df.isin() with np.isinf() for infinite value detection --- pyfixest/core/__init__.py | 2 + pyfixest/core/_core_impl.pyi | 1 + pyfixest/core/detect_singletons.py | 48 +++++++ pyfixest/estimation/__init__.py | 7 +- pyfixest/estimation/model_matrix_fixest_.py | 30 ++--- src/detect_singletons.rs | 93 ++++++++++++++ src/lib.rs | 2 + tests/test_demean.py | 131 +++++++++++++++++--- tests/test_detect_singletons.py | 62 +++++++-- 9 files changed, 331 insertions(+), 45 deletions(-) create mode 100644 pyfixest/core/detect_singletons.py create mode 100644 src/detect_singletons.rs diff --git a/pyfixest/core/__init__.py b/pyfixest/core/__init__.py index 841aa440a..dc64909fb 100644 --- a/pyfixest/core/__init__.py +++ b/pyfixest/core/__init__.py @@ -1,11 +1,13 @@ from .collinear import find_collinear_variables from .crv1 import crv1_meat_loop from .demean import demean +from .detect_singletons import detect_singletons from .nested_fixed_effects import count_fixef_fully_nested_all __all__ = [ "count_fixef_fully_nested_all", "crv1_meat_loop", 
"demean", + "detect_singletons", "find_collinear_variables", ] diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index ac714e33a..8e4bed02d 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -27,3 +27,4 @@ def _demean_accelerated_rs( tol: float = 1e-08, maxiter: int = 100_000, ) -> tuple[np.ndarray, bool]: ... +def _detect_singletons_rs(ids: NDArray[np.uint32]) -> NDArray[np.bool_]: ... diff --git a/pyfixest/core/detect_singletons.py b/pyfixest/core/detect_singletons.py new file mode 100644 index 000000000..604010ee3 --- /dev/null +++ b/pyfixest/core/detect_singletons.py @@ -0,0 +1,48 @@ +import numpy as np +from numpy.typing import NDArray + +from pyfixest.core._core_impl import _detect_singletons_rs + + +def detect_singletons(ids: NDArray[np.integer]) -> NDArray[np.bool_]: + """ + Detect singleton fixed effects in a dataset. + + This function iterates over the columns of a 2D numpy array representing + fixed effects to identify singleton fixed effects. + An observation is considered a singleton if it is the only one in its group + (fixed effect identifier). + + Parameters + ---------- + ids : np.ndarray + A 2D numpy array representing fixed effects, with a shape of (n_samples, + n_features). + Elements should be non-negative integers representing fixed effect identifiers. + + Returns + ------- + numpy.ndarray + A boolean array of shape (n_samples,), indicating which observations have + a singleton fixed effect. + + Notes + ----- + The algorithm iterates over columns to identify fixed effects. After each + column is processed, it updates the record of non-singleton rows. This approach + accounts for the possibility that removing an observation in one column can + lead to the emergence of new singletons in subsequent columns. + + For performance reasons, the input array should be in column-major order. + Operating on a row-major array can lead to significant performance losses. 
+ """ + if not np.issubdtype(ids.dtype, np.integer): + raise TypeError("Fixed effects must be integers") + + # Convert to uint32 F-contiguous array for optimal performance + # (matches numba implementation behavior) + # Using empty((m,n)).T gives F-order (n,m) layout + n, m = ids.shape + out: NDArray[np.uint32] = np.empty((m, n), dtype=np.uint32).T + out[:] = ids + return _detect_singletons_rs(out) diff --git a/pyfixest/estimation/__init__.py b/pyfixest/estimation/__init__.py index f82a17d59..6a34b9b75 100644 --- a/pyfixest/estimation/__init__.py +++ b/pyfixest/estimation/__init__.py @@ -1,3 +1,6 @@ +from pyfixest.core.detect_singletons import ( + detect_singletons, +) from pyfixest.estimation import literals from pyfixest.estimation.api import ( feglm, @@ -8,9 +11,7 @@ from pyfixest.estimation.demean_ import ( demean, ) -from pyfixest.estimation.detect_singletons_ import ( - detect_singletons, -) + from pyfixest.estimation.fegaussian_ import Fegaussian from pyfixest.estimation.feiv_ import ( Feiv, diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py index 2a6b713a8..993455736 100644 --- a/pyfixest/estimation/model_matrix_fixest_.py +++ b/pyfixest/estimation/model_matrix_fixest_.py @@ -7,7 +7,7 @@ import pandas as pd from formulaic import Formula -from pyfixest.estimation.detect_singletons_ import detect_singletons +from pyfixest.core.detect_singletons import detect_singletons from pyfixest.estimation.FormulaParser import FixestFormula from pyfixest.utils.utils import capture_context @@ -153,14 +153,16 @@ def model_matrix_fixest( if weights is not None: weights_df = mm["weights"] - # drop infinite values - inf_idx_list = [] + # drop infinite values - use numpy for speed (df.isin is very slow) + inf_mask = np.zeros(Y.shape[0], dtype=bool) for df in [Y, X, Z, endogvar, weights_df]: if df is not None: - inf_idx = np.where(df.isin([np.inf, -np.inf]).any(axis=1))[0].tolist() - inf_idx_list.extend(inf_idx) + arr = 
df.to_numpy() + # Check for inf values: ~np.isfinite catches both inf and nan, + # but we only want inf, so use explicit check + inf_mask |= np.isinf(arr).any(axis=1) - inf_idx = list(set(inf_idx_list)) + inf_idx = np.where(inf_mask)[0] if len(inf_idx) > 0: warnings.warn( f"{len(inf_idx)} rows with infinite values detected. These rows are dropped from the model." @@ -560,24 +562,24 @@ def _is_finite_positive(x: Union[pd.DataFrame, pd.Series, np.ndarray]) -> bool: return bool((x[~np.isnan(x)] > 0).all()) -def factorize(fe: pd.DataFrame) -> pd.DataFrame: +def factorize(fe: pd.Series) -> pd.Series: """ Factorize / Convert fixed effects into integers. Parameters ---------- - - fe: A DataFrame of fixed effects. + - fe: A Series of fixed effects (single column). Returns ------- - - A DataFrame of fixed effects where each unique value is replaced by an integer. + - A Series of fixed effects where each unique value is replaced by an integer. NaNs are not removed but set to -1. """ - if fe.dtype != "category": - fe = fe.astype("category") - res = fe.cat.codes - res[res == -1] = np.nan - return res + codes, _ = pd.factorize(fe) + # pd.factorize returns -1 for NaN, convert to actual NaN + result = codes.astype(float) + result[codes == -1] = np.nan + return pd.Series(result, index=fe.index) def wrap_factorize(pattern: str) -> str: diff --git a/src/detect_singletons.rs b/src/detect_singletons.rs new file mode 100644 index 000000000..1abcff335 --- /dev/null +++ b/src/detect_singletons.rs @@ -0,0 +1,93 @@ +use numpy::{IntoPyArray, PyArray1, PyReadonlyArray2}; +use pyo3::prelude::*; + +/// Detect singleton fixed effects in a dataset. +/// +/// This function iterates over the columns of a 2D numpy array representing +/// fixed effects to identify singleton fixed effects. +/// An observation is considered a singleton if it is the only one in its group +/// (fixed effect identifier). 
+/// +/// # Arguments +/// * `ids` - A 2D numpy array of shape (n_samples, n_features) containing +/// non-negative integers representing fixed effect identifiers. +/// +/// # Returns +/// A boolean array of shape (n_samples,), indicating which observations have +/// a singleton fixed effect. +/// +/// # Notes +/// The algorithm iterates over columns to identify fixed effects. After each +/// column is processed, it updates the record of non-singleton rows. This approach +/// accounts for the possibility that removing an observation in one column can +/// lead to the emergence of new singletons in subsequent columns. +#[pyfunction] +pub fn _detect_singletons_rs(py: Python<'_>, ids: PyReadonlyArray2) -> Py> { + let ids = ids.as_array(); + let (n_samples, n_features) = ids.dim(); + + if n_samples == 0 { + return vec![false; 0].into_pyarray(py).into(); + } + + // Find max value across all columns for count array sizing + let max_fixef = ids.iter().cloned().max().unwrap_or(0) as usize; + let mut counts = vec![0u32; max_fixef + 1]; + + // Track non-singleton indices + let mut non_singletons: Vec = (0..n_samples as u32).collect(); + let mut n_non_singletons = n_samples; + + loop { + let n_non_singletons_curr = n_non_singletons; + + for j in 0..n_features { + // Extract column once for faster 1D access (like numba does) + let col = ids.column(j); + + // Reset counts + counts.iter_mut().for_each(|c| *c = 0); + + // Count occurrences and track singleton count + let mut n_singletons: i32 = 0; + for i in 0..n_non_singletons { + let idx = non_singletons[i] as usize; + let e = col[idx] as usize; + let c = counts[e]; + // Branchless version: + // if c == 0: n_singletons += 1 + // if c == 1: n_singletons -= 1 + n_singletons += (c == 0) as i32 - (c == 1) as i32; + counts[e] += 1; + } + + if n_singletons == 0 { + continue; + } + + // Remove singletons from non_singletons list + let mut cnt = 0; + for i in 0..n_non_singletons { + let idx = non_singletons[i] as usize; + let e = 
col[idx] as usize; + if counts[e] != 1 { + non_singletons[cnt] = non_singletons[i]; + cnt += 1; + } + } + n_non_singletons = cnt; + } + + if n_non_singletons_curr == n_non_singletons { + break; + } + } + + // Build result: true means singleton + let mut is_singleton = vec![true; n_samples]; + for i in 0..n_non_singletons { + is_singleton[non_singletons[i] as usize] = false; + } + + is_singleton.into_pyarray(py).into() +} diff --git a/src/lib.rs b/src/lib.rs index 0a5df7878..d1cf3b5c7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ use pyo3::prelude::*; mod collinear; mod crv1; mod demean; +mod detect_singletons; mod nested_fixed_effects; mod demean_accelerated; @@ -15,5 +16,6 @@ fn _core_impl(m: &Bound<'_, PyModule>) -> PyResult<()> { nested_fixed_effects::_count_fixef_fully_nested_all_rs ))?; m.add_wrapped(wrap_pyfunction!(demean_accelerated::_demean_accelerated_rs))?; + m.add_wrapped(wrap_pyfunction!(detect_singletons::_detect_singletons_rs))?; Ok(()) } diff --git a/tests/test_demean.py b/tests/test_demean.py index e79ed2844..15dc71032 100644 --- a/tests/test_demean.py +++ b/tests/test_demean.py @@ -4,6 +4,7 @@ import pytest from pyfixest.core import demean as demean_rs +from pyfixest.core.demean_accelerated import demean_accelerated as demean_accelerated_rs from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32, demean_cupy64 from pyfixest.estimation.demean_ import _set_demeaner_backend, demean, demean_model from pyfixest.estimation.jax.demean_jax_ import demean_jax @@ -11,8 +12,22 @@ @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def 
test_demean(benchmark, demean_func): rng = np.random.default_rng(929291) @@ -65,8 +80,22 @@ def test_set_demeaner_backend(): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_no_fixed_effects(benchmark, demean_func): """Test demean_model when there are no fixed effects.""" @@ -100,8 +129,22 @@ def test_demean_model_no_fixed_effects(benchmark, demean_func): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_with_fixed_effects(benchmark, demean_func): """Test demean_model with fixed effects.""" @@ -146,8 +189,22 @@ def test_demean_model_with_fixed_effects(benchmark, demean_func): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_with_weights(benchmark, demean_func): """Test demean_model with weights.""" @@ -194,8 
+251,22 @@ def test_demean_model_with_weights(benchmark, demean_func): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_caching(benchmark, demean_func): """Test the caching behavior of demean_model.""" @@ -263,8 +334,22 @@ def test_demean_model_caching(benchmark, demean_func): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_maxiter_convergence_failure(demean_func): """Test that demean_model fails when maxiter is too small.""" @@ -297,8 +382,22 @@ def test_demean_model_maxiter_convergence_failure(demean_func): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean, demean_jax, demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_numba", "demean_jax", "demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[ + demean, + demean_jax, + demean_rs, + demean_accelerated_rs, + demean_cupy32, + demean_cupy64, + ], + ids=[ + "demean_numba", + "demean_jax", + "demean_rs", + "demean_accelerated_rs", + "demean_cupy32", + "demean_cupy64", + ], ) def test_demean_model_custom_maxiter_success(demean_func): """Test that demean_model succeeds with reasonable maxiter.""" @@ -377,8 +476,8 @@ def 
test_feols_integration_maxiter(): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean_rs, demean_cupy32, demean_cupy64], - ids=["demean_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[demean_rs, demean_accelerated_rs, demean_cupy32, demean_cupy64], + ids=["demean_rs", "demean_accelerated_rs", "demean_cupy32", "demean_cupy64"], ) def test_demean_complex_fixed_effects(benchmark, demean_func): """Benchmark demean functions with complex multi-level fixed effects.""" diff --git a/tests/test_detect_singletons.py b/tests/test_detect_singletons.py index 9e13d0c38..5a930ea88 100644 --- a/tests/test_detect_singletons.py +++ b/tests/test_detect_singletons.py @@ -1,7 +1,10 @@ import numpy as np import pytest -from pyfixest.estimation.detect_singletons_ import detect_singletons +from pyfixest.core.detect_singletons import detect_singletons as detect_singletons_rust +from pyfixest.estimation.detect_singletons_ import ( + detect_singletons as detect_singletons_numba, +) from pyfixest.estimation.jax.detect_singletons_jax import detect_singletons_jax input1 = np.array([[0, 2, 1], [0, 2, 1], [0, 1, 3], [0, 1, 2], [0, 1, 2]]) @@ -20,8 +23,8 @@ ) @pytest.mark.parametrize( argnames="detection_function", - argvalues=[detect_singletons, detect_singletons_jax], - ids=["numba", "jax"], + argvalues=[detect_singletons_rust, detect_singletons_numba, detect_singletons_jax], + ids=["rust", "numba", "jax"], ) def test_correctness(input, solution, detection_function): assert np.array_equal(detection_function(input), solution) @@ -29,8 +32,8 @@ def test_correctness(input, solution, detection_function): @pytest.mark.parametrize( argnames="detection_function", - argvalues=[detect_singletons, detect_singletons_jax], - ids=["numba", "jax"], + argvalues=[detect_singletons_rust, detect_singletons_numba, detect_singletons_jax], + ids=["rust", "numba", "jax"], ) def test_single_column(detection_function): """Test with a single fixed effect column.""" @@ -42,8 +45,8 @@ def 
test_single_column(detection_function): @pytest.mark.parametrize( argnames="detection_function", - argvalues=[detect_singletons, detect_singletons_jax], - ids=["numba", "jax"], + argvalues=[detect_singletons_rust, detect_singletons_numba, detect_singletons_jax], + ids=["rust", "numba", "jax"], ) def test_all_singletons(detection_function): """Test when all observations are singletons.""" @@ -55,8 +58,8 @@ def test_all_singletons(detection_function): @pytest.mark.parametrize( argnames="detection_function", - argvalues=[detect_singletons, detect_singletons_jax], - ids=["numba", "jax"], + argvalues=[detect_singletons_rust, detect_singletons_numba, detect_singletons_jax], + ids=["rust", "numba", "jax"], ) def test_no_singletons(detection_function): """Test when there are no singletons.""" @@ -68,8 +71,8 @@ def test_no_singletons(detection_function): @pytest.mark.parametrize( argnames="detection_function", - argvalues=[detect_singletons, detect_singletons_jax], - ids=["numba", "jax"], + argvalues=[detect_singletons_rust, detect_singletons_numba, detect_singletons_jax], + ids=["rust", "numba", "jax"], ) def test_large_input(detection_function): """Test with a larger input to check performance and correctness.""" @@ -84,9 +87,44 @@ def test_large_input(detection_function): ) # For large input, we compare against the Numba implementation as reference - reference = detect_singletons(input_data) + reference = detect_singletons_numba(input_data) result = detection_function(input_data) assert np.array_equal(result, reference) assert len(result) == N assert result.dtype == np.bool_ + + +# Tests specific to the Rust wrapper's Python preprocessing logic + + +@pytest.mark.parametrize("dtype", [np.float32, np.float64]) +def test_rust_wrapper_rejects_float_dtypes(dtype): + """Test that the Rust wrapper raises TypeError for float dtypes.""" + input_data = np.array([[0, 1], [0, 1], [1, 2]], dtype=dtype) + with pytest.raises(TypeError, match="Fixed effects must be integers"): + 
detect_singletons_rust(input_data) + + +@pytest.mark.parametrize( + "dtype", [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32] +) +def test_rust_wrapper_accepts_integer_dtypes(dtype): + """Test that the Rust wrapper accepts all integer dtypes.""" + input_data = np.array([[0, 1], [0, 1], [1, 2], [1, 2]], dtype=dtype) + expected = np.array([False, False, False, False]) + result = detect_singletons_rust(input_data) + assert np.array_equal(result, expected) + + +@pytest.mark.parametrize("order", ["C", "F"]) +def test_rust_wrapper_handles_memory_layout(order): + """Test that the Rust wrapper handles both C and F memory layouts.""" + input_data = np.array( + [[0, 2, 1], [0, 2, 1], [0, 1, 3], [0, 1, 2], [0, 1, 2]], + dtype=np.int64, + order=order, + ) + expected = np.array([False, False, True, False, False]) + result = detect_singletons_rust(input_data) + assert np.array_equal(result, expected) From 1ed8d098456685df3454d934121a4d6790177515 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Sat, 3 Jan 2026 02:31:57 +0100 Subject: [PATCH 05/24] Add tests and improve buffer management Testing and code quality improvements: - Add edge case tests for demean_accelerated - Implement buffer reuse via for_each_init pattern - Extract MultiFEBuffers struct for better readability - Refactor Demeaner trait to own context and config references --- benchmarks/bench_demean_r.R | 71 --- benchmarks/bench_native_comparison.py | 209 -------- benchmarks/demean_benchmark.py | 456 ------------------ docs/specs/demean_accelerated_optimization.md | 370 -------------- pyfixest/estimation/__init__.py | 1 - src/demean_accelerated/demeaner.rs | 302 +++++++----- src/demean_accelerated/mod.rs | 268 +++++++++- 7 files changed, 428 insertions(+), 1249 deletions(-) delete mode 100644 benchmarks/bench_demean_r.R delete mode 100644 benchmarks/bench_native_comparison.py delete mode 100644 benchmarks/demean_benchmark.py delete mode 100644 docs/specs/demean_accelerated_optimization.md 
diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R deleted file mode 100644 index 66bdc342a..000000000 --- a/benchmarks/bench_demean_r.R +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env Rscript -# Benchmark fixest demeaning directly in R -# Usage: Rscript bench_demean_r.R [n_obs] [dgp_type] [n_fe] - -library(fixest) - -args <- commandArgs(trailingOnly = TRUE) -n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L -dgp_type <- if (length(args) >= 2) args[2] else "difficult" -n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L - -# Use 2 threads to match fixest_benchmarks settings -setFixest_nthreads(2) - -# Generate data matching Python benchmark DGP -set.seed(42) -n_year <- 10L -n_indiv_per_firm <- 23L -n_indiv <- max(1L, round(n_obs / n_year)) -n_firm <- max(1L, round(n_indiv / n_indiv_per_firm)) - -indiv_id <- rep(1:n_indiv, each = n_year)[1:n_obs] -year <- rep(1:n_year, times = n_indiv)[1:n_obs] - -if (dgp_type == "simple") { - firm_id <- sample(1:n_firm, n_obs, replace = TRUE) -} else { - # difficult: sequential assignment - firm_id <- rep(1:n_firm, length.out = n_obs) -} - -# Generate outcome -x1 <- rnorm(n_obs) -firm_fe <- rnorm(n_firm)[firm_id] -unit_fe <- rnorm(n_indiv)[indiv_id] -year_fe <- rnorm(n_year)[year] -y <- x1 + firm_fe + unit_fe + year_fe + rnorm(n_obs) - -df <- data.frame( - y = y, - x1 = x1, - indiv_id = indiv_id, - year = year, - firm_id = firm_id -) - -# Build formula based on n_fe -if (n_fe == 2) { - fml <- y ~ x1 | indiv_id + year -} else { - fml <- y ~ x1 | indiv_id + year + firm_id -} - -# Warm up -invisible(feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L)) - -# Benchmark -n_runs <- 5L -times <- numeric(n_runs) - -for (i in 1:n_runs) { - start <- Sys.time() - fit <- feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L) - end <- Sys.time() - times[i] <- as.numeric(end - start, units = "secs") * 1000 # ms -} - -cat(sprintf("fixest (R native) - n=%d, type=%s, %dFE\n", n_obs, 
dgp_type, n_fe)) -cat(sprintf(" Times (ms): %s\n", paste(round(times, 2), collapse = ", "))) -cat(sprintf(" Median: %.2f ms\n", median(times))) -cat(sprintf(" Min: %.2f ms\n", min(times))) diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py deleted file mode 100644 index f45ffd08f..000000000 --- a/benchmarks/bench_native_comparison.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -""" -Benchmark comparing pyfixest feols vs native fixest feols. - -Runs fixest directly in R to avoid rpy2 overhead, then compares with pyfixest. -This is a fair apples-to-apples comparison of full feols() routines. -""" - -from __future__ import annotations - -import os - -# Set thread count for Rayon (pyfixest) BEFORE importing pyfixest -os.environ["RAYON_NUM_THREADS"] = "2" - -import json -import subprocess -import time -from pathlib import Path -from statistics import median - -import numpy as np -import pandas as pd - - -def generate_dgp( - n: int, - dgp_type: str = "simple", - n_years: int = 10, - n_indiv_per_firm: int = 23, -) -> pd.DataFrame: - """Generate test data matching fixest benchmark DGP.""" - np.random.seed(42) - - n_indiv = max(1, round(n / n_years)) - n_firm = max(1, round(n_indiv / n_indiv_per_firm)) - - indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] - year = np.tile(np.arange(n_years), n_indiv)[:n] - - if dgp_type == "simple": - firm_id = np.random.randint(0, n_firm, size=n) - else: # difficult - firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] - - x1 = np.random.randn(n) - firm_fe = np.random.randn(n_firm)[firm_id] - unit_fe = np.random.randn(n_indiv)[indiv_id] - year_fe = np.random.randn(n_years)[year] - y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) - - return pd.DataFrame( - { - "y": y, - "x1": x1, - "indiv_id": indiv_id, - "year": year, - "firm_id": firm_id, - } - ) - - -def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> dict: - """Run fixest benchmark in R 
subprocess.""" - r_script = Path(__file__).parent / "bench_demean_r.R" - - try: - result = subprocess.run( - ["Rscript", str(r_script), str(n_obs), dgp_type, str(n_fe)], - capture_output=True, - text=True, - timeout=300, - ) - - if result.returncode != 0: - return {"error": result.stderr, "times": [], "median": float("inf")} - - # Parse output - lines = result.stdout.strip().split("\n") - median_ms = None - for line in lines: - if "Median:" in line: - median_ms = float(line.split(":")[1].strip().replace(" ms", "")) - - return { - "median": median_ms if median_ms else float("inf"), - "output": result.stdout, - } - except subprocess.TimeoutExpired: - return {"error": "timeout", "median": float("inf")} - except FileNotFoundError: - return {"error": "R not found", "median": float("inf")} - - -def run_pyfixest_benchmark( - df: pd.DataFrame, - n_fe: int, - n_runs: int = 5, -) -> dict: - """Run pyfixest feols benchmark.""" - import pyfixest as pf - - # Build formula matching R benchmark - if n_fe == 2: - fml = "y ~ x1 | indiv_id + year" - else: - fml = "y ~ x1 | indiv_id + year + firm_id" - - # Warmup - use rust backend for accelerated demeaning - pf.feols(fml, data=df, demeaner_backend="rust") - - times = [] - for _ in range(n_runs): - start = time.perf_counter() - fit = pf.feols(fml, data=df, demeaner_backend="rust") - elapsed = (time.perf_counter() - start) * 1000 # ms - times.append(elapsed) - - return { - "median": median(times), - "times": times, - "coef": float(fit.coef().iloc[0]), - } - - -def main(): - """Run benchmark comparing pyfixest feols vs native fixest feols.""" - configs = [ - (10_000, "simple", 2), - (10_000, "difficult", 2), - (10_000, "simple", 3), - (10_000, "difficult", 3), - (100_000, "simple", 2), - (100_000, "difficult", 2), - (100_000, "simple", 3), - (100_000, "difficult", 3), - (1_000_000, "simple", 2), - (1_000_000, "difficult", 2), - (1_000_000, "simple", 3), - (1_000_000, "difficult", 3), - ] - - results = [] - - print("=" * 70) - 
print("PyFixest feols() vs Fixest feols() Benchmark") - print("=" * 70) - - for n_obs, dgp_type, n_fe in configs: - print(f"\nConfig: n={n_obs:,}, type={dgp_type}, fe={n_fe}") - print("-" * 50) - - # Generate data - df = generate_dgp(n_obs, dgp_type) - - # Run R benchmark (feols) - r_result = run_r_benchmark(n_obs, dgp_type, n_fe) - r_time = r_result.get("median", float("inf")) - print(f" fixest (R): {r_time:8.2f} ms") - - # Run pyfixest benchmark (feols) - py_result = run_pyfixest_benchmark(df, n_fe) - py_time = py_result.get("median", float("inf")) - - if r_time > 0 and py_time < float("inf"): - ratio = py_time / r_time - print(f" pyfixest: {py_time:8.2f} ms ({ratio:.2f}x)") - else: - print(f" pyfixest: {py_time:8.2f} ms") - - results.append( - { - "n_obs": n_obs, - "dgp_type": dgp_type, - "n_fe": n_fe, - "fixest_r_ms": r_time, - "pyfixest_ms": py_time, - } - ) - - # Summary - print("\n" + "=" * 70) - print("SUMMARY (pyfixest feols vs fixest feols)") - print("=" * 70) - - print(f"{'Config':<35} {'fixest':>10} {'pyfixest':>10} {'ratio':>8}") - print("-" * 65) - - for r in results: - config = f"n={r['n_obs']:,} {r['dgp_type']:9} {r['n_fe']}FE" - fixest = r["fixest_r_ms"] - pyfixest = r["pyfixest_ms"] - - if fixest > 0 and fixest < float("inf") and pyfixest < float("inf"): - ratio = pyfixest / fixest - print(f"{config:<35} {fixest:>8.1f}ms {pyfixest:>8.1f}ms {ratio:>7.2f}x") - else: - print(f"{config:<35} {'N/A':>10} {'N/A':>10}") - - # Save results - output_path = Path(__file__).parent / "results" / "native_comparison.json" - output_path.parent.mkdir(exist_ok=True) - with open(output_path, "w") as f: - json.dump(results, f, indent=2) - print(f"\nResults saved to {output_path}") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/demean_benchmark.py b/benchmarks/demean_benchmark.py deleted file mode 100644 index 6a587b75f..000000000 --- a/benchmarks/demean_benchmark.py +++ /dev/null @@ -1,456 +0,0 @@ -#!/usr/bin/env python3 -""" -Benchmark script for 
comparing demeaning implementations. - -Oriented on fixest_benchmarks/bench_ols.R but focused on demeaning only -and optimized for fast iteration. - -Usage: - python benchmarks/demean_benchmark.py # Fast mode (~30s) - python benchmarks/demean_benchmark.py --full # Full mode (~5min) - python benchmarks/demean_benchmark.py --save # Save results to JSON -""" - -from __future__ import annotations - -import argparse -import json -import os -import time -from dataclasses import dataclass -from pathlib import Path -from statistics import median -from typing import Callable - -import numpy as np - - -@dataclass -class BenchmarkConfig: - """Configuration for a single benchmark run.""" - - n_obs: int - dgp_type: str # "simple" or "difficult" - n_fe: int - n_iters: int - - -@dataclass -class BenchmarkResult: - """Result of a benchmark run.""" - - config: BenchmarkConfig - backend: str - times: list[float] - median_time: float - available: bool - error: str | None = None - - -def generate_dgp( - n: int, - dgp_type: str = "simple", - n_years: int = 10, - n_indiv_per_firm: int = 23, -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """ - Generate data matching fixest_benchmarks DGP. 
- - Parameters - ---------- - n : int - Number of observations - dgp_type : str - "simple" (random firm assignment) or "difficult" (sequential) - n_years : int - Number of years - n_indiv_per_firm : int - Average individuals per firm - - Returns - ------- - x : np.ndarray - Feature matrix (n, 1) - flist : np.ndarray - Fixed effect IDs (n, 2 or 3) - [indiv_id, year] or [indiv_id, year, firm_id] - weights : np.ndarray - Sample weights (n,) - """ - n_indiv = max(1, round(n / n_years)) - n_firm = max(1, round(n_indiv / n_indiv_per_firm)) - - # Create FE IDs - indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] - year = np.tile(np.arange(n_years), n_indiv)[:n] - - if dgp_type == "simple": - # Random firm assignment - easier convergence - firm_id = np.random.randint(0, n_firm, size=n) - elif dgp_type == "difficult": - # Sequential firm assignment - harder convergence (messy data) - firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] - else: - raise ValueError(f"Unknown dgp_type: {dgp_type}") - - # Generate features - x1 = np.random.randn(n) - - # Generate y with FE structure - firm_fe = np.random.randn(n_firm)[firm_id] - unit_fe = np.random.randn(n_indiv)[indiv_id] - year_fe = np.random.randn(n_years)[year] - y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) - - # Stack into matrices - x = np.column_stack([y, x1]) # Demean both y and x1 - weights = np.ones(n) - - return x, indiv_id, year, firm_id, weights - - -def get_demean_backends() -> dict[str, Callable | None]: - """Get available demeaning backends with graceful fallbacks.""" - backends: dict[str, Callable | None] = {} - - # Rust accelerated (default) - try: - from pyfixest.core.demean import demean as demean_rust - - backends["rust-accelerated"] = demean_rust - except ImportError: - backends["rust-accelerated"] = None - - # Rust simple (via env var) - def demean_rust_simple(x, flist, weights, tol=1e-8, maxiter=100_000): - os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" - try: - from 
pyfixest.core.demean import demean as demean_rust - - return demean_rust(x, flist, weights, tol, maxiter) - finally: - del os.environ["PYFIXEST_DEMEAN_SIMPLE"] - - backends["rust-simple"] = ( - demean_rust_simple if backends["rust-accelerated"] else None - ) - - # Numba - try: - from pyfixest.estimation.demean_ import demean as demean_numba - - backends["numba"] = demean_numba - except ImportError: - backends["numba"] = None - - # CuPy 32-bit - try: - from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32 - - backends["cupy32"] = demean_cupy32 - except ImportError: - backends["cupy32"] = None - - # CuPy 64-bit - try: - from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy64 - - backends["cupy64"] = demean_cupy64 - except ImportError: - backends["cupy64"] = None - - # R fixest via rpy2 - use feols with only FE (no covariates) to measure demean time - try: - import pandas as pd - import rpy2.robjects as ro - from rpy2.robjects import numpy2ri, pandas2ri - from rpy2.robjects.packages import importr - - numpy2ri.activate() - pandas2ri.activate() - importr("fixest") # Load fixest package - - def demean_fixest(x, flist, weights, tol=1e-8, maxiter=100_000): - # Create a minimal regression problem that exercises the demeaning - _n, k = x.shape - n_fe = flist.shape[1] if flist.ndim > 1 else 1 - - # Build a dataframe with y and FE columns - data = {"y": x[:, 0]} - fe_names = [] - for j in range(n_fe): - fe_col = f"fe{j + 1}" - fe_names.append(fe_col) - if flist.ndim > 1: - data[fe_col] = flist[:, j].astype(int) - else: - data[fe_col] = flist.astype(int) - - df = pd.DataFrame(data) - r_df = pandas2ri.py2rpy(df) - - # Build formula: y ~ 1 | fe1 + fe2 + ... 
- fe_formula = " + ".join(fe_names) - formula = f"y ~ 1 | {fe_formula}" - - # Call feols (this includes demeaning time) - ro.r.assign("df", r_df) - ro.r(f"result <- fixest::feols({formula}, data=df, nthreads=1)") - - # Return the residuals as "demeaned" values - resid = np.array(ro.r("residuals(result)")) - result = np.column_stack([resid] + [x[:, j] for j in range(1, k)]) - return result, True - - backends["fixest"] = demean_fixest - except (ImportError, Exception): - backends["fixest"] = None - - return backends - - -def run_single_benchmark( - demean_func: Callable, - x: np.ndarray, - flist: np.ndarray, - weights: np.ndarray, - n_iters: int, -) -> list[float]: - """Run a single benchmark configuration multiple times.""" - times = [] - - for _ in range(n_iters): - # Copy arrays to avoid caching effects - x_copy = x.copy() - - start = time.perf_counter() - demean_func(x_copy, flist, weights) - elapsed = time.perf_counter() - start - - times.append(elapsed) - - return times - - -def run_benchmarks( - configs: list[BenchmarkConfig], - backends: dict[str, Callable | None], -) -> list[BenchmarkResult]: - """Run all benchmark configurations across all backends.""" - results = [] - - for config in configs: - print(f"\n{'=' * 60}") - print(f"Config: n={config.n_obs:,}, type={config.dgp_type}, fe={config.n_fe}") - print("=" * 60) - - # Generate data - x, indiv_id, year, firm_id, weights = generate_dgp( - config.n_obs, config.dgp_type - ) - - # Build flist based on n_fe - if config.n_fe == 2: - flist = np.column_stack([indiv_id, year]).astype(np.uint64) - else: # n_fe == 3 - flist = np.column_stack([indiv_id, year, firm_id]).astype(np.uint64) - - for backend_name, demean_func in backends.items(): - if demean_func is None: - result = BenchmarkResult( - config=config, - backend=backend_name, - times=[], - median_time=float("inf"), - available=False, - error="Not installed", - ) - results.append(result) - print(f" {backend_name:20s}: not available") - continue - - try: - 
times = run_single_benchmark( - demean_func, x, flist, weights, config.n_iters - ) - med_time = median(times) - result = BenchmarkResult( - config=config, - backend=backend_name, - times=times, - median_time=med_time, - available=True, - ) - results.append(result) - print( - f" {backend_name:20s}: {med_time * 1000:8.2f} ms (median of {len(times)})" - ) - except Exception as e: - result = BenchmarkResult( - config=config, - backend=backend_name, - times=[], - median_time=float("inf"), - available=False, - error=str(e), - ) - results.append(result) - print(f" {backend_name:20s}: ERROR - {e}") - - return results - - -def print_summary(results: list[BenchmarkResult]) -> None: - """Print a summary table of results.""" - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - # Group by config - configs = sorted( - set((r.config.n_obs, r.config.dgp_type, r.config.n_fe) for r in results) - ) - - backends = sorted(set(r.backend for r in results)) - - # Header - header = f"{'Config':30s}" - for backend in backends: - header += f" {backend:>12s}" - print(header) - print("-" * len(header)) - - # Find fixest baseline for relative comparison - fixest_times = {} - for r in results: - if r.backend == "fixest" and r.available: - key = (r.config.n_obs, r.config.dgp_type, r.config.n_fe) - fixest_times[key] = r.median_time - - # Rows - for n_obs, dgp_type, n_fe in configs: - config_str = f"n={n_obs:,} {dgp_type:9s} {n_fe}FE" - row = f"{config_str:30s}" - - key = (n_obs, dgp_type, n_fe) - baseline = fixest_times.get(key) - - for backend in backends: - matching = [ - r - for r in results - if r.config.n_obs == n_obs - and r.config.dgp_type == dgp_type - and r.config.n_fe == n_fe - and r.backend == backend - ] - if matching and matching[0].available: - time_ms = matching[0].median_time * 1000 - if baseline and backend != "fixest": - ratio = matching[0].median_time / baseline - row += f" {time_ms:7.1f}ms({ratio:.1f}x)" - else: - row += f" {time_ms:12.1f}ms" - else: - row += f" 
{'N/A':>12s}" - - print(row) - - -def save_results(results: list[BenchmarkResult], path: Path) -> None: - """Save results to JSON.""" - data = [] - for r in results: - data.append( - { - "n_obs": r.config.n_obs, - "dgp_type": r.config.dgp_type, - "n_fe": r.config.n_fe, - "n_iters": r.config.n_iters, - "backend": r.backend, - "times": r.times, - "median_time": r.median_time if r.median_time != float("inf") else None, - "available": r.available, - "error": r.error, - } - ) - - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "w") as f: - json.dump(data, f, indent=2) - print(f"\nResults saved to {path}") - - -def main(): - """Run demeaning benchmarks.""" - parser = argparse.ArgumentParser(description="Benchmark demeaning implementations") - parser.add_argument( - "--full", action="store_true", help="Run full benchmark (slower)" - ) - parser.add_argument("--save", action="store_true", help="Save results to JSON") - parser.add_argument( - "--output", - type=Path, - default=Path("benchmarks/results/benchmark.json"), - help="Output path for results", - ) - args = parser.parse_args() - - # Define configurations - if args.full: - configs = [ - # Small (fast) - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), - # Medium - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), - # Large - BenchmarkConfig(n_obs=500_000, dgp_type="simple", n_fe=2, n_iters=2), - BenchmarkConfig(n_obs=500_000, dgp_type="difficult", n_fe=2, n_iters=2), - BenchmarkConfig(n_obs=1_000_000, dgp_type="simple", n_fe=2, 
n_iters=1), - BenchmarkConfig(n_obs=1_000_000, dgp_type="difficult", n_fe=2, n_iters=1), - ] - else: - # Fast mode - minimal configs for quick iteration - configs = [ - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), - ] - - print("Demeaning Benchmark") - print("=" * 60) - print(f"Mode: {'full' if args.full else 'fast'}") - print(f"Configurations: {len(configs)}") - - # Get available backends - backends = get_demean_backends() - available = [name for name, func in backends.items() if func is not None] - unavailable = [name for name, func in backends.items() if func is None] - - print(f"Available backends: {', '.join(available)}") - if unavailable: - print(f"Unavailable backends: {', '.join(unavailable)}") - - # Run benchmarks - results = run_benchmarks(configs, backends) - - # Print summary - print_summary(results) - - # Save if requested - if args.save: - save_results(results, args.output) - - -if __name__ == "__main__": - main() diff --git a/docs/specs/demean_accelerated_optimization.md b/docs/specs/demean_accelerated_optimization.md deleted file mode 100644 index 89cb4b2c3..000000000 --- a/docs/specs/demean_accelerated_optimization.md +++ /dev/null @@ -1,370 +0,0 @@ -# Optimization Specification: demean_accelerated.rs - -## 1. 
Current Implementation Analysis - -### 1.1 Overview of demean_accelerated.rs - -The current implementation in `src/demean_accelerated.rs` (336 lines) provides: - -- **Irons-Tuck acceleration**: Applied every 3rd iteration -- **Struct abstractions**: `FactorDemeaner`, `MultiFactorDemeaner`, `AccelerationBuffers`, `IronTucksAcceleration` -- **Parallelization**: rayon for column-level parallelism -- **Memory**: Heap-allocated `Vec` buffers - -### 1.2 Comparison: demean.rs vs demean_accelerated.rs - -| Aspect | demean.rs | demean_accelerated.rs | -|--------|-----------|----------------------| -| Algorithm | Simple alternating projection | Irons-Tuck acceleration | -| Iteration | One projection per iter | 2 projections + acceleration step | -| Memory | Minimal buffers | 6 buffers × n_samples | -| Convergence | Element-wise SAD | Element-wise SAD | - -### 1.3 Reference: fixest C++ (demeaning.cpp) - -Key features in fixest not present in current Rust implementation: - -| Feature | fixest | demean_accelerated.rs | -|---------|--------|----------------------| -| Grand acceleration | ✓ (3-point history) | ✗ | -| 2-FE optimization | ✓ (no N-length temps) | ✗ | -| SSR convergence | ✓ (every 40 iters) | ✗ | -| Coefficient-based | ✓ (iterates on FE coeffs) | ✗ (observation-based) | - ---- - -## 2. Missing Parts (vs fixest) - -### 2.1 Grand Acceleration (Priority: HIGH) - -fixest implements a **two-tier acceleration scheme**: - -``` -Standard iterations: Apply Irons-Tuck every 3 iterations -Grand acceleration: Every `iter_grandAcc` iterations, apply Irons-Tuck - on a 3-point history (Y, GY, GGY) of coefficient vectors -``` - -The grand acceleration operates on a coarser timescale, accelerating convergence on slow-moving modes. This can significantly reduce iteration count for hard-to-converge problems. 
- -**Implementation sketch:** -```rust -struct GrandAccelerationState { - y: Vec, // First history point - gy: Vec, // Second history point - ggy: Vec, // Third history point - counter: usize, // Cycles 0-2 - interval: usize, // Apply every N iterations (default ~15) -} -``` - -### 2.2 Specialized 2-FE Path (Priority: MEDIUM) - -When `n_factors == 2`, fixest uses a specialized routine that: -- Stores second FE coefficients in a `nb_coef_Q[1]`-length buffer instead of `n_obs` -- Avoids materializing full N-length residual vectors -- Alternates between updating both effects without intermediate storage - -Current implementation always allocates `n_samples`-length buffers regardless of factor count. - -### 2.3 SSR-Based Convergence (Priority: MEDIUM) - -fixest checks residual sum-of-squares every 40 iterations: - -```cpp -ssr = Σ(input[i] - mu_current[i])² -if (stopping_crit(ssr_old, ssr, diffMax)) break; -``` - -This complements the element-wise convergence check and can detect convergence earlier in some cases. - -### 2.4 Coefficient-Based Iteration (Priority: LOW) - -fixest iterates on FE **coefficients** rather than demeaned **observations**: -- Coefficient vector length: `Σ n_groups[j]` (often << n_samples) -- More cache-friendly for problems with many observations but few groups -- Requires restructuring the core algorithm - ---- - -## 3. Potential Speedup Opportunities - -### 3.1 SIMD Vectorization (Priority: HIGH) - -Current inner loops rely on compiler autovectorization: - -```rust -// Current: relies on autovectorization -for i in 0..n { - self.buffers.delta_gx[i] = self.buffers.ggx_curr[i] - gx_tmp; - // ... 
-} -``` - -**Opportunity**: Use explicit SIMD via `std::simd` (nightly) or `wide` crate: - -```rust -use wide::f64x4; - -// Process 4 elements at a time -for chunk in buffers.chunks_exact_mut(4) { - let a = f64x4::from_slice(a_slice); - let b = f64x4::from_slice(b_slice); - (a - b).store(chunk); -} -``` - -Potential gains: -- **2-4x** for memory-bound operations (likely scenario) -- Requires careful handling of non-aligned tails - -### 3.2 Memory Layout Optimization (Priority: HIGH) - -Current: Separate `Vec` for each buffer (AoS pattern) - -```rust -struct AccelerationBuffers { - x_curr: Vec, - gx_curr: Vec, - ggx_curr: Vec, - // ... 6 separate allocations -} -``` - -**Opportunity**: Interleaved SoA layout for better cache locality: - -```rust -struct InterleavedBuffers { - // All data in single allocation, interleaved for spatial locality - data: Vec, // [x0, gx0, ggx0, x1, gx1, ggx1, ...] -} -``` - -Or single contiguous allocation with computed offsets: - -```rust -struct AccelerationBuffers { - data: Vec, // Single allocation: 6 * n_samples - n_samples: usize, -} -impl AccelerationBuffers { - fn x_curr(&mut self) -> &mut [f64] { &mut self.data[0..self.n_samples] } - // ... -} -``` - -### 3.3 Reduce Per-Column Allocations (Priority: HIGH) - -Current implementation allocates `MultiFactorDemeaner` per column: - -```rust -// src/demean_accelerated.rs:274 -let process_column = |(k, mut col): (...)| { - let demeaner = MultiFactorDemeaner::new(...); // Allocation per column! - let mut acceleration = IronTucksAcceleration::new(...); - // ... -}; -``` - -**Opportunity**: Pre-allocate demeaners and reuse via thread-local storage: - -```rust -use rayon::prelude::*; -use std::cell::RefCell; - -thread_local! 
{ - static DEMEANER: RefCell> = RefCell::new(None); -} - -// Or use rayon's broadcast for pre-allocation -``` - -### 3.4 Convergence Check Optimization (Priority: MEDIUM) - -Current: Full pass over all elements every iteration: - -```rust -fn sad_converged(a: &[f64], b: &[f64], tol: f64) -> bool { - a.iter().zip(b).all(|(&x, &y)| (x - y).abs() < tol) -} -``` - -**Opportunity**: Early exit with SIMD max-reduction: - -```rust -fn sad_converged_simd(a: &[f64], b: &[f64], tol: f64) -> bool { - // SIMD: compute max |a-b| in chunks, early exit if any chunk exceeds tol - let tol_vec = f64x4::splat(tol); - for (a_chunk, b_chunk) in a.chunks_exact(4).zip(b.chunks_exact(4)) { - let diff = (f64x4::from_slice(a_chunk) - f64x4::from_slice(b_chunk)).abs(); - if diff.reduce_max() >= tol { - return false; - } - } - // Handle remainder... - true -} -``` - -### 3.5 Group Mean Computation (Priority: MEDIUM) - -Current scatter-gather pattern: - -```rust -// Scatter: accumulate weighted sums -input.iter().zip(&self.sample_weights).zip(&self.group_ids) - .for_each(|((&xi, &wi), &gid)| { - self.group_weighted_sums[gid] += wi * xi; // Random access - }); -``` - -**Opportunity**: -- Sort observations by group ID for sequential access (one-time cost) -- Use sparse matrix representation for very large groups -- Consider prefix sums for sorted data - -### 3.6 Use ndarray-linalg for BLAS (Priority: LOW) - -Add `ndarray-linalg` for optimized linear algebra: - -```toml -[dependencies] -ndarray-linalg = { version = "0.16", features = ["openblas-system"] } -``` - -Could accelerate matrix operations if algorithm is restructured. - ---- - -## 4. 
Benchmark Strategy - -### 4.1 Minimal Benchmark Fixture - -Add to `tests/test_demean.py`: - -```python -import pytest -import numpy as np -from pyfixest.core.demean import demean -from pyfixest.core.demean_accelerated import demean_accelerated - -@pytest.fixture -def benchmark_data_small(): - """Small dataset for quick iteration.""" - rng = np.random.default_rng(42) - n, k = 10_000, 5 - return { - 'x': rng.normal(0, 1, (n, k)), - 'flist': np.column_stack([ - rng.integers(0, 100, n), - rng.integers(0, 50, n), - ]).astype(np.uint64), - 'weights': np.ones(n), - } - -@pytest.fixture -def benchmark_data_complex(): - """Complex FE structure from fixest benchmarks.""" - # Use generate_complex_fixed_effects_data() from test_demean.py - X, flist, weights = generate_complex_fixed_effects_data() - return {'x': X, 'flist': flist, 'weights': weights} - -@pytest.mark.benchmark(group="demean") -def test_bench_demean_simple(benchmark, benchmark_data_small): - data = benchmark_data_small - result, success = benchmark( - demean, data['x'], data['flist'], data['weights'], tol=1e-8 - ) - assert success - -@pytest.mark.benchmark(group="demean") -def test_bench_demean_accelerated(benchmark, benchmark_data_small): - data = benchmark_data_small - result, success = benchmark( - demean_accelerated, data['x'], data['flist'], data['weights'], tol=1e-8 - ) - assert success -``` - -### 4.2 Run Benchmarks - -```bash -# Quick benchmark during iteration -pytest tests/test_demean.py -k "bench" --benchmark-only --benchmark-compare - -# Full benchmark with stats -pytest tests/test_demean.py -k "bench" --benchmark-only \ - --benchmark-columns=mean,stddev,rounds \ - --benchmark-save=baseline -``` - -### 4.3 Benchmark Scenarios - -| Scenario | n_samples | n_features | n_factors | n_groups_per_factor | -|----------|-----------|------------|-----------|---------------------| -| Small-simple | 10K | 5 | 2 | 100, 50 | -| Medium-2FE | 100K | 10 | 2 | 1000, 500 | -| Large-3FE | 1M | 5 | 3 | 5000, 2500, 100 | 
-| Complex | 100K | 3 | 3 | (per fixest) | - ---- - -## 5. Implementation Roadmap - -### Phase 1: Low-Hanging Fruit (Quick Wins) -1. [ ] Reduce per-column allocations (thread-local reuse) -2. [ ] Single contiguous buffer allocation -3. [ ] Add SIMD convergence check - -### Phase 2: Algorithm Improvements -4. [ ] Implement grand acceleration -5. [ ] Add SSR-based convergence check -6. [ ] Specialized 2-FE path - -### Phase 3: Advanced Optimization -7. [ ] Explicit SIMD for inner loops (wide crate) -8. [ ] Sort-by-group optimization -9. [ ] Coefficient-based iteration (major refactor) - ---- - -## 6. Testing Requirements (Minimal) - -Keep tests minimal for fast iteration: - -```python -# Correctness: compare against pyhdfe (already in test_demean.py) -def test_accelerated_correctness(): - """Verify accelerated matches reference implementation.""" - X, flist, weights = generate_data() - res_simple, _ = demean(X, flist, weights, tol=1e-10) - res_accel, _ = demean_accelerated(X, flist, weights, tol=1e-10) - assert np.allclose(res_simple, res_accel, rtol=1e-6, atol=1e-8) - -# Benchmark: already covered above -``` - ---- - -## 7. Expected Performance Gains - -| Optimization | Expected Gain | Effort | -|--------------|---------------|--------| -| Reduce allocations | 10-20% | Low | -| SIMD convergence | 5-10% | Low | -| Grand acceleration | 20-50% (hard problems) | Medium | -| 2-FE specialization | 10-30% (2-FE cases) | Medium | -| Full SIMD loops | 2-4x (compute-bound) | High | -| Coefficient-based | Variable | Very High | - -**Realistic target**: 2-3x speedup over current `demean_accelerated.rs` for typical workloads, approaching fixest C++ performance. - ---- - -## 8. 
Files to Modify - -- `src/demean_accelerated.rs` - Main implementation -- `src/lib.rs` - Expose new functions if needed -- `pyfixest/core/demean_accelerated.py` - Python wrapper -- `tests/test_demean.py` - Add benchmarks -- `Cargo.toml` - Add `wide` crate for SIMD (optional) diff --git a/pyfixest/estimation/__init__.py b/pyfixest/estimation/__init__.py index 6a34b9b75..dc43fb7db 100644 --- a/pyfixest/estimation/__init__.py +++ b/pyfixest/estimation/__init__.py @@ -11,7 +11,6 @@ from pyfixest.estimation.demean_ import ( demean, ) - from pyfixest.estimation.fegaussian_ import Fegaussian from pyfixest.estimation.feiv_ import ( Feiv, diff --git a/src/demean_accelerated/demeaner.rs b/src/demean_accelerated/demeaner.rs index 2bf6c6183..9f131b6b2 100644 --- a/src/demean_accelerated/demeaner.rs +++ b/src/demean_accelerated/demeaner.rs @@ -7,12 +7,13 @@ //! - [`TwoFEDemeaner`]: Accelerated iteration (2 FEs) //! - [`MultiFEDemeaner`]: Multi-phase strategy (3+ FEs) //! -//! # Scatter/Gather Operations +//! # Buffer Reuse //! -//! The scatter/gather operations that transform between observation space and -//! coefficient space are provided by [`DemeanContext`] methods, not by this trait. +//! Demeaners own their working buffers, allowing reuse across multiple `solve()` calls. +//! This is important for parallel processing where each thread can have its own +//! demeaner instance that reuses buffers across columns. -use crate::demean_accelerated::accelerator::{Accelerator, IronsTuckGrand}; +use crate::demean_accelerated::accelerator::{Accelerator, IronsTuckGrand, IronsTuckGrandBuffers}; use crate::demean_accelerated::projection::{MultiFEProjector, TwoFEProjector}; use crate::demean_accelerated::types::{DemeanContext, FixestConfig}; @@ -22,25 +23,15 @@ use crate::demean_accelerated::types::{DemeanContext, FixestConfig}; /// A demeaning solver for a specific fixed-effects configuration. 
/// -/// This trait represents the complete strategy for solving the demeaning -/// problem with a specific number of fixed effects. Implementations handle -/// setup, iteration (if needed), and output reconstruction. -/// -/// Scatter/gather operations are available via [`DemeanContext`] methods: -/// - [`DemeanContext::scatter_to_coefficients`] -/// - [`DemeanContext::scatter_residuals`] -/// - [`DemeanContext::gather_and_add`] +/// Demeaners own references to their context and configuration, as well as +/// working buffers that are reused across multiple `solve()` calls. pub trait Demeaner { /// Solve the demeaning problem. /// /// # Returns /// /// Tuple of (demeaned_output, iterations_used, converged_flag) - fn solve( - ctx: &DemeanContext, - input: &[f64], - config: &FixestConfig, - ) -> (Vec, usize, bool); + fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool); } // ============================================================================= @@ -49,23 +40,29 @@ pub trait Demeaner { /// Demeaner for 1 fixed effect: O(n) closed-form solution. /// -/// No iteration needed - direct computation. -pub struct SingleFEDemeaner; - -impl Demeaner for SingleFEDemeaner { - fn solve( - ctx: &DemeanContext, - input: &[f64], - _config: &FixestConfig, - ) -> (Vec, usize, bool) { - let n_obs = ctx.index.n_obs; +/// No iteration or buffers needed - direct computation. +pub struct SingleFEDemeaner<'a> { + ctx: &'a DemeanContext, +} + +impl<'a> SingleFEDemeaner<'a> { + /// Create a new single-FE demeaner. 
+ #[inline] + pub fn new(ctx: &'a DemeanContext) -> Self { + Self { ctx } + } +} + +impl Demeaner for SingleFEDemeaner<'_> { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + let n_obs = self.ctx.index.n_obs; let output = vec![0.0; n_obs]; // Scatter input to coefficient space (sum of input per group) - let in_out = ctx.scatter_residuals(input, &output); + let in_out = self.ctx.scatter_residuals(input, &output); - let fe0 = ctx.index.group_ids_for_fe(0); - let group_weights = ctx.group_weights_for_fe(0); + let fe0 = self.ctx.index.group_ids_for_fe(0); + let group_weights = self.ctx.group_weights_for_fe(0); // coef[g] = in_out[g] / group_weights[g] let coef: Vec = in_out @@ -86,39 +83,63 @@ impl Demeaner for SingleFEDemeaner { // ============================================================================= /// Demeaner for 2 fixed effects: accelerated coefficient-space iteration. -pub struct TwoFEDemeaner; - -impl Demeaner for TwoFEDemeaner { - fn solve( - ctx: &DemeanContext, - input: &[f64], - config: &FixestConfig, - ) -> (Vec, usize, bool) { - let n_obs = ctx.index.n_obs; +/// +/// Owns working buffers that are reused across multiple `solve()` calls. +pub struct TwoFEDemeaner<'a> { + ctx: &'a DemeanContext, + config: &'a FixestConfig, + /// Coefficient array [alpha | beta], reused across solves + coef: Vec, + /// Acceleration buffers, reused across solves + buffers: IronsTuckGrandBuffers, +} + +impl<'a> TwoFEDemeaner<'a> { + /// Create a new two-FE demeaner with pre-allocated buffers. 
+ #[inline] + pub fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { let n0 = ctx.index.n_groups[0]; let n1 = ctx.index.n_groups[1]; let n_coef = n0 + n1; + Self { + ctx, + config, + coef: vec![0.0; n_coef], + buffers: IronsTuckGrand::create_buffers(n_coef), + } + } +} + +impl Demeaner for TwoFEDemeaner<'_> { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + let n_obs = self.ctx.index.n_obs; + let n0 = self.ctx.index.n_groups[0]; + // Scatter input to coefficient space - let in_out = ctx.scatter_to_coefficients(input); + let in_out = self.ctx.scatter_to_coefficients(input); - // Initialize coefficient array (unified: [alpha | beta]) - let mut coef = vec![0.0; n_coef]; + // Reset coefficient array for this solve + self.coef.fill(0.0); - // Create buffers and projector - let mut buffers = IronsTuckGrand::create_buffers(n_coef); - let mut projector = TwoFEProjector::new(ctx, &in_out, input); + // Create projector (lightweight, references in_out and input) + let mut projector = TwoFEProjector::new(self.ctx, &in_out, input); - // Run acceleration loop - let (iter, converged) = - IronsTuckGrand::run(&mut projector, &mut coef, &mut buffers, config, config.maxiter); + // Run acceleration loop with reused buffers + let (iter, converged) = IronsTuckGrand::run( + &mut projector, + &mut self.coef, + &mut self.buffers, + self.config, + self.config.maxiter, + ); // Reconstruct output: input - alpha - beta - let fe0 = ctx.index.group_ids_for_fe(0); - let fe1 = ctx.index.group_ids_for_fe(1); + let fe0 = self.ctx.index.group_ids_for_fe(0); + let fe1 = self.ctx.index.group_ids_for_fe(1); let result: Vec = (0..n_obs) - .map(|i| input[i] - coef[fe0[i]] - coef[n0 + fe1[i]]) + .map(|i| input[i] - self.coef[fe0[i]] - self.coef[n0 + fe1[i]]) .collect(); (result, iter, converged) @@ -129,51 +150,103 @@ impl Demeaner for TwoFEDemeaner { // MultiFEDemeaner // ============================================================================= +/// Working buffers for 
multi-FE demeaning. +/// +/// Groups the observation-space and coefficient-space arrays that are +/// reused across multiple `solve()` calls. +struct MultiFEBuffers { + /// Accumulated fixed effects per observation (observation-space) + mu: Vec, + /// Coefficient array for all FEs (coefficient-space) + coef: Vec, + /// Coefficient array for 2-FE sub-convergence (coefficient-space, first 2 FEs only) + coef_2fe: Vec, + /// Effective input after subtracting mu (observation-space) + effective_input: Vec, +} + +impl MultiFEBuffers { + /// Create new buffers with the given dimensions. + fn new(n_obs: usize, n_coef: usize, n_coef_2fe: usize) -> Self { + Self { + mu: vec![0.0; n_obs], + coef: vec![0.0; n_coef], + coef_2fe: vec![0.0; n_coef_2fe], + effective_input: vec![0.0; n_obs], + } + } + + /// Reset all buffers to zero for a new solve. + #[inline] + fn reset(&mut self) { + self.mu.fill(0.0); + self.coef.fill(0.0); + } +} + /// Demeaner for 3+ fixed effects: multi-phase strategy. /// +/// Owns working buffers that are reused across multiple `solve()` calls. +/// /// # Strategy /// /// 1. **Warmup**: Run all-FE iterations to get initial estimates /// 2. **2-FE sub-convergence**: Converge on first 2 FEs (faster) /// 3. **Re-acceleration**: Final all-FE iterations to polish -/// -/// # Convergence -/// -/// Returns `converged=true` if any phase converges early (before max iterations). 
-pub struct MultiFEDemeaner; - -impl Demeaner for MultiFEDemeaner { - fn solve( - ctx: &DemeanContext, - input: &[f64], - config: &FixestConfig, - ) -> (Vec, usize, bool) { +pub struct MultiFEDemeaner<'a> { + ctx: &'a DemeanContext, + config: &'a FixestConfig, + /// Working buffers for coefficient and observation arrays + buffers: MultiFEBuffers, + /// Acceleration buffers for multi-FE iterations + multi_acc: IronsTuckGrandBuffers, + /// Acceleration buffers for 2-FE sub-convergence + two_acc: IronsTuckGrandBuffers, +} + +impl<'a> MultiFEDemeaner<'a> { + /// Create a new multi-FE demeaner with pre-allocated buffers. + #[inline] + pub fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { let n_obs = ctx.index.n_obs; let n_coef = ctx.index.n_coef; let n0 = ctx.index.n_groups[0]; let n1 = ctx.index.n_groups[1]; let n_coef_2fe = n0 + n1; - let mut total_iter = 0usize; - let mut mu = vec![0.0; n_obs]; - let mut coef = vec![0.0; n_coef]; + Self { + ctx, + config, + buffers: MultiFEBuffers::new(n_obs, n_coef, n_coef_2fe), + multi_acc: IronsTuckGrand::create_buffers(n_coef), + two_acc: IronsTuckGrand::create_buffers(n_coef_2fe), + } + } +} + +impl Demeaner for MultiFEDemeaner<'_> { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + let n_obs = self.ctx.index.n_obs; + let n0 = self.ctx.index.n_groups[0]; + let n1 = self.ctx.index.n_groups[1]; + let n_coef_2fe = n0 + n1; + let mut total_iter = 0usize; - // Create buffers (one for multi-FE, one for 2-FE sub-convergence) - let mut multi_buffers = IronsTuckGrand::create_buffers(n_coef); - let mut two_buffers = IronsTuckGrand::create_buffers(n_coef_2fe); + // Reset buffers for this solve + self.buffers.reset(); // Phase 1: Warmup with all FEs (mu is zeros initially) - let in_out_phase1 = ctx.scatter_to_coefficients(input); - let mut projector1 = MultiFEProjector::new(ctx, &in_out_phase1, input); + let in_out_phase1 = self.ctx.scatter_to_coefficients(input); + let mut projector1 = 
MultiFEProjector::new(self.ctx, &in_out_phase1, input); let (iter1, converged1) = IronsTuckGrand::run( &mut projector1, - &mut coef, - &mut multi_buffers, - config, - config.iter_warmup, + &mut self.buffers.coef, + &mut self.multi_acc, + self.config, + self.config.iter_warmup, ); total_iter += iter1; - ctx.gather_and_add(&coef, &mut mu); + self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); // Determine final convergence status based on which phase completes the algorithm let converged = if converged1 { @@ -181,43 +254,49 @@ impl Demeaner for MultiFEDemeaner { true } else { // Phase 2: 2-FE sub-convergence - let in_out_phase2 = ctx.scatter_residuals(input, &mu); - let mut coef_2fe = vec![0.0; n_coef_2fe]; + let in_out_phase2 = self.ctx.scatter_residuals(input, &self.buffers.mu); + self.buffers.coef_2fe.fill(0.0); let in_out_2fe: Vec = in_out_phase2[..n_coef_2fe].to_vec(); - let effective_input: Vec = (0..n_obs).map(|i| input[i] - mu[i]).collect(); - let mut projector2 = TwoFEProjector::new(ctx, &in_out_2fe, &effective_input); + // Compute effective input: input - mu + for i in 0..n_obs { + self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; + } + + let mut projector2 = + TwoFEProjector::new(self.ctx, &in_out_2fe, &self.buffers.effective_input); let (iter2, converged2) = IronsTuckGrand::run( &mut projector2, - &mut coef_2fe, - &mut two_buffers, - config, - config.maxiter / 2, + &mut self.buffers.coef_2fe, + &mut self.two_acc, + self.config, + self.config.maxiter / 2, ); total_iter += iter2; // Add 2-FE coefficients to mu - let fe0 = ctx.index.group_ids_for_fe(0); - let fe1 = ctx.index.group_ids_for_fe(1); + let fe0 = self.ctx.index.group_ids_for_fe(0); + let fe1 = self.ctx.index.group_ids_for_fe(1); for i in 0..n_obs { - mu[i] += coef_2fe[fe0[i]] + coef_2fe[n0 + fe1[i]]; + self.buffers.mu[i] += + self.buffers.coef_2fe[fe0[i]] + self.buffers.coef_2fe[n0 + fe1[i]]; } // Phase 3: Re-acceleration with all FEs (unless 2-FE converged fully) - 
let remaining = config.maxiter.saturating_sub(total_iter); + let remaining = self.config.maxiter.saturating_sub(total_iter); if remaining > 0 { - let in_out_phase3 = ctx.scatter_residuals(input, &mu); - coef.fill(0.0); - let mut projector3 = MultiFEProjector::new(ctx, &in_out_phase3, input); + let in_out_phase3 = self.ctx.scatter_residuals(input, &self.buffers.mu); + self.buffers.coef.fill(0.0); + let mut projector3 = MultiFEProjector::new(self.ctx, &in_out_phase3, input); let (iter3, converged3) = IronsTuckGrand::run( &mut projector3, - &mut coef, - &mut multi_buffers, - config, + &mut self.buffers.coef, + &mut self.multi_acc, + self.config, remaining, ); total_iter += iter3; - ctx.gather_and_add(&coef, &mut mu); + self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); converged3 } else { // No remaining iterations, use phase 2 convergence status @@ -226,39 +305,8 @@ impl Demeaner for MultiFEDemeaner { }; // Compute output: input - mu - let output: Vec = (0..n_obs).map(|i| input[i] - mu[i]).collect(); + let output: Vec = (0..n_obs).map(|i| input[i] - self.buffers.mu[i]).collect(); (output, total_iter, converged) } } - -// ============================================================================= -// Entry Point -// ============================================================================= - -/// Demean a single variable using the appropriate solver. -/// -/// Dispatches to the appropriate [`Demeaner`] implementation based on FE count. -/// -/// # Panics -/// -/// Panics in debug builds if `input.len() != ctx.index.n_obs`. 
-pub fn demean_single( - ctx: &DemeanContext, - input: &[f64], - config: &FixestConfig, -) -> (Vec, usize, bool) { - debug_assert_eq!( - input.len(), - ctx.index.n_obs, - "input length ({}) must match number of observations ({})", - input.len(), - ctx.index.n_obs - ); - - match ctx.index.n_fe { - 1 => SingleFEDemeaner::solve(ctx, input, config), - 2 => TwoFEDemeaner::solve(ctx, input, config), - _ => MultiFEDemeaner::solve(ctx, input, config), - } -} diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs index 9911f372f..96ae4d757 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean_accelerated/mod.rs @@ -31,7 +31,7 @@ pub mod demeaner; pub mod projection; pub mod types; -use demeaner::demean_single; +use demeaner::{Demeaner, MultiFEDemeaner, SingleFEDemeaner, TwoFEDemeaner}; use types::{DemeanContext, FixestConfig}; use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; @@ -41,7 +41,42 @@ use rayon::prelude::*; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; + +/// Thread-local demeaner state that wraps the appropriate demeaner type. +/// +/// This enum allows `for_each_init` to create a demeaner once per thread, +/// reusing its buffers across all columns processed by that thread. +enum ThreadLocalDemeaner<'a> { + Single(SingleFEDemeaner<'a>), + Two(TwoFEDemeaner<'a>), + Multi(MultiFEDemeaner<'a>), +} + +impl<'a> ThreadLocalDemeaner<'a> { + /// Create a new thread-local demeaner based on the FE count. + fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { + match ctx.index.n_fe { + 1 => ThreadLocalDemeaner::Single(SingleFEDemeaner::new(ctx)), + 2 => ThreadLocalDemeaner::Two(TwoFEDemeaner::new(ctx, config)), + _ => ThreadLocalDemeaner::Multi(MultiFEDemeaner::new(ctx, config)), + } + } + + /// Solve the demeaning problem, reusing internal buffers. 
+ #[inline] + fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + match self { + ThreadLocalDemeaner::Single(d) => d.solve(input), + ThreadLocalDemeaner::Two(d) => d.solve(input), + ThreadLocalDemeaner::Multi(d) => d.solve(input), + } + } +} + /// Demean using accelerated coefficient-space iteration. +/// +/// Uses `for_each_init` to create one demeaner per thread, reusing buffers +/// across all columns processed by that thread. pub(crate) fn demean_accelerated( x: &ArrayView2, flist: &ArrayView2, @@ -65,20 +100,25 @@ pub(crate) fn demean_accelerated( res.axis_iter_mut(ndarray::Axis(1)) .into_par_iter() .enumerate() - .for_each(|(k, mut col)| { - // Use ndarray's column view and convert to contiguous Vec - // (column() returns a non-contiguous view, to_vec() copies to contiguous) - let xk: Vec = x.column(k).to_vec(); - let (result, _iter, converged) = demean_single(&ctx, &xk, &config); + .for_each_init( + // Init closure: called once per thread to create thread-local state + || ThreadLocalDemeaner::new(&ctx, &config), + // Body closure: called for each column, reusing thread-local state + |demeaner, (k, mut col)| { + // Use ndarray's column view and convert to contiguous Vec + // (column() returns a non-contiguous view, to_vec() copies to contiguous) + let xk: Vec = x.column(k).to_vec(); + let (result, _iter, converged) = demeaner.solve(&xk); - if !converged { - not_converged.fetch_add(1, Ordering::SeqCst); - } + if !converged { + not_converged.fetch_add(1, Ordering::SeqCst); + } - Zip::from(&mut col).and(&result).for_each(|col_elm, &val| { - *col_elm = val; - }); - }); + Zip::from(&mut col).and(&result).for_each(|col_elm, &val| { + *col_elm = val; + }); + }, + ); let success = not_converged.load(Ordering::SeqCst) == 0; (res, success) @@ -109,6 +149,7 @@ pub fn _demean_accelerated_rs( #[cfg(test)] mod tests { use super::*; + use demeaner::{MultiFEDemeaner, SingleFEDemeaner}; use ndarray::{Array1, Array2}; #[test] @@ -128,7 +169,8 @@ mod tests { let input: 
Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); - let (result, iter, converged) = demean_single(&ctx, &input, &config); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let (result, iter, converged) = demeaner.solve(&input); assert!(converged, "Should converge"); assert!(iter < 100, "Should converge quickly"); @@ -153,9 +195,205 @@ mod tests { let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); - let (result, _iter, converged) = demean_single(&ctx, &input, &config); + let mut demeaner = MultiFEDemeaner::new(&ctx, &config); + let (result, _iter, converged) = demeaner.solve(&input); assert!(converged); assert!(result.iter().all(|&v| v.is_finite())); } + + #[test] + fn test_single_fe() { + let n_obs = 100; + let n_groups = 10; + + // Single fixed effect + let mut flist = Array2::::zeros((n_obs, 1)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let mut demeaner = SingleFEDemeaner::new(&ctx); + let (result, iter, converged) = demeaner.solve(&input); + + assert!(converged, "Single FE should always converge"); + assert_eq!(iter, 0, "Single FE should be closed-form (0 iterations)"); + + // Verify demeaning: each group's sum should be approximately 0 + for g in 0..n_groups { + let group_sum: f64 = result + .iter() + .enumerate() + .filter(|(i, _)| i % n_groups == g) + .map(|(_, &v)| v) + .sum(); + assert!( + group_sum.abs() < 1e-10, + "Group {} sum should be ~0, got {}", + g, + group_sum + ); + } + } + + #[test] + fn test_weighted_regression() { + let n_obs = 100; + let n_fe = 2; + + let mut flist = Array2::::zeros((n_obs, n_fe)); + for i in 0..n_obs { + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; + } + + // Non-uniform weights: 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, ... 
+ let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + + assert!( + !ctx.weights.is_uniform, + "Weights should be detected as non-uniform" + ); + + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let (result, _iter, converged) = demeaner.solve(&input); + + assert!(converged, "Weighted regression should converge"); + assert!( + result.iter().all(|&v| v.is_finite()), + "All results should be finite" + ); + } + + #[test] + fn test_singleton_groups() { + // Each observation in its own group for FE 0 (singleton groups) + let n_obs = 20; + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i; // Singleton groups (each obs is its own group) + flist[[i, 1]] = i % 4; // 4 groups in FE 1 + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let (result, _iter, converged) = demeaner.solve(&input); + + assert!(converged, "Singleton groups should converge"); + + // With singleton groups in FE 0, each observation's own mean is subtracted, + // then adjusted for FE 1. The result should be all zeros since each + // observation perfectly absorbs its own value in FE 0. 
+ assert!( + result.iter().all(|&v| v.abs() < 1e-10), + "Singleton groups should yield near-zero residuals" + ); + } + + #[test] + fn test_small_groups() { + // Test with very few observations per group + let n_obs = 30; + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i / 3; // 10 groups, 3 obs each + flist[[i, 1]] = i % 2; // 2 groups, 15 obs each + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let (result, _iter, converged) = demeaner.solve(&input); + + assert!(converged, "Small groups should converge"); + assert!( + result.iter().all(|&v| v.is_finite()), + "All results should be finite" + ); + } + + #[test] + fn test_uniform_weights_detection() { + let n_obs = 50; + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i % 5; + flist[[i, 1]] = i % 3; + } + + // Test uniform weights (all 1.0) + let uniform_weights = Array1::::ones(n_obs); + let ctx_uniform = DemeanContext::new(&flist.view(), &uniform_weights.view()); + assert!( + ctx_uniform.weights.is_uniform, + "All-ones weights should be detected as uniform" + ); + + // Test non-uniform weights + let mut non_uniform_weights = Array1::::ones(n_obs); + non_uniform_weights[0] = 2.0; + let ctx_non_uniform = DemeanContext::new(&flist.view(), &non_uniform_weights.view()); + assert!( + !ctx_non_uniform.weights.is_uniform, + "Varying weights should be detected as non-uniform" + ); + } + + #[test] + fn test_buffer_reuse_produces_same_results() { + // Test that solving multiple times with the same demeaner produces correct results + let n_obs = 100; + let n_fe = 2; + + let mut flist = Array2::::zeros((n_obs, n_fe)); + for i in 0..n_obs { + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; + } + + let weights = 
Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let config = FixestConfig::default(); + + // Create a single demeaner and use it multiple times + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + + let input1: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + let input2: Vec = (0..n_obs).map(|i| (i as f64) * 0.2 + 1.0).collect(); + + let (result1a, _, _) = demeaner.solve(&input1); + let (result2, _, _) = demeaner.solve(&input2); + let (result1b, _, _) = demeaner.solve(&input1); + + // Results for the same input should be identical + for (a, b) in result1a.iter().zip(result1b.iter()) { + assert!( + (a - b).abs() < 1e-12, + "Buffer reuse should produce identical results" + ); + } + + // Results for different inputs should be different + assert!( + result1a.iter().zip(result2.iter()).any(|(a, b)| (a - b).abs() > 0.01), + "Different inputs should produce different results" + ); + } } From 0ffdaea49ac58e1cf868b60213e5968da532579d Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Sat, 3 Jan 2026 20:54:20 +0100 Subject: [PATCH 06/24] Simplify accelerator architecture Remove unnecessary abstractions after experimentation phase: - Remove Accelerator trait in favor of direct IronsTuckGrand impl - Move config into IronsTuckGrand struct - Consolidate ConvergenceState and related types - Update to PyO3 0.26 API (allow_threads -> detach) --- benchmarks/bench_demean_r.R | 71 ++++ benchmarks/bench_native_comparison.py | 209 ++++++++++++ benchmarks/demean_benchmark.py | 456 +++++++++++++++++++++++++ src/demean.rs | 27 +- src/demean_accelerated/accelerator.rs | 473 +++++++++++++++----------- src/demean_accelerated/demeaner.rs | 83 ++--- src/demean_accelerated/mod.rs | 51 ++- src/demean_accelerated/projection.rs | 28 +- src/demean_accelerated/types.rs | 21 ++ 9 files changed, 1133 insertions(+), 286 deletions(-) create mode 100644 benchmarks/bench_demean_r.R create mode 100644 benchmarks/bench_native_comparison.py create mode 
100644 benchmarks/demean_benchmark.py diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R new file mode 100644 index 000000000..66bdc342a --- /dev/null +++ b/benchmarks/bench_demean_r.R @@ -0,0 +1,71 @@ +#!/usr/bin/env Rscript +# Benchmark fixest demeaning directly in R +# Usage: Rscript bench_demean_r.R [n_obs] [dgp_type] [n_fe] + +library(fixest) + +args <- commandArgs(trailingOnly = TRUE) +n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L +dgp_type <- if (length(args) >= 2) args[2] else "difficult" +n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L + +# Use 2 threads to match fixest_benchmarks settings +setFixest_nthreads(2) + +# Generate data matching Python benchmark DGP +set.seed(42) +n_year <- 10L +n_indiv_per_firm <- 23L +n_indiv <- max(1L, round(n_obs / n_year)) +n_firm <- max(1L, round(n_indiv / n_indiv_per_firm)) + +indiv_id <- rep(1:n_indiv, each = n_year)[1:n_obs] +year <- rep(1:n_year, times = n_indiv)[1:n_obs] + +if (dgp_type == "simple") { + firm_id <- sample(1:n_firm, n_obs, replace = TRUE) +} else { + # difficult: sequential assignment + firm_id <- rep(1:n_firm, length.out = n_obs) +} + +# Generate outcome +x1 <- rnorm(n_obs) +firm_fe <- rnorm(n_firm)[firm_id] +unit_fe <- rnorm(n_indiv)[indiv_id] +year_fe <- rnorm(n_year)[year] +y <- x1 + firm_fe + unit_fe + year_fe + rnorm(n_obs) + +df <- data.frame( + y = y, + x1 = x1, + indiv_id = indiv_id, + year = year, + firm_id = firm_id +) + +# Build formula based on n_fe +if (n_fe == 2) { + fml <- y ~ x1 | indiv_id + year +} else { + fml <- y ~ x1 | indiv_id + year + firm_id +} + +# Warm up +invisible(feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L)) + +# Benchmark +n_runs <- 5L +times <- numeric(n_runs) + +for (i in 1:n_runs) { + start <- Sys.time() + fit <- feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L) + end <- Sys.time() + times[i] <- as.numeric(end - start, units = "secs") * 1000 # ms +} + +cat(sprintf("fixest (R native) - 
n=%d, type=%s, %dFE\n", n_obs, dgp_type, n_fe)) +cat(sprintf(" Times (ms): %s\n", paste(round(times, 2), collapse = ", "))) +cat(sprintf(" Median: %.2f ms\n", median(times))) +cat(sprintf(" Min: %.2f ms\n", min(times))) diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py new file mode 100644 index 000000000..f45ffd08f --- /dev/null +++ b/benchmarks/bench_native_comparison.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +""" +Benchmark comparing pyfixest feols vs native fixest feols. + +Runs fixest directly in R to avoid rpy2 overhead, then compares with pyfixest. +This is a fair apples-to-apples comparison of full feols() routines. +""" + +from __future__ import annotations + +import os + +# Set thread count for Rayon (pyfixest) BEFORE importing pyfixest +os.environ["RAYON_NUM_THREADS"] = "2" + +import json +import subprocess +import time +from pathlib import Path +from statistics import median + +import numpy as np +import pandas as pd + + +def generate_dgp( + n: int, + dgp_type: str = "simple", + n_years: int = 10, + n_indiv_per_firm: int = 23, +) -> pd.DataFrame: + """Generate test data matching fixest benchmark DGP.""" + np.random.seed(42) + + n_indiv = max(1, round(n / n_years)) + n_firm = max(1, round(n_indiv / n_indiv_per_firm)) + + indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] + year = np.tile(np.arange(n_years), n_indiv)[:n] + + if dgp_type == "simple": + firm_id = np.random.randint(0, n_firm, size=n) + else: # difficult + firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] + + x1 = np.random.randn(n) + firm_fe = np.random.randn(n_firm)[firm_id] + unit_fe = np.random.randn(n_indiv)[indiv_id] + year_fe = np.random.randn(n_years)[year] + y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) + + return pd.DataFrame( + { + "y": y, + "x1": x1, + "indiv_id": indiv_id, + "year": year, + "firm_id": firm_id, + } + ) + + +def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> dict: + 
"""Run fixest benchmark in R subprocess.""" + r_script = Path(__file__).parent / "bench_demean_r.R" + + try: + result = subprocess.run( + ["Rscript", str(r_script), str(n_obs), dgp_type, str(n_fe)], + capture_output=True, + text=True, + timeout=300, + ) + + if result.returncode != 0: + return {"error": result.stderr, "times": [], "median": float("inf")} + + # Parse output + lines = result.stdout.strip().split("\n") + median_ms = None + for line in lines: + if "Median:" in line: + median_ms = float(line.split(":")[1].strip().replace(" ms", "")) + + return { + "median": median_ms if median_ms else float("inf"), + "output": result.stdout, + } + except subprocess.TimeoutExpired: + return {"error": "timeout", "median": float("inf")} + except FileNotFoundError: + return {"error": "R not found", "median": float("inf")} + + +def run_pyfixest_benchmark( + df: pd.DataFrame, + n_fe: int, + n_runs: int = 5, +) -> dict: + """Run pyfixest feols benchmark.""" + import pyfixest as pf + + # Build formula matching R benchmark + if n_fe == 2: + fml = "y ~ x1 | indiv_id + year" + else: + fml = "y ~ x1 | indiv_id + year + firm_id" + + # Warmup - use rust backend for accelerated demeaning + pf.feols(fml, data=df, demeaner_backend="rust") + + times = [] + for _ in range(n_runs): + start = time.perf_counter() + fit = pf.feols(fml, data=df, demeaner_backend="rust") + elapsed = (time.perf_counter() - start) * 1000 # ms + times.append(elapsed) + + return { + "median": median(times), + "times": times, + "coef": float(fit.coef().iloc[0]), + } + + +def main(): + """Run benchmark comparing pyfixest feols vs native fixest feols.""" + configs = [ + (10_000, "simple", 2), + (10_000, "difficult", 2), + (10_000, "simple", 3), + (10_000, "difficult", 3), + (100_000, "simple", 2), + (100_000, "difficult", 2), + (100_000, "simple", 3), + (100_000, "difficult", 3), + (1_000_000, "simple", 2), + (1_000_000, "difficult", 2), + (1_000_000, "simple", 3), + (1_000_000, "difficult", 3), + ] + + results = [] + 
+ print("=" * 70) + print("PyFixest feols() vs Fixest feols() Benchmark") + print("=" * 70) + + for n_obs, dgp_type, n_fe in configs: + print(f"\nConfig: n={n_obs:,}, type={dgp_type}, fe={n_fe}") + print("-" * 50) + + # Generate data + df = generate_dgp(n_obs, dgp_type) + + # Run R benchmark (feols) + r_result = run_r_benchmark(n_obs, dgp_type, n_fe) + r_time = r_result.get("median", float("inf")) + print(f" fixest (R): {r_time:8.2f} ms") + + # Run pyfixest benchmark (feols) + py_result = run_pyfixest_benchmark(df, n_fe) + py_time = py_result.get("median", float("inf")) + + if r_time > 0 and py_time < float("inf"): + ratio = py_time / r_time + print(f" pyfixest: {py_time:8.2f} ms ({ratio:.2f}x)") + else: + print(f" pyfixest: {py_time:8.2f} ms") + + results.append( + { + "n_obs": n_obs, + "dgp_type": dgp_type, + "n_fe": n_fe, + "fixest_r_ms": r_time, + "pyfixest_ms": py_time, + } + ) + + # Summary + print("\n" + "=" * 70) + print("SUMMARY (pyfixest feols vs fixest feols)") + print("=" * 70) + + print(f"{'Config':<35} {'fixest':>10} {'pyfixest':>10} {'ratio':>8}") + print("-" * 65) + + for r in results: + config = f"n={r['n_obs']:,} {r['dgp_type']:9} {r['n_fe']}FE" + fixest = r["fixest_r_ms"] + pyfixest = r["pyfixest_ms"] + + if fixest > 0 and fixest < float("inf") and pyfixest < float("inf"): + ratio = pyfixest / fixest + print(f"{config:<35} {fixest:>8.1f}ms {pyfixest:>8.1f}ms {ratio:>7.2f}x") + else: + print(f"{config:<35} {'N/A':>10} {'N/A':>10}") + + # Save results + output_path = Path(__file__).parent / "results" / "native_comparison.json" + output_path.parent.mkdir(exist_ok=True) + with open(output_path, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {output_path}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/demean_benchmark.py b/benchmarks/demean_benchmark.py new file mode 100644 index 000000000..6a587b75f --- /dev/null +++ b/benchmarks/demean_benchmark.py @@ -0,0 +1,456 @@ +#!/usr/bin/env python3 +""" 
+Benchmark script for comparing demeaning implementations. + +Oriented on fixest_benchmarks/bench_ols.R but focused on demeaning only +and optimized for fast iteration. + +Usage: + python benchmarks/demean_benchmark.py # Fast mode (~30s) + python benchmarks/demean_benchmark.py --full # Full mode (~5min) + python benchmarks/demean_benchmark.py --save # Save results to JSON +""" + +from __future__ import annotations + +import argparse +import json +import os +import time +from dataclasses import dataclass +from pathlib import Path +from statistics import median +from typing import Callable + +import numpy as np + + +@dataclass +class BenchmarkConfig: + """Configuration for a single benchmark run.""" + + n_obs: int + dgp_type: str # "simple" or "difficult" + n_fe: int + n_iters: int + + +@dataclass +class BenchmarkResult: + """Result of a benchmark run.""" + + config: BenchmarkConfig + backend: str + times: list[float] + median_time: float + available: bool + error: str | None = None + + +def generate_dgp( + n: int, + dgp_type: str = "simple", + n_years: int = 10, + n_indiv_per_firm: int = 23, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """ + Generate data matching fixest_benchmarks DGP. 
+ + Parameters + ---------- + n : int + Number of observations + dgp_type : str + "simple" (random firm assignment) or "difficult" (sequential) + n_years : int + Number of years + n_indiv_per_firm : int + Average individuals per firm + + Returns + ------- + x : np.ndarray + Matrix to demean (n, 2) - columns [y, x1] + indiv_id, year, firm_id : np.ndarray + Fixed effect ID arrays, each of shape (n,) + weights : np.ndarray + Sample weights (n,) + """ + n_indiv = max(1, round(n / n_years)) + n_firm = max(1, round(n_indiv / n_indiv_per_firm)) + + # Create FE IDs + indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] + year = np.tile(np.arange(n_years), n_indiv)[:n] + + if dgp_type == "simple": + # Random firm assignment - easier convergence + firm_id = np.random.randint(0, n_firm, size=n) + elif dgp_type == "difficult": + # Sequential firm assignment - harder convergence (messy data) + firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] + else: + raise ValueError(f"Unknown dgp_type: {dgp_type}") + + # Generate features + x1 = np.random.randn(n) + + # Generate y with FE structure + firm_fe = np.random.randn(n_firm)[firm_id] + unit_fe = np.random.randn(n_indiv)[indiv_id] + year_fe = np.random.randn(n_years)[year] + y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) + + # Stack into matrices + x = np.column_stack([y, x1])  # Demean both y and x1 + weights = np.ones(n) + + return x, indiv_id, year, firm_id, weights + + +def get_demean_backends() -> dict[str, Callable | None]: + """Get available demeaning backends with graceful fallbacks.""" + backends: dict[str, Callable | None] = {} + + # Rust accelerated (default) + try: + from pyfixest.core.demean import demean as demean_rust + + backends["rust-accelerated"] = demean_rust + except ImportError: + backends["rust-accelerated"] = None + + # Rust simple (via env var) + def demean_rust_simple(x, flist, weights, tol=1e-8, maxiter=100_000): + os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" + try: + from
pyfixest.core.demean import demean as demean_rust + + return demean_rust(x, flist, weights, tol, maxiter) + finally: + del os.environ["PYFIXEST_DEMEAN_SIMPLE"] + + backends["rust-simple"] = ( + demean_rust_simple if backends["rust-accelerated"] else None + ) + + # Numba + try: + from pyfixest.estimation.demean_ import demean as demean_numba + + backends["numba"] = demean_numba + except ImportError: + backends["numba"] = None + + # CuPy 32-bit + try: + from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32 + + backends["cupy32"] = demean_cupy32 + except ImportError: + backends["cupy32"] = None + + # CuPy 64-bit + try: + from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy64 + + backends["cupy64"] = demean_cupy64 + except ImportError: + backends["cupy64"] = None + + # R fixest via rpy2 - use feols with only FE (no covariates) to measure demean time + try: + import pandas as pd + import rpy2.robjects as ro + from rpy2.robjects import numpy2ri, pandas2ri + from rpy2.robjects.packages import importr + + numpy2ri.activate() + pandas2ri.activate() + importr("fixest") # Load fixest package + + def demean_fixest(x, flist, weights, tol=1e-8, maxiter=100_000): + # Create a minimal regression problem that exercises the demeaning + _n, k = x.shape + n_fe = flist.shape[1] if flist.ndim > 1 else 1 + + # Build a dataframe with y and FE columns + data = {"y": x[:, 0]} + fe_names = [] + for j in range(n_fe): + fe_col = f"fe{j + 1}" + fe_names.append(fe_col) + if flist.ndim > 1: + data[fe_col] = flist[:, j].astype(int) + else: + data[fe_col] = flist.astype(int) + + df = pd.DataFrame(data) + r_df = pandas2ri.py2rpy(df) + + # Build formula: y ~ 1 | fe1 + fe2 + ... 
+ fe_formula = " + ".join(fe_names) + formula = f"y ~ 1 | {fe_formula}" + + # Call feols (this includes demeaning time) + ro.r.assign("df", r_df) + ro.r(f"result <- fixest::feols({formula}, data=df, nthreads=1)") + + # Return the residuals as "demeaned" values + resid = np.array(ro.r("residuals(result)")) + result = np.column_stack([resid] + [x[:, j] for j in range(1, k)]) + return result, True + + backends["fixest"] = demean_fixest + except (ImportError, Exception): + backends["fixest"] = None + + return backends + + +def run_single_benchmark( + demean_func: Callable, + x: np.ndarray, + flist: np.ndarray, + weights: np.ndarray, + n_iters: int, +) -> list[float]: + """Run a single benchmark configuration multiple times.""" + times = [] + + for _ in range(n_iters): + # Copy arrays to avoid caching effects + x_copy = x.copy() + + start = time.perf_counter() + demean_func(x_copy, flist, weights) + elapsed = time.perf_counter() - start + + times.append(elapsed) + + return times + + +def run_benchmarks( + configs: list[BenchmarkConfig], + backends: dict[str, Callable | None], +) -> list[BenchmarkResult]: + """Run all benchmark configurations across all backends.""" + results = [] + + for config in configs: + print(f"\n{'=' * 60}") + print(f"Config: n={config.n_obs:,}, type={config.dgp_type}, fe={config.n_fe}") + print("=" * 60) + + # Generate data + x, indiv_id, year, firm_id, weights = generate_dgp( + config.n_obs, config.dgp_type + ) + + # Build flist based on n_fe + if config.n_fe == 2: + flist = np.column_stack([indiv_id, year]).astype(np.uint64) + else: # n_fe == 3 + flist = np.column_stack([indiv_id, year, firm_id]).astype(np.uint64) + + for backend_name, demean_func in backends.items(): + if demean_func is None: + result = BenchmarkResult( + config=config, + backend=backend_name, + times=[], + median_time=float("inf"), + available=False, + error="Not installed", + ) + results.append(result) + print(f" {backend_name:20s}: not available") + continue + + try: + 
times = run_single_benchmark( + demean_func, x, flist, weights, config.n_iters + ) + med_time = median(times) + result = BenchmarkResult( + config=config, + backend=backend_name, + times=times, + median_time=med_time, + available=True, + ) + results.append(result) + print( + f" {backend_name:20s}: {med_time * 1000:8.2f} ms (median of {len(times)})" + ) + except Exception as e: + result = BenchmarkResult( + config=config, + backend=backend_name, + times=[], + median_time=float("inf"), + available=False, + error=str(e), + ) + results.append(result) + print(f" {backend_name:20s}: ERROR - {e}") + + return results + + +def print_summary(results: list[BenchmarkResult]) -> None: + """Print a summary table of results.""" + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + # Group by config + configs = sorted( + set((r.config.n_obs, r.config.dgp_type, r.config.n_fe) for r in results) + ) + + backends = sorted(set(r.backend for r in results)) + + # Header + header = f"{'Config':30s}" + for backend in backends: + header += f" {backend:>12s}" + print(header) + print("-" * len(header)) + + # Find fixest baseline for relative comparison + fixest_times = {} + for r in results: + if r.backend == "fixest" and r.available: + key = (r.config.n_obs, r.config.dgp_type, r.config.n_fe) + fixest_times[key] = r.median_time + + # Rows + for n_obs, dgp_type, n_fe in configs: + config_str = f"n={n_obs:,} {dgp_type:9s} {n_fe}FE" + row = f"{config_str:30s}" + + key = (n_obs, dgp_type, n_fe) + baseline = fixest_times.get(key) + + for backend in backends: + matching = [ + r + for r in results + if r.config.n_obs == n_obs + and r.config.dgp_type == dgp_type + and r.config.n_fe == n_fe + and r.backend == backend + ] + if matching and matching[0].available: + time_ms = matching[0].median_time * 1000 + if baseline and backend != "fixest": + ratio = matching[0].median_time / baseline + row += f" {time_ms:7.1f}ms({ratio:.1f}x)" + else: + row += f" {time_ms:12.1f}ms" + else: + row += f" 
{'N/A':>12s}" + + print(row) + + +def save_results(results: list[BenchmarkResult], path: Path) -> None: + """Save results to JSON.""" + data = [] + for r in results: + data.append( + { + "n_obs": r.config.n_obs, + "dgp_type": r.config.dgp_type, + "n_fe": r.config.n_fe, + "n_iters": r.config.n_iters, + "backend": r.backend, + "times": r.times, + "median_time": r.median_time if r.median_time != float("inf") else None, + "available": r.available, + "error": r.error, + } + ) + + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w") as f: + json.dump(data, f, indent=2) + print(f"\nResults saved to {path}") + + +def main(): + """Run demeaning benchmarks.""" + parser = argparse.ArgumentParser(description="Benchmark demeaning implementations") + parser.add_argument( + "--full", action="store_true", help="Run full benchmark (slower)" + ) + parser.add_argument("--save", action="store_true", help="Save results to JSON") + parser.add_argument( + "--output", + type=Path, + default=Path("benchmarks/results/benchmark.json"), + help="Output path for results", + ) + args = parser.parse_args() + + # Define configurations + if args.full: + configs = [ + # Small (fast) + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), + # Medium + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), + # Large + BenchmarkConfig(n_obs=500_000, dgp_type="simple", n_fe=2, n_iters=2), + BenchmarkConfig(n_obs=500_000, dgp_type="difficult", n_fe=2, n_iters=2), + BenchmarkConfig(n_obs=1_000_000, dgp_type="simple", n_fe=2, 
n_iters=1), + BenchmarkConfig(n_obs=1_000_000, dgp_type="difficult", n_fe=2, n_iters=1), + ] + else: + # Fast mode - minimal configs for quick iteration + configs = [ + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), + BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), + ] + + print("Demeaning Benchmark") + print("=" * 60) + print(f"Mode: {'full' if args.full else 'fast'}") + print(f"Configurations: {len(configs)}") + + # Get available backends + backends = get_demean_backends() + available = [name for name, func in backends.items() if func is not None] + unavailable = [name for name, func in backends.items() if func is None] + + print(f"Available backends: {', '.join(available)}") + if unavailable: + print(f"Unavailable backends: {', '.join(unavailable)}") + + # Run benchmarks + results = run_benchmarks(configs, backends) + + # Print summary + print_summary(results) + + # Save if requested + if args.save: + save_results(results, args.output) + + +if __name__ == "__main__": + main() diff --git a/src/demean.rs b/src/demean.rs index 8d04414db..22098bade 100644 --- a/src/demean.rs +++ b/src/demean.rs @@ -2,7 +2,6 @@ use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; use rayon::prelude::*; -use std::env; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -66,29 +65,6 @@ fn demean_impl( weights: &ArrayView1, tol: f64, maxiter: usize, -) -> (Array2, bool) { - // Allow benchmarks to force the simple 
implementation for apples-to-apples comparisons. - if env::var("PYFIXEST_DEMEAN_SIMPLE").is_ok() { - return demean_simple_impl(x, flist, weights, tol, maxiter); - } - - // Use the accelerated Rust implementation by default. If it fails to converge, - // fall back to the reference implementation to guarantee correctness. - let (accel, success) = - crate::demean_accelerated::demean_accelerated(x, flist, weights, tol, maxiter); - if success { - return (accel, true); - } - - demean_simple_impl(x, flist, weights, tol, maxiter) -} - -fn demean_simple_impl( - x: &ArrayView2, - flist: &ArrayView2, - weights: &ArrayView1, - tol: f64, - maxiter: usize, ) -> (Array2, bool) { let (n_samples, n_features) = x.dim(); let n_factors = flist.ncols(); @@ -235,7 +211,8 @@ pub fn _demean_rs( let flist_arr = flist.as_array(); let weights_arr = weights.as_array(); - let (out, success) = py.detach(|| demean_impl(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + let (out, success) = + py.detach(|| demean_impl(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); let pyarray = PyArray2::from_owned_array(py, out); Ok((pyarray.into(), success)) diff --git a/src/demean_accelerated/accelerator.rs b/src/demean_accelerated/accelerator.rs index 9733e6c01..0350e7786 100644 --- a/src/demean_accelerated/accelerator.rs +++ b/src/demean_accelerated/accelerator.rs @@ -1,87 +1,98 @@ //! Acceleration strategies for fixed effects demeaning. //! -//! This module provides the [`Accelerator`] trait for iteration acceleration, -//! with the default implementation [`IronsTuckGrand`] matching fixest's algorithm. +//! This module provides [`IronsTuckGrand`], the acceleration strategy matching +//! fixest's implementation. 
use crate::demean_accelerated::projection::Projector; -use crate::demean_accelerated::types::FixestConfig; +use crate::demean_accelerated::types::{ConvergenceState, FixestConfig}; // ============================================================================= -// Accelerator Trait +// Internal Types // ============================================================================= -/// An acceleration strategy for iterative demeaning. +/// Phase of grand acceleration state machine. /// -/// Accelerators take a [`Projector`] and repeatedly apply it until convergence, -/// using various techniques to speed up convergence. +/// Grand acceleration applies Irons-Tuck at a coarser timescale to capture +/// long-range convergence patterns. It collects 3 snapshots of `gx` at +/// `iter_grand_acc` intervals, then applies Irons-Tuck to those snapshots. /// -/// # Associated Types +/// # State transitions /// -/// Each accelerator has its own buffer type, as different strategies require -/// different working memory (e.g., Irons-Tuck needs snapshots for extrapolation). -pub trait Accelerator { - /// Working buffers needed by this acceleration strategy. - type Buffers; +/// ```text +/// Collect1st ──> Collect2nd ──> Collect3rdAndAccelerate ──┐ +/// ^ │ +/// └───────────────────────────────────────────────────┘ +/// ``` +/// +/// Actual acceleration happens every `3 × iter_grand_acc` iterations. +#[derive(Clone, Copy, Default)] +enum GrandPhase { + /// Store current `gx` as first snapshot (y buffer). + #[default] + Collect1st, + /// Store current `gx` as second snapshot (gy buffer). + Collect2nd, + /// Store current `gx` as third snapshot (ggy buffer), then accelerate. + Collect3rdAndAccelerate, +} - /// Create buffers for the given coefficient count. - fn create_buffers(n_coef: usize) -> Self::Buffers; +/// Result of a grand acceleration step. 
+/// +/// Grand acceleration operates on a coarser timescale than regular Irons-Tuck, +/// collecting snapshots every `iter_grand_acc` iterations to capture long-range +/// convergence patterns. +enum GrandStepResult { + /// Continue with the next phase of snapshot collection. + Continue(GrandPhase), + /// Grand acceleration detected convergence; iteration can stop. + Done(ConvergenceState), +} - /// Check if two scalar values have converged within tolerance. - /// - /// Uses both absolute and relative tolerance: converged if - /// `|a - b| <= tol` OR `|a - b| <= tol * (0.1 + |a|)`. - /// - /// The `0.1` denominator offset prevents division by zero and provides - /// a smooth transition between absolute tolerance (when |a| << 0.1) and - /// relative tolerance (when |a| >> 0.1). This matches fixest's convergence check. - /// - /// # Implementation Note - /// - /// The relative tolerance check `|a - b| / (0.1 + |a|) <= tol` is rewritten - /// as `|a - b| <= tol * (0.1 + |a|)` to avoid division, improving performance - /// and SIMD-friendliness. - #[inline] - fn converged(a: f64, b: f64, tol: f64) -> bool { - // 0.1 offset: ensures numerical stability and smooth absolute/relative transition - const RELATIVE_TOL_OFFSET: f64 = 0.1; - let diff = (a - b).abs(); - // Absolute tolerance check (faster, handles small values) - // OR relative tolerance check (multiplication form, avoids division) - (diff <= tol) || (diff <= tol * (RELATIVE_TOL_OFFSET + a.abs())) - } +/// Buffers for Irons-Tuck + Grand acceleration. 
+/// +/// # Regular Irons-Tuck buffers +/// +/// - `gx`: G(x), result of one projection +/// - `ggx`: G(G(x)), result of two projections +/// - `temp`: temporary for post-acceleration projection +/// +/// # Grand acceleration buffers +/// +/// These store snapshots of `gx` at different times (separated by `iter_grand_acc`): +/// - `y`: 1st snapshot of gx +/// - `gy`: 2nd snapshot of gx +/// - `ggy`: 3rd snapshot of gx +/// +/// Note: The names follow fixest's convention. Despite the names, these are NOT +/// nested projections (G(y), G(G(y))), but rather time-separated snapshots that +/// are then fed to Irons-Tuck as if they were successive iterates. +struct IronsTuckGrandBuffers { + /// G(x): Result of one projection step (regular Irons-Tuck). + gx: Vec, + /// G(G(x)): Result of two projection steps (regular Irons-Tuck). + ggx: Vec, + /// Temporary buffer for post-acceleration projection. + temp: Vec, + /// Grand acceleration: 1st snapshot of gx. + y: Vec, + /// Grand acceleration: 2nd snapshot of gx. + gy: Vec, + /// Grand acceleration: 3rd snapshot of gx. + ggy: Vec, +} - /// Check if coefficient arrays have NOT converged (should keep iterating). - /// - /// Returns `true` if ANY pair of coefficients differs by more than tolerance. - /// Uses early-exit: returns as soon as any non-converged pair is found. - #[inline] - fn should_continue(coef_old: &[f64], coef_new: &[f64], tol: f64) -> bool { - coef_old - .iter() - .zip(coef_new.iter()) - .any(|(&a, &b)| !Self::converged(a, b, tol)) +impl IronsTuckGrandBuffers { + /// Create new buffers for the given coefficient count. + fn new(n_coef: usize) -> Self { + Self { + gx: vec![0.0; n_coef], + ggx: vec![0.0; n_coef], + temp: vec![0.0; n_coef], + y: vec![0.0; n_coef], + gy: vec![0.0; n_coef], + ggy: vec![0.0; n_coef], + } } - - /// Run the acceleration loop to convergence. 
- /// - /// # Arguments - /// - /// * `projector` - The projection operation to accelerate - /// * `coef` - Initial coefficients (modified in place with final result) - /// * `buffers` - Working buffers for the acceleration - /// * `config` - Algorithm configuration (tolerance, etc.) - /// * `max_iter` - Maximum iterations before giving up - /// - /// # Returns - /// - /// Tuple of (iterations_used, converged_flag) - fn run( - projector: &mut P, - coef: &mut [f64], - buffers: &mut Self::Buffers, - config: &FixestConfig, - max_iter: usize, - ) -> (usize, bool); } // ============================================================================= @@ -99,167 +110,244 @@ pub trait Accelerator { /// 2. **Grand acceleration**: Every `iter_grand_acc` iterations, applies Irons-Tuck /// at a coarser level to accelerate long-range convergence. /// -/// Additionally, SSR (sum of squared residuals) is checked every 40 iterations -/// as a secondary convergence criterion. The interval of 40 balances overhead -/// (SSR computation is O(n)) against catching convergence that coefficient -/// checks might miss. -pub struct IronsTuckGrand; - -/// Interval for SSR-based convergence checks (every N iterations). -/// Matches fixest's check frequency for secondary convergence criterion. -const SSR_CHECK_INTERVAL: usize = 40; - -/// Buffers for Irons-Tuck + Grand acceleration. -pub struct IronsTuckGrandBuffers { - /// G(x): Result of one projection step. - pub gx: Vec, - /// G(G(x)): Result of two projection steps. - pub ggx: Vec, - /// Temporary buffer for post-acceleration projection. - pub temp: Vec, - /// Grand acceleration: y snapshot. - pub y: Vec, - /// Grand acceleration: G(y) snapshot. - pub gy: Vec, - /// Grand acceleration: G(G(y)) snapshot. - pub ggy: Vec, +/// Additionally, SSR (sum of squared residuals) is checked every `ssr_check_interval` +/// iterations as a secondary convergence criterion. 
+pub struct IronsTuckGrand { + /// Algorithm configuration (tolerance, iteration parameters). + config: FixestConfig, + /// Working buffers for the acceleration algorithm. + buffers: IronsTuckGrandBuffers, } impl IronsTuckGrand { - /// Apply Irons-Tuck acceleration to speed up convergence. - /// - /// Given three successive iterates x, G(x), G(G(x)), computes an accelerated - /// update that often converges faster than simple iteration. - /// - /// Returns `true` if already converged (denominator is zero), `false` otherwise. - #[inline(always)] - fn accelerate(x: &mut [f64], gx: &[f64], ggx: &[f64]) -> bool { - let (vprod, ssq) = x - .iter() - .zip(gx.iter()) - .zip(ggx.iter()) - .map(|((&x_i, &gx_i), &ggx_i)| { - let delta_gx = ggx_i - gx_i; - let delta2_x = delta_gx - gx_i + x_i; - (delta_gx * delta2_x, delta2_x * delta2_x) - }) - .fold((0.0, 0.0), |(vp, sq), (dvp, dsq)| (vp + dvp, sq + dsq)); - - if ssq == 0.0 { - return true; - } - - let coef = vprod / ssq; - x.iter_mut() - .zip(gx.iter()) - .zip(ggx.iter()) - .for_each(|((x_i, &gx_i), &ggx_i)| { - *x_i = ggx_i - coef * (ggx_i - gx_i); - }); - - false - } -} - -impl Accelerator for IronsTuckGrand { - type Buffers = IronsTuckGrandBuffers; - + /// Create a new accelerator with the given configuration and buffer size. #[inline] - fn create_buffers(n_coef: usize) -> Self::Buffers { - IronsTuckGrandBuffers { - gx: vec![0.0; n_coef], - ggx: vec![0.0; n_coef], - temp: vec![0.0; n_coef], - y: vec![0.0; n_coef], - gy: vec![0.0; n_coef], - ggy: vec![0.0; n_coef], + pub fn new(config: FixestConfig, n_coef: usize) -> Self { + Self { + config, + buffers: IronsTuckGrandBuffers::new(n_coef), } } - fn run( + /// Run the acceleration loop to convergence. 
+ /// + /// # Arguments + /// + /// * `projector` - The projection operation to accelerate + /// * `coef` - Initial coefficients (modified in place with final result) + /// * `max_iter` - Maximum iterations before giving up + /// + /// # Returns + /// + /// Tuple of (iterations_used, convergence_state) + pub fn run( + &mut self, projector: &mut P, coef: &mut [f64], - buffers: &mut Self::Buffers, - config: &FixestConfig, max_iter: usize, - ) -> (usize, bool) { + ) -> (usize, ConvergenceState) { + // Verify buffer size matches projector's coefficient count + debug_assert_eq!( + self.buffers.gx.len(), + projector.coef_len(), + "Accelerator buffer size ({}) must match projector coef_len ({})", + self.buffers.gx.len(), + projector.coef_len() + ); + let conv_len = projector.convergence_len(); // Initial projection - projector.project(coef, &mut buffers.gx); - - let mut keep_going = - Self::should_continue(&coef[..conv_len], &buffers.gx[..conv_len], config.tol); + projector.project(coef, &mut self.buffers.gx); + + let mut convergence = if Self::should_continue( + &coef[..conv_len], + &self.buffers.gx[..conv_len], + self.config.tol, + ) { + ConvergenceState::NotConverged + } else { + ConvergenceState::Converged + }; let mut iter = 0; - let mut grand_counter = 0usize; + let mut grand_phase = GrandPhase::default(); let mut ssr = 0.0; - while keep_going && iter < max_iter { + while convergence == ConvergenceState::NotConverged && iter < max_iter { iter += 1; // Double projection for Irons-Tuck: G(G(x)) - projector.project(&buffers.gx, &mut buffers.ggx); + projector.project(&self.buffers.gx, &mut self.buffers.ggx); // Irons-Tuck acceleration - if Self::accelerate( + let accel_convergence = Self::accelerate( &mut coef[..conv_len], - &buffers.gx[..conv_len], - &buffers.ggx[..conv_len], - ) { + &self.buffers.gx[..conv_len], + &self.buffers.ggx[..conv_len], + ); + if accel_convergence == ConvergenceState::Converged { + convergence = ConvergenceState::Converged; break; } // 
Post-acceleration projection (after warmup) - if iter >= config.iter_proj_after_acc { - buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); - projector.project(&buffers.temp, coef); + if iter >= self.config.iter_proj_after_acc { + self.buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); + projector.project(&self.buffers.temp, coef); } // Update gx for convergence check - projector.project(coef, &mut buffers.gx); - keep_going = - Self::should_continue(&coef[..conv_len], &buffers.gx[..conv_len], config.tol); + projector.project(coef, &mut self.buffers.gx); + convergence = if Self::should_continue( + &coef[..conv_len], + &self.buffers.gx[..conv_len], + self.config.tol, + ) { + ConvergenceState::NotConverged + } else { + ConvergenceState::Converged + }; // Grand acceleration (every iter_grand_acc iterations) - if iter % config.iter_grand_acc == 0 { - grand_counter += 1; - match grand_counter { - 1 => { - buffers.y[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); - } - 2 => { - buffers.gy[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); - } - _ => { - buffers.ggy[..conv_len].copy_from_slice(&buffers.gx[..conv_len]); - if Self::accelerate( - &mut buffers.y[..conv_len], - &buffers.gy[..conv_len], - &buffers.ggy[..conv_len], - ) { - break; - } - projector.project(&buffers.y, &mut buffers.gx); - grand_counter = 0; + if iter % self.config.iter_grand_acc == 0 { + match self.grand_acceleration_step(grand_phase, projector, conv_len) { + GrandStepResult::Continue(next) => grand_phase = next, + GrandStepResult::Done(state) => { + convergence = state; + break; } } } - // SSR convergence check (every SSR_CHECK_INTERVAL iterations) - if iter % SSR_CHECK_INTERVAL == 0 { + // SSR convergence check (every ssr_check_interval iterations) + if iter % self.config.ssr_check_interval == 0 { let ssr_old = ssr; - ssr = projector.compute_ssr(&buffers.gx); + ssr = projector.compute_ssr(&self.buffers.gx); - if iter > SSR_CHECK_INTERVAL && Self::converged(ssr_old, ssr, 
config.tol) { - keep_going = false; + if iter > self.config.ssr_check_interval + && Self::converged(ssr_old, ssr, self.config.tol) + { + convergence = ConvergenceState::Converged; break; } } } // Copy final result - coef.copy_from_slice(&buffers.gx); - (iter, !keep_going) + coef.copy_from_slice(&self.buffers.gx); + (iter, convergence) + } + + /// Apply Irons-Tuck acceleration to speed up convergence. + /// + /// Given three successive iterates x, G(x), G(G(x)), extrapolates toward + /// the fixed point using the formula from Irons & Tuck (1969). + /// + /// The method computes second differences `δ²x = G(G(x)) - 2G(x) + x` and uses + /// them to estimate how far we are from the fixed point. If second differences + /// are zero, we've already converged. + #[inline(always)] + fn accelerate(x: &mut [f64], gx: &[f64], ggx: &[f64]) -> ConvergenceState { + let (vprod, ssq) = x + .iter() + .zip(gx.iter()) + .zip(ggx.iter()) + .map(|((&x_i, &gx_i), &ggx_i)| { + let delta_gx = ggx_i - gx_i; + let delta2_x = delta_gx - gx_i + x_i; + (delta_gx * delta2_x, delta2_x * delta2_x) + }) + .fold((0.0, 0.0), |(vp, sq), (dvp, dsq)| (vp + dvp, sq + dsq)); + + if ssq == 0.0 { + return ConvergenceState::Converged; + } + + let coef = vprod / ssq; + x.iter_mut() + .zip(gx.iter()) + .zip(ggx.iter()) + .for_each(|((x_i, &gx_i), &ggx_i)| { + *x_i = ggx_i - coef * (ggx_i - gx_i); + }); + + ConvergenceState::NotConverged + } + + /// Perform one step of grand acceleration. + /// + /// Grand acceleration applies Irons-Tuck at a coarser timescale to capture + /// long-range convergence patterns that fine-grained iteration might miss. + /// + /// # How it works + /// + /// Every `iter_grand_acc` iterations, this function is called to advance a + /// 3-phase state machine: + /// + /// 1. **Collect1st**: Store current `gx` as the first snapshot (`y`) + /// 2. **Collect2nd**: Store current `gx` as the second snapshot (`gy`) + /// 3. 
**Collect3rdAndAccelerate**: Store current `gx` as third snapshot (`ggy`), + /// then apply Irons-Tuck to (y, gy, ggy) to extrapolate toward the fixed point + /// + /// After phase 3, the cycle repeats. This means actual acceleration happens + /// every `3 × iter_grand_acc` iterations. + #[inline] + fn grand_acceleration_step( + &mut self, + phase: GrandPhase, + projector: &mut P, + conv_len: usize, + ) -> GrandStepResult { + match phase { + GrandPhase::Collect1st => { + self.buffers.y[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + GrandStepResult::Continue(GrandPhase::Collect2nd) + } + GrandPhase::Collect2nd => { + self.buffers.gy[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + GrandStepResult::Continue(GrandPhase::Collect3rdAndAccelerate) + } + GrandPhase::Collect3rdAndAccelerate => { + self.buffers.ggy[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + let convergence = Self::accelerate( + &mut self.buffers.y[..conv_len], + &self.buffers.gy[..conv_len], + &self.buffers.ggy[..conv_len], + ); + if convergence == ConvergenceState::Converged { + return GrandStepResult::Done(ConvergenceState::Converged); + } + projector.project(&self.buffers.y, &mut self.buffers.gx); + GrandStepResult::Continue(GrandPhase::Collect1st) + } + } + } + + /// Check if two scalar values have converged within tolerance. + /// + /// Uses both absolute and relative tolerance: converged if + /// `|a - b| <= tol` OR `|a - b| <= tol * (0.1 + |a|)`. + /// + /// The `0.1` denominator offset prevents division by zero and provides + /// a smooth transition between absolute tolerance (when |a| << 0.1) and + /// relative tolerance (when |a| >> 0.1). This matches fixest's convergence check. 
+ #[inline] + fn converged(a: f64, b: f64, tol: f64) -> bool { + const RELATIVE_TOL_OFFSET: f64 = 0.1; + let diff = (a - b).abs(); + (diff <= tol) || (diff <= tol * (RELATIVE_TOL_OFFSET + a.abs())) + } + + /// Check if coefficient arrays have NOT converged (should keep iterating). + /// + /// Returns `true` if ANY pair of coefficients differs by more than tolerance. + /// Uses early-exit: returns as soon as any non-converged pair is found. + #[inline] + fn should_continue(coef_old: &[f64], coef_new: &[f64], tol: f64) -> bool { + coef_old + .iter() + .zip(coef_new.iter()) + .any(|(&a, &b)| !Self::converged(a, b, tol)) } } @@ -288,6 +376,7 @@ mod tests { fn test_irons_tuck_grand_convergence() { let (ctx, input) = create_test_problem(100); let config = FixestConfig::default(); + let maxiter = config.maxiter; let n0 = ctx.index.n_groups[0]; let n1 = ctx.index.n_groups[1]; @@ -295,13 +384,15 @@ mod tests { let in_out = ctx.scatter_to_coefficients(&input); let mut coef = vec![0.0; n_coef]; - let mut buffers = IronsTuckGrand::create_buffers(n_coef); + let mut accelerator = IronsTuckGrand::new(config, n_coef); let mut projector = TwoFEProjector::new(&ctx, &in_out, &input); - let (iter, converged) = - IronsTuckGrand::run(&mut projector, &mut coef, &mut buffers, &config, config.maxiter); + let (iter, convergence) = accelerator.run(&mut projector, &mut coef, maxiter); - assert!(converged, "IronsTuckGrand should converge"); + assert!( + convergence == ConvergenceState::Converged, + "IronsTuckGrand should converge" + ); assert!(iter < 100, "Should converge in less than 100 iterations"); } } diff --git a/src/demean_accelerated/demeaner.rs b/src/demean_accelerated/demeaner.rs index 9f131b6b2..d822dc326 100644 --- a/src/demean_accelerated/demeaner.rs +++ b/src/demean_accelerated/demeaner.rs @@ -13,9 +13,9 @@ //! This is important for parallel processing where each thread can have its own //! demeaner instance that reuses buffers across columns. 
-use crate::demean_accelerated::accelerator::{Accelerator, IronsTuckGrand, IronsTuckGrandBuffers}; +use crate::demean_accelerated::accelerator::IronsTuckGrand; use crate::demean_accelerated::projection::{MultiFEProjector, TwoFEProjector}; -use crate::demean_accelerated::types::{DemeanContext, FixestConfig}; +use crate::demean_accelerated::types::{ConvergenceState, DemeanContext, FixestConfig}; // ============================================================================= // Demeaner Trait @@ -30,8 +30,8 @@ pub trait Demeaner { /// /// # Returns /// - /// Tuple of (demeaned_output, iterations_used, converged_flag) - fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool); + /// Tuple of (demeaned_output, iterations_used, convergence_state) + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState); } // ============================================================================= @@ -54,7 +54,7 @@ impl<'a> SingleFEDemeaner<'a> { } impl Demeaner for SingleFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { let n_obs = self.ctx.index.n_obs; let output = vec![0.0; n_obs]; @@ -74,7 +74,8 @@ impl Demeaner for SingleFEDemeaner<'_> { // output[i] = input[i] - coef[fe0[i]] let output: Vec = (0..n_obs).map(|i| input[i] - coef[fe0[i]]).collect(); - (output, 0, true) + // Single FE is a closed-form solution, always converges in 0 iterations + (output, 0, ConvergenceState::Converged) } } @@ -90,8 +91,8 @@ pub struct TwoFEDemeaner<'a> { config: &'a FixestConfig, /// Coefficient array [alpha | beta], reused across solves coef: Vec, - /// Acceleration buffers, reused across solves - buffers: IronsTuckGrandBuffers, + /// Accelerator with internal buffers, reused across solves + accelerator: IronsTuckGrand, } impl<'a> TwoFEDemeaner<'a> { @@ -106,13 +107,13 @@ impl<'a> TwoFEDemeaner<'a> { ctx, config, coef: vec![0.0; n_coef], - buffers: 
IronsTuckGrand::create_buffers(n_coef), + accelerator: IronsTuckGrand::new(*config, n_coef), } } } impl Demeaner for TwoFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { let n_obs = self.ctx.index.n_obs; let n0 = self.ctx.index.n_groups[0]; @@ -125,14 +126,10 @@ impl Demeaner for TwoFEDemeaner<'_> { // Create projector (lightweight, references in_out and input) let mut projector = TwoFEProjector::new(self.ctx, &in_out, input); - // Run acceleration loop with reused buffers - let (iter, converged) = IronsTuckGrand::run( - &mut projector, - &mut self.coef, - &mut self.buffers, - self.config, - self.config.maxiter, - ); + // Run acceleration loop + let (iter, convergence) = self + .accelerator + .run(&mut projector, &mut self.coef, self.config.maxiter); // Reconstruct output: input - alpha - beta let fe0 = self.ctx.index.group_ids_for_fe(0); @@ -142,7 +139,7 @@ impl Demeaner for TwoFEDemeaner<'_> { .map(|i| input[i] - self.coef[fe0[i]] - self.coef[n0 + fe1[i]]) .collect(); - (result, iter, converged) + (result, iter, convergence) } } @@ -198,10 +195,10 @@ pub struct MultiFEDemeaner<'a> { config: &'a FixestConfig, /// Working buffers for coefficient and observation arrays buffers: MultiFEBuffers, - /// Acceleration buffers for multi-FE iterations - multi_acc: IronsTuckGrandBuffers, - /// Acceleration buffers for 2-FE sub-convergence - two_acc: IronsTuckGrandBuffers, + /// Accelerator for multi-FE iterations + multi_acc: IronsTuckGrand, + /// Accelerator for 2-FE sub-convergence + two_acc: IronsTuckGrand, } impl<'a> MultiFEDemeaner<'a> { @@ -218,14 +215,14 @@ impl<'a> MultiFEDemeaner<'a> { ctx, config, buffers: MultiFEBuffers::new(n_obs, n_coef, n_coef_2fe), - multi_acc: IronsTuckGrand::create_buffers(n_coef), - two_acc: IronsTuckGrand::create_buffers(n_coef_2fe), + multi_acc: IronsTuckGrand::new(*config, n_coef), + two_acc: IronsTuckGrand::new(*config, n_coef_2fe), } } 
} impl Demeaner for MultiFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { let n_obs = self.ctx.index.n_obs; let n0 = self.ctx.index.n_groups[0]; let n1 = self.ctx.index.n_groups[1]; @@ -238,20 +235,16 @@ impl Demeaner for MultiFEDemeaner<'_> { // Phase 1: Warmup with all FEs (mu is zeros initially) let in_out_phase1 = self.ctx.scatter_to_coefficients(input); let mut projector1 = MultiFEProjector::new(self.ctx, &in_out_phase1, input); - let (iter1, converged1) = IronsTuckGrand::run( - &mut projector1, - &mut self.buffers.coef, - &mut self.multi_acc, - self.config, - self.config.iter_warmup, - ); + let (iter1, convergence1) = self + .multi_acc + .run(&mut projector1, &mut self.buffers.coef, self.config.iter_warmup); total_iter += iter1; self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); // Determine final convergence status based on which phase completes the algorithm - let converged = if converged1 { + let convergence = if convergence1 == ConvergenceState::Converged { // Early convergence in warmup phase - true + ConvergenceState::Converged } else { // Phase 2: 2-FE sub-convergence let in_out_phase2 = self.ctx.scatter_residuals(input, &self.buffers.mu); @@ -265,11 +258,9 @@ impl Demeaner for MultiFEDemeaner<'_> { let mut projector2 = TwoFEProjector::new(self.ctx, &in_out_2fe, &self.buffers.effective_input); - let (iter2, converged2) = IronsTuckGrand::run( + let (iter2, convergence2) = self.two_acc.run( &mut projector2, &mut self.buffers.coef_2fe, - &mut self.two_acc, - self.config, self.config.maxiter / 2, ); total_iter += iter2; @@ -288,25 +279,21 @@ impl Demeaner for MultiFEDemeaner<'_> { let in_out_phase3 = self.ctx.scatter_residuals(input, &self.buffers.mu); self.buffers.coef.fill(0.0); let mut projector3 = MultiFEProjector::new(self.ctx, &in_out_phase3, input); - let (iter3, converged3) = IronsTuckGrand::run( - &mut projector3, - &mut 
self.buffers.coef, - &mut self.multi_acc, - self.config, - remaining, - ); + let (iter3, convergence3) = + self.multi_acc + .run(&mut projector3, &mut self.buffers.coef, remaining); total_iter += iter3; self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); - converged3 + convergence3 } else { // No remaining iterations, use phase 2 convergence status - converged2 + convergence2 } }; // Compute output: input - mu let output: Vec = (0..n_obs).map(|i| input[i] - self.buffers.mu[i]).collect(); - (output, total_iter, converged) + (output, total_iter, convergence) } } diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs index 96ae4d757..8e7976ca2 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean_accelerated/mod.rs @@ -14,8 +14,8 @@ //! - [`projection`]: Projection operations with [`Projector`](projection::Projector) trait //! - [`TwoFEProjector`](projection::TwoFEProjector): Specialized 2-FE projection //! - [`MultiFEProjector`](projection::MultiFEProjector): General Q-FE projection -//! - [`accelerator`]: Acceleration strategies with [`Accelerator`](accelerator::Accelerator) trait -//! - [`IronsTuckGrand`](accelerator::IronsTuckGrand): Default acceleration (matches fixest) +//! - [`accelerator`]: Acceleration strategy +//! - [`IronsTuckGrand`](accelerator::IronsTuckGrand): Irons-Tuck + Grand acceleration (matches fixest) //! - [`demeaner`]: High-level solver strategies with [`Demeaner`](demeaner::Demeaner) trait //! - [`SingleFEDemeaner`](demeaner::SingleFEDemeaner): O(n) closed-form (1 FE) //! 
- [`TwoFEDemeaner`](demeaner::TwoFEDemeaner): Accelerated iteration (2 FEs) @@ -32,7 +32,7 @@ pub mod projection; pub mod types; use demeaner::{Demeaner, MultiFEDemeaner, SingleFEDemeaner, TwoFEDemeaner}; -use types::{DemeanContext, FixestConfig}; +use types::{ConvergenceState, DemeanContext, FixestConfig}; use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; @@ -64,7 +64,7 @@ impl<'a> ThreadLocalDemeaner<'a> { /// Solve the demeaning problem, reusing internal buffers. #[inline] - fn solve(&mut self, input: &[f64]) -> (Vec, usize, bool) { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { match self { ThreadLocalDemeaner::Single(d) => d.solve(input), ThreadLocalDemeaner::Two(d) => d.solve(input), @@ -108,9 +108,9 @@ pub(crate) fn demean_accelerated( // Use ndarray's column view and convert to contiguous Vec // (column() returns a non-contiguous view, to_vec() copies to contiguous) let xk: Vec = x.column(k).to_vec(); - let (result, _iter, converged) = demeaner.solve(&xk); + let (result, _iter, convergence) = demeaner.solve(&xk); - if !converged { + if convergence == ConvergenceState::NotConverged { not_converged.fetch_add(1, Ordering::SeqCst); } @@ -170,9 +170,12 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, iter, converged) = demeaner.solve(&input); + let (result, iter, convergence) = demeaner.solve(&input); - assert!(converged, "Should converge"); + assert!( + convergence == ConvergenceState::Converged, + "Should converge" + ); assert!(iter < 100, "Should converge quickly"); assert!(result.iter().all(|&v| v.is_finite())); } @@ -196,9 +199,9 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = MultiFEDemeaner::new(&ctx, &config); - let (result, _iter, converged) = demeaner.solve(&input); + let (result, _iter, convergence) = demeaner.solve(&input); - assert!(converged); + assert!(convergence == 
ConvergenceState::Converged); assert!(result.iter().all(|&v| v.is_finite())); } @@ -218,9 +221,12 @@ mod tests { let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); - let (result, iter, converged) = demeaner.solve(&input); + let (result, iter, convergence) = demeaner.solve(&input); - assert!(converged, "Single FE should always converge"); + assert!( + convergence == ConvergenceState::Converged, + "Single FE should always converge" + ); assert_eq!(iter, 0, "Single FE should be closed-form (0 iterations)"); // Verify demeaning: each group's sum should be approximately 0 @@ -263,9 +269,12 @@ mod tests { let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, converged) = demeaner.solve(&input); + let (result, _iter, convergence) = demeaner.solve(&input); - assert!(converged, "Weighted regression should converge"); + assert!( + convergence == ConvergenceState::Converged, + "Weighted regression should converge" + ); assert!( result.iter().all(|&v| v.is_finite()), "All results should be finite" @@ -289,9 +298,12 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, converged) = demeaner.solve(&input); + let (result, _iter, convergence) = demeaner.solve(&input); - assert!(converged, "Singleton groups should converge"); + assert!( + convergence == ConvergenceState::Converged, + "Singleton groups should converge" + ); // With singleton groups in FE 0, each observation's own mean is subtracted, // then adjusted for FE 1. 
The result should be all zeros since each @@ -319,9 +331,12 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, converged) = demeaner.solve(&input); + let (result, _iter, convergence) = demeaner.solve(&input); - assert!(converged, "Small groups should converge"); + assert!( + convergence == ConvergenceState::Converged, + "Small groups should converge" + ); assert!( result.iter().all(|&v| v.is_finite()), "All results should be finite" diff --git a/src/demean_accelerated/projection.rs b/src/demean_accelerated/projection.rs index f29eb3ba0..89113316f 100644 --- a/src/demean_accelerated/projection.rs +++ b/src/demean_accelerated/projection.rs @@ -19,8 +19,8 @@ //! //! # Usage with Accelerators //! -//! Projectors are used with [`Accelerator`](crate::demean_accelerated::accelerator::Accelerator) -//! implementations that handle the iteration strategy (e.g., Irons-Tuck acceleration). +//! Projectors are used with [`IronsTuckGrand`](crate::demean_accelerated::accelerator::IronsTuckGrand) +//! which handles the iteration strategy. use crate::demean_accelerated::types::DemeanContext; @@ -34,14 +34,21 @@ use crate::demean_accelerated::types::DemeanContext; /// scattered input sums, original input values, and scratch buffers. /// This makes the projection interface simple and clear. /// -/// Projectors are used with [`Accelerator`](crate::demean_accelerated::accelerator::Accelerator) -/// implementations that handle the iteration strategy. +/// Projectors are used with [`IronsTuckGrand`](crate::demean_accelerated::accelerator::IronsTuckGrand) +/// which handles the iteration strategy. /// /// # Performance /// /// All methods are called in tight loops and should be marked `#[inline(always)]`. /// Using static dispatch (`impl Projector` or generics) ensures zero overhead. pub trait Projector { + /// Total number of coefficients this projector operates on. 
+ /// + /// This defines the required size of coefficient arrays passed to + /// `project()` and `compute_ssr()`. Accelerator buffers must be + /// sized to match this value. + fn coef_len(&self) -> usize; + /// Project coefficients: coef_in → coef_out. fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]); @@ -49,6 +56,9 @@ pub trait Projector { fn compute_ssr(&mut self, coef: &[f64]) -> f64; /// Length of coefficient slice to use for convergence checking. + /// + /// This may be smaller than `coef_len()` when not all coefficients + /// need to be checked (e.g., for 2-FE only alpha is checked). fn convergence_len(&self) -> usize; } @@ -146,6 +156,11 @@ impl<'a> TwoFEProjector<'a> { } impl Projector for TwoFEProjector<'_> { + #[inline(always)] + fn coef_len(&self) -> usize { + self.ctx.index.n_groups[0] + self.ctx.index.n_groups[1] + } + #[inline(always)] fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]) { let n0 = self.ctx.index.n_groups[0]; @@ -278,6 +293,11 @@ impl<'a> MultiFEProjector<'a> { } impl Projector for MultiFEProjector<'_> { + #[inline(always)] + fn coef_len(&self) -> usize { + self.ctx.index.n_coef + } + /// Project coefficients using reverse-order FE updates. /// /// For each FE q from (n_fe-1) down to 0: diff --git a/src/demean_accelerated/types.rs b/src/demean_accelerated/types.rs index 6d70b51e4..8cd429697 100644 --- a/src/demean_accelerated/types.rs +++ b/src/demean_accelerated/types.rs @@ -426,6 +426,9 @@ pub struct FixestConfig { /// Iterations between grand acceleration steps. pub iter_grand_acc: usize, + + /// Iterations between SSR-based convergence checks. 
+ pub ssr_check_interval: usize, } impl Default for FixestConfig { @@ -442,6 +445,24 @@ impl Default for FixestConfig { iter_proj_after_acc: 40, // Grand acceleration frequency (every N iterations) iter_grand_acc: 4, + // SSR convergence check frequency + ssr_check_interval: 40, } } } + +// ============================================================================= +// ConvergenceState +// ============================================================================= + +/// Whether the iterative algorithm has converged. +/// +/// Used throughout the demeaning module to represent convergence state +/// in a self-documenting way, avoiding ambiguous boolean returns. +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum ConvergenceState { + /// Algorithm has converged; iteration can stop. + Converged, + /// Algorithm has not yet converged; continue iterating. + NotConverged, +} From c3ca14344783d9d931cb4253b722907bf596d605 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Sat, 3 Jan 2026 23:29:38 +0100 Subject: [PATCH 07/24] Wire Rust accelerated backend to Python Connect the new demean_accelerated module to Python and polish: - Wire rust backend to use demean_accelerated instead of simple demean - Fix MultiFE early convergence bug in 3+ FE demeaning - Rename scatter/gather to apply_design_matrix for clarity - Avoid per-column copy for Fortran-ordered input arrays - Add type cast guard and #[inline(always)] on hot methods --- .gitignore | 1 - benchmarks/bench_demean_r.R | 71 ---- benchmarks/bench_native_comparison.py | 209 ------------ benchmarks/demean_benchmark.py | 456 -------------------------- pyfixest/core/demean_accelerated.py | 8 +- pyfixest/estimation/backends.py | 4 +- pyfixest/estimation/demean_.py | 4 +- src/demean_accelerated/accelerator.rs | 201 +++++++----- src/demean_accelerated/demeaner.rs | 196 ++++++----- src/demean_accelerated/mod.rs | 57 ++-- src/demean_accelerated/projection.rs | 14 +- src/demean_accelerated/types.rs | 87 ++--- 
src/detect_singletons.rs | 2 +- tests/test_demean.py | 2 +- 14 files changed, 316 insertions(+), 996 deletions(-) delete mode 100644 benchmarks/bench_demean_r.R delete mode 100644 benchmarks/bench_native_comparison.py delete mode 100644 benchmarks/demean_benchmark.py diff --git a/.gitignore b/.gitignore index 899602ad4..f5378e980 100644 --- a/.gitignore +++ b/.gitignore @@ -42,4 +42,3 @@ coverage.xml # pixi environments .pixi/* !.pixi/config.toml -benchmarks/results/ diff --git a/benchmarks/bench_demean_r.R b/benchmarks/bench_demean_r.R deleted file mode 100644 index 66bdc342a..000000000 --- a/benchmarks/bench_demean_r.R +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env Rscript -# Benchmark fixest demeaning directly in R -# Usage: Rscript bench_demean_r.R [n_obs] [dgp_type] [n_fe] - -library(fixest) - -args <- commandArgs(trailingOnly = TRUE) -n_obs <- if (length(args) >= 1) as.integer(args[1]) else 100000L -dgp_type <- if (length(args) >= 2) args[2] else "difficult" -n_fe <- if (length(args) >= 3) as.integer(args[3]) else 3L - -# Use 2 threads to match fixest_benchmarks settings -setFixest_nthreads(2) - -# Generate data matching Python benchmark DGP -set.seed(42) -n_year <- 10L -n_indiv_per_firm <- 23L -n_indiv <- max(1L, round(n_obs / n_year)) -n_firm <- max(1L, round(n_indiv / n_indiv_per_firm)) - -indiv_id <- rep(1:n_indiv, each = n_year)[1:n_obs] -year <- rep(1:n_year, times = n_indiv)[1:n_obs] - -if (dgp_type == "simple") { - firm_id <- sample(1:n_firm, n_obs, replace = TRUE) -} else { - # difficult: sequential assignment - firm_id <- rep(1:n_firm, length.out = n_obs) -} - -# Generate outcome -x1 <- rnorm(n_obs) -firm_fe <- rnorm(n_firm)[firm_id] -unit_fe <- rnorm(n_indiv)[indiv_id] -year_fe <- rnorm(n_year)[year] -y <- x1 + firm_fe + unit_fe + year_fe + rnorm(n_obs) - -df <- data.frame( - y = y, - x1 = x1, - indiv_id = indiv_id, - year = year, - firm_id = firm_id -) - -# Build formula based on n_fe -if (n_fe == 2) { - fml <- y ~ x1 | indiv_id + year -} else { - 
fml <- y ~ x1 | indiv_id + year + firm_id -} - -# Warm up -invisible(feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L)) - -# Benchmark -n_runs <- 5L -times <- numeric(n_runs) - -for (i in 1:n_runs) { - start <- Sys.time() - fit <- feols(fml, data = df, notes = FALSE, warn = FALSE, nthreads = 2L) - end <- Sys.time() - times[i] <- as.numeric(end - start, units = "secs") * 1000 # ms -} - -cat(sprintf("fixest (R native) - n=%d, type=%s, %dFE\n", n_obs, dgp_type, n_fe)) -cat(sprintf(" Times (ms): %s\n", paste(round(times, 2), collapse = ", "))) -cat(sprintf(" Median: %.2f ms\n", median(times))) -cat(sprintf(" Min: %.2f ms\n", min(times))) diff --git a/benchmarks/bench_native_comparison.py b/benchmarks/bench_native_comparison.py deleted file mode 100644 index f45ffd08f..000000000 --- a/benchmarks/bench_native_comparison.py +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env python3 -""" -Benchmark comparing pyfixest feols vs native fixest feols. - -Runs fixest directly in R to avoid rpy2 overhead, then compares with pyfixest. -This is a fair apples-to-apples comparison of full feols() routines. 
-""" - -from __future__ import annotations - -import os - -# Set thread count for Rayon (pyfixest) BEFORE importing pyfixest -os.environ["RAYON_NUM_THREADS"] = "2" - -import json -import subprocess -import time -from pathlib import Path -from statistics import median - -import numpy as np -import pandas as pd - - -def generate_dgp( - n: int, - dgp_type: str = "simple", - n_years: int = 10, - n_indiv_per_firm: int = 23, -) -> pd.DataFrame: - """Generate test data matching fixest benchmark DGP.""" - np.random.seed(42) - - n_indiv = max(1, round(n / n_years)) - n_firm = max(1, round(n_indiv / n_indiv_per_firm)) - - indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] - year = np.tile(np.arange(n_years), n_indiv)[:n] - - if dgp_type == "simple": - firm_id = np.random.randint(0, n_firm, size=n) - else: # difficult - firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] - - x1 = np.random.randn(n) - firm_fe = np.random.randn(n_firm)[firm_id] - unit_fe = np.random.randn(n_indiv)[indiv_id] - year_fe = np.random.randn(n_years)[year] - y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) - - return pd.DataFrame( - { - "y": y, - "x1": x1, - "indiv_id": indiv_id, - "year": year, - "firm_id": firm_id, - } - ) - - -def run_r_benchmark(n_obs: int, dgp_type: str, n_fe: int, n_runs: int = 5) -> dict: - """Run fixest benchmark in R subprocess.""" - r_script = Path(__file__).parent / "bench_demean_r.R" - - try: - result = subprocess.run( - ["Rscript", str(r_script), str(n_obs), dgp_type, str(n_fe)], - capture_output=True, - text=True, - timeout=300, - ) - - if result.returncode != 0: - return {"error": result.stderr, "times": [], "median": float("inf")} - - # Parse output - lines = result.stdout.strip().split("\n") - median_ms = None - for line in lines: - if "Median:" in line: - median_ms = float(line.split(":")[1].strip().replace(" ms", "")) - - return { - "median": median_ms if median_ms else float("inf"), - "output": result.stdout, - } - except 
subprocess.TimeoutExpired: - return {"error": "timeout", "median": float("inf")} - except FileNotFoundError: - return {"error": "R not found", "median": float("inf")} - - -def run_pyfixest_benchmark( - df: pd.DataFrame, - n_fe: int, - n_runs: int = 5, -) -> dict: - """Run pyfixest feols benchmark.""" - import pyfixest as pf - - # Build formula matching R benchmark - if n_fe == 2: - fml = "y ~ x1 | indiv_id + year" - else: - fml = "y ~ x1 | indiv_id + year + firm_id" - - # Warmup - use rust backend for accelerated demeaning - pf.feols(fml, data=df, demeaner_backend="rust") - - times = [] - for _ in range(n_runs): - start = time.perf_counter() - fit = pf.feols(fml, data=df, demeaner_backend="rust") - elapsed = (time.perf_counter() - start) * 1000 # ms - times.append(elapsed) - - return { - "median": median(times), - "times": times, - "coef": float(fit.coef().iloc[0]), - } - - -def main(): - """Run benchmark comparing pyfixest feols vs native fixest feols.""" - configs = [ - (10_000, "simple", 2), - (10_000, "difficult", 2), - (10_000, "simple", 3), - (10_000, "difficult", 3), - (100_000, "simple", 2), - (100_000, "difficult", 2), - (100_000, "simple", 3), - (100_000, "difficult", 3), - (1_000_000, "simple", 2), - (1_000_000, "difficult", 2), - (1_000_000, "simple", 3), - (1_000_000, "difficult", 3), - ] - - results = [] - - print("=" * 70) - print("PyFixest feols() vs Fixest feols() Benchmark") - print("=" * 70) - - for n_obs, dgp_type, n_fe in configs: - print(f"\nConfig: n={n_obs:,}, type={dgp_type}, fe={n_fe}") - print("-" * 50) - - # Generate data - df = generate_dgp(n_obs, dgp_type) - - # Run R benchmark (feols) - r_result = run_r_benchmark(n_obs, dgp_type, n_fe) - r_time = r_result.get("median", float("inf")) - print(f" fixest (R): {r_time:8.2f} ms") - - # Run pyfixest benchmark (feols) - py_result = run_pyfixest_benchmark(df, n_fe) - py_time = py_result.get("median", float("inf")) - - if r_time > 0 and py_time < float("inf"): - ratio = py_time / r_time - 
print(f" pyfixest: {py_time:8.2f} ms ({ratio:.2f}x)") - else: - print(f" pyfixest: {py_time:8.2f} ms") - - results.append( - { - "n_obs": n_obs, - "dgp_type": dgp_type, - "n_fe": n_fe, - "fixest_r_ms": r_time, - "pyfixest_ms": py_time, - } - ) - - # Summary - print("\n" + "=" * 70) - print("SUMMARY (pyfixest feols vs fixest feols)") - print("=" * 70) - - print(f"{'Config':<35} {'fixest':>10} {'pyfixest':>10} {'ratio':>8}") - print("-" * 65) - - for r in results: - config = f"n={r['n_obs']:,} {r['dgp_type']:9} {r['n_fe']}FE" - fixest = r["fixest_r_ms"] - pyfixest = r["pyfixest_ms"] - - if fixest > 0 and fixest < float("inf") and pyfixest < float("inf"): - ratio = pyfixest / fixest - print(f"{config:<35} {fixest:>8.1f}ms {pyfixest:>8.1f}ms {ratio:>7.2f}x") - else: - print(f"{config:<35} {'N/A':>10} {'N/A':>10}") - - # Save results - output_path = Path(__file__).parent / "results" / "native_comparison.json" - output_path.parent.mkdir(exist_ok=True) - with open(output_path, "w") as f: - json.dump(results, f, indent=2) - print(f"\nResults saved to {output_path}") - - -if __name__ == "__main__": - main() diff --git a/benchmarks/demean_benchmark.py b/benchmarks/demean_benchmark.py deleted file mode 100644 index 6a587b75f..000000000 --- a/benchmarks/demean_benchmark.py +++ /dev/null @@ -1,456 +0,0 @@ -#!/usr/bin/env python3 -""" -Benchmark script for comparing demeaning implementations. - -Oriented on fixest_benchmarks/bench_ols.R but focused on demeaning only -and optimized for fast iteration. 
- -Usage: - python benchmarks/demean_benchmark.py # Fast mode (~30s) - python benchmarks/demean_benchmark.py --full # Full mode (~5min) - python benchmarks/demean_benchmark.py --save # Save results to JSON -""" - -from __future__ import annotations - -import argparse -import json -import os -import time -from dataclasses import dataclass -from pathlib import Path -from statistics import median -from typing import Callable - -import numpy as np - - -@dataclass -class BenchmarkConfig: - """Configuration for a single benchmark run.""" - - n_obs: int - dgp_type: str # "simple" or "difficult" - n_fe: int - n_iters: int - - -@dataclass -class BenchmarkResult: - """Result of a benchmark run.""" - - config: BenchmarkConfig - backend: str - times: list[float] - median_time: float - available: bool - error: str | None = None - - -def generate_dgp( - n: int, - dgp_type: str = "simple", - n_years: int = 10, - n_indiv_per_firm: int = 23, -) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """ - Generate data matching fixest_benchmarks DGP. 
- - Parameters - ---------- - n : int - Number of observations - dgp_type : str - "simple" (random firm assignment) or "difficult" (sequential) - n_years : int - Number of years - n_indiv_per_firm : int - Average individuals per firm - - Returns - ------- - x : np.ndarray - Feature matrix (n, 1) - flist : np.ndarray - Fixed effect IDs (n, 2 or 3) - [indiv_id, year] or [indiv_id, year, firm_id] - weights : np.ndarray - Sample weights (n,) - """ - n_indiv = max(1, round(n / n_years)) - n_firm = max(1, round(n_indiv / n_indiv_per_firm)) - - # Create FE IDs - indiv_id = np.repeat(np.arange(n_indiv), n_years)[:n] - year = np.tile(np.arange(n_years), n_indiv)[:n] - - if dgp_type == "simple": - # Random firm assignment - easier convergence - firm_id = np.random.randint(0, n_firm, size=n) - elif dgp_type == "difficult": - # Sequential firm assignment - harder convergence (messy data) - firm_id = np.tile(np.arange(n_firm), (n // n_firm) + 1)[:n] - else: - raise ValueError(f"Unknown dgp_type: {dgp_type}") - - # Generate features - x1 = np.random.randn(n) - - # Generate y with FE structure - firm_fe = np.random.randn(n_firm)[firm_id] - unit_fe = np.random.randn(n_indiv)[indiv_id] - year_fe = np.random.randn(n_years)[year] - y = x1 + firm_fe + unit_fe + year_fe + np.random.randn(n) - - # Stack into matrices - x = np.column_stack([y, x1]) # Demean both y and x1 - weights = np.ones(n) - - return x, indiv_id, year, firm_id, weights - - -def get_demean_backends() -> dict[str, Callable | None]: - """Get available demeaning backends with graceful fallbacks.""" - backends: dict[str, Callable | None] = {} - - # Rust accelerated (default) - try: - from pyfixest.core.demean import demean as demean_rust - - backends["rust-accelerated"] = demean_rust - except ImportError: - backends["rust-accelerated"] = None - - # Rust simple (via env var) - def demean_rust_simple(x, flist, weights, tol=1e-8, maxiter=100_000): - os.environ["PYFIXEST_DEMEAN_SIMPLE"] = "1" - try: - from 
pyfixest.core.demean import demean as demean_rust - - return demean_rust(x, flist, weights, tol, maxiter) - finally: - del os.environ["PYFIXEST_DEMEAN_SIMPLE"] - - backends["rust-simple"] = ( - demean_rust_simple if backends["rust-accelerated"] else None - ) - - # Numba - try: - from pyfixest.estimation.demean_ import demean as demean_numba - - backends["numba"] = demean_numba - except ImportError: - backends["numba"] = None - - # CuPy 32-bit - try: - from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32 - - backends["cupy32"] = demean_cupy32 - except ImportError: - backends["cupy32"] = None - - # CuPy 64-bit - try: - from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy64 - - backends["cupy64"] = demean_cupy64 - except ImportError: - backends["cupy64"] = None - - # R fixest via rpy2 - use feols with only FE (no covariates) to measure demean time - try: - import pandas as pd - import rpy2.robjects as ro - from rpy2.robjects import numpy2ri, pandas2ri - from rpy2.robjects.packages import importr - - numpy2ri.activate() - pandas2ri.activate() - importr("fixest") # Load fixest package - - def demean_fixest(x, flist, weights, tol=1e-8, maxiter=100_000): - # Create a minimal regression problem that exercises the demeaning - _n, k = x.shape - n_fe = flist.shape[1] if flist.ndim > 1 else 1 - - # Build a dataframe with y and FE columns - data = {"y": x[:, 0]} - fe_names = [] - for j in range(n_fe): - fe_col = f"fe{j + 1}" - fe_names.append(fe_col) - if flist.ndim > 1: - data[fe_col] = flist[:, j].astype(int) - else: - data[fe_col] = flist.astype(int) - - df = pd.DataFrame(data) - r_df = pandas2ri.py2rpy(df) - - # Build formula: y ~ 1 | fe1 + fe2 + ... 
- fe_formula = " + ".join(fe_names) - formula = f"y ~ 1 | {fe_formula}" - - # Call feols (this includes demeaning time) - ro.r.assign("df", r_df) - ro.r(f"result <- fixest::feols({formula}, data=df, nthreads=1)") - - # Return the residuals as "demeaned" values - resid = np.array(ro.r("residuals(result)")) - result = np.column_stack([resid] + [x[:, j] for j in range(1, k)]) - return result, True - - backends["fixest"] = demean_fixest - except (ImportError, Exception): - backends["fixest"] = None - - return backends - - -def run_single_benchmark( - demean_func: Callable, - x: np.ndarray, - flist: np.ndarray, - weights: np.ndarray, - n_iters: int, -) -> list[float]: - """Run a single benchmark configuration multiple times.""" - times = [] - - for _ in range(n_iters): - # Copy arrays to avoid caching effects - x_copy = x.copy() - - start = time.perf_counter() - demean_func(x_copy, flist, weights) - elapsed = time.perf_counter() - start - - times.append(elapsed) - - return times - - -def run_benchmarks( - configs: list[BenchmarkConfig], - backends: dict[str, Callable | None], -) -> list[BenchmarkResult]: - """Run all benchmark configurations across all backends.""" - results = [] - - for config in configs: - print(f"\n{'=' * 60}") - print(f"Config: n={config.n_obs:,}, type={config.dgp_type}, fe={config.n_fe}") - print("=" * 60) - - # Generate data - x, indiv_id, year, firm_id, weights = generate_dgp( - config.n_obs, config.dgp_type - ) - - # Build flist based on n_fe - if config.n_fe == 2: - flist = np.column_stack([indiv_id, year]).astype(np.uint64) - else: # n_fe == 3 - flist = np.column_stack([indiv_id, year, firm_id]).astype(np.uint64) - - for backend_name, demean_func in backends.items(): - if demean_func is None: - result = BenchmarkResult( - config=config, - backend=backend_name, - times=[], - median_time=float("inf"), - available=False, - error="Not installed", - ) - results.append(result) - print(f" {backend_name:20s}: not available") - continue - - try: - 
times = run_single_benchmark( - demean_func, x, flist, weights, config.n_iters - ) - med_time = median(times) - result = BenchmarkResult( - config=config, - backend=backend_name, - times=times, - median_time=med_time, - available=True, - ) - results.append(result) - print( - f" {backend_name:20s}: {med_time * 1000:8.2f} ms (median of {len(times)})" - ) - except Exception as e: - result = BenchmarkResult( - config=config, - backend=backend_name, - times=[], - median_time=float("inf"), - available=False, - error=str(e), - ) - results.append(result) - print(f" {backend_name:20s}: ERROR - {e}") - - return results - - -def print_summary(results: list[BenchmarkResult]) -> None: - """Print a summary table of results.""" - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - # Group by config - configs = sorted( - set((r.config.n_obs, r.config.dgp_type, r.config.n_fe) for r in results) - ) - - backends = sorted(set(r.backend for r in results)) - - # Header - header = f"{'Config':30s}" - for backend in backends: - header += f" {backend:>12s}" - print(header) - print("-" * len(header)) - - # Find fixest baseline for relative comparison - fixest_times = {} - for r in results: - if r.backend == "fixest" and r.available: - key = (r.config.n_obs, r.config.dgp_type, r.config.n_fe) - fixest_times[key] = r.median_time - - # Rows - for n_obs, dgp_type, n_fe in configs: - config_str = f"n={n_obs:,} {dgp_type:9s} {n_fe}FE" - row = f"{config_str:30s}" - - key = (n_obs, dgp_type, n_fe) - baseline = fixest_times.get(key) - - for backend in backends: - matching = [ - r - for r in results - if r.config.n_obs == n_obs - and r.config.dgp_type == dgp_type - and r.config.n_fe == n_fe - and r.backend == backend - ] - if matching and matching[0].available: - time_ms = matching[0].median_time * 1000 - if baseline and backend != "fixest": - ratio = matching[0].median_time / baseline - row += f" {time_ms:7.1f}ms({ratio:.1f}x)" - else: - row += f" {time_ms:12.1f}ms" - else: - row += f" 
{'N/A':>12s}" - - print(row) - - -def save_results(results: list[BenchmarkResult], path: Path) -> None: - """Save results to JSON.""" - data = [] - for r in results: - data.append( - { - "n_obs": r.config.n_obs, - "dgp_type": r.config.dgp_type, - "n_fe": r.config.n_fe, - "n_iters": r.config.n_iters, - "backend": r.backend, - "times": r.times, - "median_time": r.median_time if r.median_time != float("inf") else None, - "available": r.available, - "error": r.error, - } - ) - - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "w") as f: - json.dump(data, f, indent=2) - print(f"\nResults saved to {path}") - - -def main(): - """Run demeaning benchmarks.""" - parser = argparse.ArgumentParser(description="Benchmark demeaning implementations") - parser.add_argument( - "--full", action="store_true", help="Run full benchmark (slower)" - ) - parser.add_argument("--save", action="store_true", help="Save results to JSON") - parser.add_argument( - "--output", - type=Path, - default=Path("benchmarks/results/benchmark.json"), - help="Output path for results", - ) - args = parser.parse_args() - - # Define configurations - if args.full: - configs = [ - # Small (fast) - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), - # Medium - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), - # Large - BenchmarkConfig(n_obs=500_000, dgp_type="simple", n_fe=2, n_iters=2), - BenchmarkConfig(n_obs=500_000, dgp_type="difficult", n_fe=2, n_iters=2), - BenchmarkConfig(n_obs=1_000_000, dgp_type="simple", n_fe=2, 
n_iters=1), - BenchmarkConfig(n_obs=1_000_000, dgp_type="difficult", n_fe=2, n_iters=1), - ] - else: - # Fast mode - minimal configs for quick iteration - configs = [ - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=2, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="simple", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=10_000, dgp_type="difficult", n_fe=3, n_iters=5), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=2, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="simple", n_fe=3, n_iters=3), - BenchmarkConfig(n_obs=100_000, dgp_type="difficult", n_fe=3, n_iters=3), - ] - - print("Demeaning Benchmark") - print("=" * 60) - print(f"Mode: {'full' if args.full else 'fast'}") - print(f"Configurations: {len(configs)}") - - # Get available backends - backends = get_demean_backends() - available = [name for name, func in backends.items() if func is not None] - unavailable = [name for name, func in backends.items() if func is None] - - print(f"Available backends: {', '.join(available)}") - if unavailable: - print(f"Unavailable backends: {', '.join(unavailable)}") - - # Run benchmarks - results = run_benchmarks(configs, backends) - - # Print summary - print_summary(results) - - # Save if requested - if args.save: - save_results(results, args.output) - - -if __name__ == "__main__": - main() diff --git a/pyfixest/core/demean_accelerated.py b/pyfixest/core/demean_accelerated.py index 1121463e3..a55dda72d 100644 --- a/pyfixest/core/demean_accelerated.py +++ b/pyfixest/core/demean_accelerated.py @@ -70,4 +70,10 @@ def demean_accelerated( print(pf.feols(fml, data).coef()) ``` """ - return _demean_accelerated_rs(x, flist.astype(np.uint64), weights, tol, maxiter) + return _demean_accelerated_rs( + x.astype(np.float64, copy=False), + flist.astype(np.uint64, copy=False), + weights.astype(np.float64, 
copy=False), + tol, + maxiter, + ) diff --git a/pyfixest/estimation/backends.py b/pyfixest/estimation/backends.py index e80a5c3db..51f9891b5 100644 --- a/pyfixest/estimation/backends.py +++ b/pyfixest/estimation/backends.py @@ -1,6 +1,6 @@ from pyfixest.core.collinear import find_collinear_variables from pyfixest.core.crv1 import crv1_meat_loop -from pyfixest.core.demean import demean +from pyfixest.core.demean_accelerated import demean_accelerated from pyfixest.core.nested_fixed_effects import count_fixef_fully_nested_all from pyfixest.estimation.demean_ import demean as demean_nb from pyfixest.estimation.numba.find_collinear_variables_nb import ( @@ -53,7 +53,7 @@ "nonnested": count_fixef_fully_nested_all_nb, }, "rust": { - "demean": demean, + "demean": demean_accelerated, "collinear": find_collinear_variables, "crv1_meat": crv1_meat_loop, "nonnested": count_fixef_fully_nested_all, diff --git a/pyfixest/estimation/demean_.py b/pyfixest/estimation/demean_.py index d05ecc885..0354e454a 100644 --- a/pyfixest/estimation/demean_.py +++ b/pyfixest/estimation/demean_.py @@ -346,9 +346,9 @@ def _set_demeaner_backend( If the demeaning backend is not supported. """ if demeaner_backend == "rust": - from pyfixest.core.demean import demean as demean_rs + from pyfixest.core.demean_accelerated import demean_accelerated - return demean_rs + return demean_accelerated elif demeaner_backend == "numba": return demean elif demeaner_backend == "jax": diff --git a/src/demean_accelerated/accelerator.rs b/src/demean_accelerated/accelerator.rs index 0350e7786..f259be8a7 100644 --- a/src/demean_accelerated/accelerator.rs +++ b/src/demean_accelerated/accelerator.rs @@ -42,13 +42,13 @@ enum GrandPhase { /// collecting snapshots every `iter_grand_acc` iterations to capture long-range /// convergence patterns. enum GrandStepResult { - /// Continue with the next phase of snapshot collection. + /// Continue with the next phase of the snapshot collection. 
Continue(GrandPhase), /// Grand acceleration detected convergence; iteration can stop. Done(ConvergenceState), } -/// Buffers for Irons-Tuck + Grand acceleration. +/// Buffers for Irons-Tuck with Grand acceleration. /// /// # Regular Irons-Tuck buffers /// @@ -107,11 +107,11 @@ impl IronsTuckGrandBuffers { /// 1. **Irons-Tuck**: After computing G(x) and G(G(x)), extrapolates to estimate /// the fixed point directly using the formula from Irons & Tuck (1969). /// -/// 2. **Grand acceleration**: Every `iter_grand_acc` iterations, applies Irons-Tuck +/// 2. **Grand acceleration**: Every `iter_grand_acc` iteration applies Irons-Tuck /// at a coarser level to accelerate long-range convergence. /// /// Additionally, SSR (sum of squared residuals) is checked every `ssr_check_interval` -/// iterations as a secondary convergence criterion. +/// iteration as a secondary convergence criterion. pub struct IronsTuckGrand { /// Algorithm configuration (tolerance, iteration parameters). config: FixestConfig, @@ -134,7 +134,7 @@ impl IronsTuckGrand { /// # Arguments /// /// * `projector` - The projection operation to accelerate - /// * `coef` - Initial coefficients (modified in place with final result) + /// * `coef` - Initial coefficients (modified in place with the final result) /// * `max_iter` - Maximum iterations before giving up /// /// # Returns @@ -146,7 +146,6 @@ impl IronsTuckGrand { coef: &mut [f64], max_iter: usize, ) -> (usize, ConvergenceState) { - // Verify buffer size matches projector's coefficient count debug_assert_eq!( self.buffers.gx.len(), projector.coef_len(), @@ -155,87 +154,142 @@ impl IronsTuckGrand { projector.coef_len() ); - let conv_len = projector.convergence_len(); - - // Initial projection - projector.project(coef, &mut self.buffers.gx); + // Initial projection and convergence check + let conv = self.project_and_check(projector, coef); + if conv == ConvergenceState::Converged { + return self.finalize_output(coef, 0, conv); + } - let mut convergence 
= if Self::should_continue( - &coef[..conv_len], - &self.buffers.gx[..conv_len], - self.config.tol, - ) { - ConvergenceState::NotConverged - } else { - ConvergenceState::Converged - }; - let mut iter = 0; let mut grand_phase = GrandPhase::default(); let mut ssr = 0.0; - while convergence == ConvergenceState::NotConverged && iter < max_iter { - iter += 1; - - // Double projection for Irons-Tuck: G(G(x)) - projector.project(&self.buffers.gx, &mut self.buffers.ggx); - - // Irons-Tuck acceleration - let accel_convergence = Self::accelerate( - &mut coef[..conv_len], - &self.buffers.gx[..conv_len], - &self.buffers.ggx[..conv_len], - ); - if accel_convergence == ConvergenceState::Converged { - convergence = ConvergenceState::Converged; - break; + for iter in 1..=max_iter { + // Core acceleration step + let conv = self.acceleration_step_check(projector, coef, iter); + if conv == ConvergenceState::Converged { + return self.finalize_output(coef, iter, conv); } - // Post-acceleration projection (after warmup) - if iter >= self.config.iter_proj_after_acc { - self.buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); - projector.project(&self.buffers.temp, coef); - } - - // Update gx for convergence check - projector.project(coef, &mut self.buffers.gx); - convergence = if Self::should_continue( - &coef[..conv_len], - &self.buffers.gx[..conv_len], - self.config.tol, - ) { - ConvergenceState::NotConverged - } else { - ConvergenceState::Converged - }; - // Grand acceleration (every iter_grand_acc iterations) if iter % self.config.iter_grand_acc == 0 { - match self.grand_acceleration_step(grand_phase, projector, conv_len) { - GrandStepResult::Continue(next) => grand_phase = next, - GrandStepResult::Done(state) => { - convergence = state; - break; - } + let conv = self.grand_acceleration_check(projector, &mut grand_phase); + if conv == ConvergenceState::Converged { + return self.finalize_output(coef, iter, conv); } } // SSR convergence check (every ssr_check_interval 
iterations) if iter % self.config.ssr_check_interval == 0 { - let ssr_old = ssr; - ssr = projector.compute_ssr(&self.buffers.gx); - - if iter > self.config.ssr_check_interval - && Self::converged(ssr_old, ssr, self.config.tol) - { - convergence = ConvergenceState::Converged; - break; + let conv = self.ssr_convergence_check(projector, iter, &mut ssr); + if conv == ConvergenceState::Converged { + return self.finalize_output(coef, iter, conv); } } } + self.finalize_output(coef, max_iter, ConvergenceState::NotConverged) + } - // Copy final result + /// Copy converged coefficients to the output buffer. + /// + /// This method should be called after `run()` has completed to retrieve + /// the final coefficients from the internal `gx` buffer. + #[inline] + fn finalize_output(&self, coef: &mut [f64], + iter: usize, + convergence: ConvergenceState,) -> (usize, ConvergenceState) { coef.copy_from_slice(&self.buffers.gx); (iter, convergence) + + } + + /// Perform the core Irons-Tuck acceleration step. + /// + /// Returns `Converged` if convergence detected, `NotConverged` to continue. + #[inline] + fn acceleration_step_check( + &mut self, + projector: &mut P, + coef: &mut [f64], + iter: usize, + ) -> ConvergenceState { + let conv_len = projector.convergence_len(); + + // Double projection for Irons-Tuck: G(G(x)) + projector.project(&self.buffers.gx, &mut self.buffers.ggx); + + // Irons-Tuck acceleration + if Self::accelerate( + &mut coef[..conv_len], + &self.buffers.gx[..conv_len], + &self.buffers.ggx[..conv_len], + ) == ConvergenceState::Converged + { + return ConvergenceState::Converged; + } + + // Post-acceleration projection (after warmup) + if iter >= self.config.iter_proj_after_acc { + self.buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); + projector.project(&self.buffers.temp, coef); + } + + // Update gx and check coefficient convergence + self.project_and_check(projector, coef) + } + + /// Perform grand acceleration and check for convergence. 
+ #[inline] + fn grand_acceleration_check( + &mut self, + projector: &mut P, + grand_phase: &mut GrandPhase, + ) -> ConvergenceState { + match self.grand_acceleration_step(projector, *grand_phase) { + GrandStepResult::Continue(next) => { + *grand_phase = next; + ConvergenceState::NotConverged + } + GrandStepResult::Done(state) => state, + } + } + + /// Check SSR-based convergence. + #[inline] + fn ssr_convergence_check( + &self, + projector: &mut P, + iter: usize, + ssr: &mut f64, + ) -> ConvergenceState { + let ssr_old = *ssr; + *ssr = projector.compute_ssr(&self.buffers.gx); + + if iter > self.config.ssr_check_interval && Self::converged(ssr_old, *ssr, self.config.tol) + { + ConvergenceState::Converged + } else { + ConvergenceState::NotConverged + } + } + + /// Project coefficients and check for convergence. + #[inline] + fn project_and_check( + &mut self, + projector: &mut P, + coef: &[f64], + ) -> ConvergenceState { + projector.project(coef, &mut self.buffers.gx); + let conv_len = projector.convergence_len(); + if Self::should_continue( + &coef[..conv_len], + &self.buffers.gx[..conv_len], + self.config.tol, + ) { + ConvergenceState::NotConverged + } else { + ConvergenceState::Converged + } } /// Apply Irons-Tuck acceleration to speed up convergence. 
@@ -294,10 +348,10 @@ impl IronsTuckGrand { #[inline] fn grand_acceleration_step( &mut self, - phase: GrandPhase, projector: &mut P, - conv_len: usize, + phase: GrandPhase, ) -> GrandStepResult { + let conv_len = projector.convergence_len(); match phase { GrandPhase::Collect1st => { self.buffers.y[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); @@ -382,17 +436,14 @@ mod tests { let n1 = ctx.index.n_groups[1]; let n_coef = n0 + n1; - let in_out = ctx.scatter_to_coefficients(&input); + let in_out = ctx.apply_design_matrix_t(&input); let mut coef = vec![0.0; n_coef]; let mut accelerator = IronsTuckGrand::new(config, n_coef); let mut projector = TwoFEProjector::new(&ctx, &in_out, &input); let (iter, convergence) = accelerator.run(&mut projector, &mut coef, maxiter); - assert!( - convergence == ConvergenceState::Converged, - "IronsTuckGrand should converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "IronsTuckGrand should converge"); assert!(iter < 100, "Should converge in less than 100 iterations"); } } diff --git a/src/demean_accelerated/demeaner.rs b/src/demean_accelerated/demeaner.rs index d822dc326..7508c3283 100644 --- a/src/demean_accelerated/demeaner.rs +++ b/src/demean_accelerated/demeaner.rs @@ -56,10 +56,9 @@ impl<'a> SingleFEDemeaner<'a> { impl Demeaner for SingleFEDemeaner<'_> { fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { let n_obs = self.ctx.index.n_obs; - let output = vec![0.0; n_obs]; - // Scatter input to coefficient space (sum of input per group) - let in_out = self.ctx.scatter_residuals(input, &output); + // Apply Dᵀ to get coefficient-space sums + let in_out = self.ctx.apply_design_matrix_t(input); let fe0 = self.ctx.index.group_ids_for_fe(0); let group_weights = self.ctx.group_weights_for_fe(0); @@ -89,7 +88,7 @@ impl Demeaner for SingleFEDemeaner<'_> { pub struct TwoFEDemeaner<'a> { ctx: &'a DemeanContext, config: &'a FixestConfig, - /// Coefficient array [alpha | beta], reused across solves + 
/// Coefficient array [alpha | beta], reused across calls to solve coef: Vec, /// Accelerator with internal buffers, reused across solves accelerator: IronsTuckGrand, @@ -117,13 +116,13 @@ impl Demeaner for TwoFEDemeaner<'_> { let n_obs = self.ctx.index.n_obs; let n0 = self.ctx.index.n_groups[0]; - // Scatter input to coefficient space - let in_out = self.ctx.scatter_to_coefficients(input); + // Apply Dᵀ to get coefficient-space sums + let in_out = self.ctx.apply_design_matrix_t(input); - // Reset coefficient array for this solve + // Reset coefficient array for this call to solve self.coef.fill(0.0); - // Create projector (lightweight, references in_out and input) + // Create the projector (lightweight, references in_out and input) let mut projector = TwoFEProjector::new(self.ctx, &in_out, input); // Run acceleration loop @@ -219,81 +218,126 @@ impl<'a> MultiFEDemeaner<'a> { two_acc: IronsTuckGrand::new(*config, n_coef_2fe), } } -} -impl Demeaner for MultiFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { + /// Phase 1: Warmup with all FEs to get initial estimates. + fn warmup_phase(&mut self, input: &[f64]) -> (usize, ConvergenceState) { + let in_out = self.ctx.apply_design_matrix_t(input); + let mut projector = MultiFEProjector::new(self.ctx, &in_out, input); + + let (iter, convergence) = self + .multi_acc + .run(&mut projector, &mut self.buffers.coef, self.config.iter_warmup); + + self.ctx.apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); + (iter, convergence) + } + + /// Phase 2: Fast 2-FE sub-convergence on the first two fixed effects. 
+ fn two_fe_convergence_phase(&mut self, input: &[f64]) -> (usize, ConvergenceState) { let n_obs = self.ctx.index.n_obs; let n0 = self.ctx.index.n_groups[0]; let n1 = self.ctx.index.n_groups[1]; let n_coef_2fe = n0 + n1; - let mut total_iter = 0usize; - // Reset buffers for this solve + // Compute residuals: input - mu + for i in 0..n_obs { + self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; + } + + // Apply Dᵀ to residuals (only need first 2 FEs) + let in_out_full = self.ctx.apply_design_matrix_t(&self.buffers.effective_input); + let in_out_2fe: Vec = in_out_full[..n_coef_2fe].to_vec(); + + // Run 2-FE acceleration + self.buffers.coef_2fe.fill(0.0); + let mut projector = + TwoFEProjector::new(self.ctx, &in_out_2fe, &self.buffers.effective_input); + let (iter, convergence) = self.two_acc.run( + &mut projector, + &mut self.buffers.coef_2fe, + self.config.maxiter / 2, + ); + + // Add 2-FE coefficients to mu + self.add_2fe_coefficients_to_mu(); + (iter, convergence) + } + + /// Phase 3: Final re-acceleration with all FEs. + fn reacceleration_phase( + &mut self, + input: &[f64], + used_iter: usize, + ) -> (usize, ConvergenceState) { + let remaining = self.config.maxiter.saturating_sub(used_iter); + if remaining == 0 { + return (0, ConvergenceState::NotConverged); + } + + // Compute residuals: input - mu + for i in 0..self.ctx.index.n_obs { + self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; + } + + let in_out = self.ctx.apply_design_matrix_t(&self.buffers.effective_input); + self.buffers.coef.fill(0.0); + + let mut projector = MultiFEProjector::new(self.ctx, &in_out, input); + let (iter, convergence) = + self.multi_acc + .run(&mut projector, &mut self.buffers.coef, remaining); + + self.ctx.apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); + (iter, convergence) + } + + /// Add 2-FE coefficients to the accumulated mu buffer. 
+ fn add_2fe_coefficients_to_mu(&mut self) { + let n0 = self.ctx.index.n_groups[0]; + let fe0 = self.ctx.index.group_ids_for_fe(0); + let fe1 = self.ctx.index.group_ids_for_fe(1); + + for i in 0..self.ctx.index.n_obs { + self.buffers.mu[i] += + self.buffers.coef_2fe[fe0[i]] + self.buffers.coef_2fe[n0 + fe1[i]]; + } + } + + /// Compute final output and return result tuple. + fn finalize_output( + &self, + input: &[f64], + iter: usize, + convergence: ConvergenceState, + ) -> (Vec, usize, ConvergenceState) { + let output: Vec = input + .iter() + .zip(self.buffers.mu.iter()) + .map(|(&x, &mu)| x - mu) + .collect(); + (output, iter, convergence) + } +} + +impl Demeaner for MultiFEDemeaner<'_> { + fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { self.buffers.reset(); - // Phase 1: Warmup with all FEs (mu is zeros initially) - let in_out_phase1 = self.ctx.scatter_to_coefficients(input); - let mut projector1 = MultiFEProjector::new(self.ctx, &in_out_phase1, input); - let (iter1, convergence1) = self - .multi_acc - .run(&mut projector1, &mut self.buffers.coef, self.config.iter_warmup); - total_iter += iter1; - self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); - - // Determine final convergence status based on which phase completes the algorithm - let convergence = if convergence1 == ConvergenceState::Converged { - // Early convergence in warmup phase - ConvergenceState::Converged - } else { - // Phase 2: 2-FE sub-convergence - let in_out_phase2 = self.ctx.scatter_residuals(input, &self.buffers.mu); - self.buffers.coef_2fe.fill(0.0); - let in_out_2fe: Vec = in_out_phase2[..n_coef_2fe].to_vec(); - - // Compute effective input: input - mu - for i in 0..n_obs { - self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; - } - - let mut projector2 = - TwoFEProjector::new(self.ctx, &in_out_2fe, &self.buffers.effective_input); - let (iter2, convergence2) = self.two_acc.run( - &mut projector2, - &mut self.buffers.coef_2fe, - 
self.config.maxiter / 2, - ); - total_iter += iter2; - - // Add 2-FE coefficients to mu - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); - for i in 0..n_obs { - self.buffers.mu[i] += - self.buffers.coef_2fe[fe0[i]] + self.buffers.coef_2fe[n0 + fe1[i]]; - } - - // Phase 3: Re-acceleration with all FEs (unless 2-FE converged fully) - let remaining = self.config.maxiter.saturating_sub(total_iter); - if remaining > 0 { - let in_out_phase3 = self.ctx.scatter_residuals(input, &self.buffers.mu); - self.buffers.coef.fill(0.0); - let mut projector3 = MultiFEProjector::new(self.ctx, &in_out_phase3, input); - let (iter3, convergence3) = - self.multi_acc - .run(&mut projector3, &mut self.buffers.coef, remaining); - total_iter += iter3; - self.ctx.gather_and_add(&self.buffers.coef, &mut self.buffers.mu); - convergence3 - } else { - // No remaining iterations, use phase 2 convergence status - convergence2 - } - }; - - // Compute output: input - mu - let output: Vec = (0..n_obs).map(|i| input[i] - self.buffers.mu[i]).collect(); - - (output, total_iter, convergence) + // Phase 1: Warmup with all FEs + let (iter1, conv1) = self.warmup_phase(input); + if conv1 == ConvergenceState::Converged { + return self.finalize_output(input, iter1, conv1); + } + + // Phase 2: 2-FE sub-convergence (refines only first 2 FEs) + // Note: Don't return early on Phase 2 convergence! + // Phase 2 only refines the first 2 FEs. The 3rd+ FEs still need Phase 3. + let (iter2, _conv2) = self.two_fe_convergence_phase(input); + let total_iter = iter1 + iter2; + + // Phase 3: Re-acceleration with all FEs + let (iter3, conv3) = self.reacceleration_phase(input, total_iter); + + self.finalize_output(input, total_iter + iter3, conv3) } } diff --git a/src/demean_accelerated/mod.rs b/src/demean_accelerated/mod.rs index 8e7976ca2..689776030 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean_accelerated/mod.rs @@ -9,21 +9,21 @@ //! - [`types`]: Core data types //! 
- [`FixedEffectsIndex`](types::FixedEffectsIndex): Fixed effects indexing (which obs belongs to which group) //! - [`ObservationWeights`](types::ObservationWeights): Observation weights and group-level aggregations -//! - [`DemeanContext`](types::DemeanContext): Combines index + weights for demeaning operations -//! - [`FixestConfig`](types::FixestConfig): Algorithm parameters +//! - [`DemeanContext`](DemeanContext): Combines index and weights for demeaning operations +//! - [`FixestConfig`](FixestConfig): Algorithm parameters //! - [`projection`]: Projection operations with [`Projector`](projection::Projector) trait //! - [`TwoFEProjector`](projection::TwoFEProjector): Specialized 2-FE projection //! - [`MultiFEProjector`](projection::MultiFEProjector): General Q-FE projection //! - [`accelerator`]: Acceleration strategy //! - [`IronsTuckGrand`](accelerator::IronsTuckGrand): Irons-Tuck + Grand acceleration (matches fixest) -//! - [`demeaner`]: High-level solver strategies with [`Demeaner`](demeaner::Demeaner) trait -//! - [`SingleFEDemeaner`](demeaner::SingleFEDemeaner): O(n) closed-form (1 FE) -//! - [`TwoFEDemeaner`](demeaner::TwoFEDemeaner): Accelerated iteration (2 FEs) -//! - [`MultiFEDemeaner`](demeaner::MultiFEDemeaner): Multi-phase strategy (3+ FEs) +//! - [`demeaner`]: High-level solver strategies with [`Demeaner`](Demeaner) trait +//! - [`SingleFEDemeaner`](SingleFEDemeaner): O(n) closed-form (1 FE) +//! - [`TwoFEDemeaner`](TwoFEDemeaner): Accelerated iteration (2 FEs) +//! - [`MultiFEDemeaner`](MultiFEDemeaner): Multi-phase strategy (3+ FEs) //! -//! # Dispatching based on number of fixed effects: +//! # Dispatching based on the number of fixed effects: //! - 1 FE: O(n) closed-form solution (single pass, no iteration) -//! - 2 FE: Coefficient-space iteration with Irons-Tuck + Grand acceleration +//! - 2 FE: Coefficient-space iteration with Irons-Tuck and Grand acceleration //! 
- 3+ FE: Multi-phase strategy with 2-FE sub-convergence pub mod accelerator; @@ -101,14 +101,18 @@ pub(crate) fn demean_accelerated( .into_par_iter() .enumerate() .for_each_init( - // Init closure: called once per thread to create thread-local state + // Init closure: called once per thread to create the thread-local state || ThreadLocalDemeaner::new(&ctx, &config), // Body closure: called for each column, reusing thread-local state |demeaner, (k, mut col)| { - // Use ndarray's column view and convert to contiguous Vec - // (column() returns a non-contiguous view, to_vec() copies to contiguous) - let xk: Vec = x.column(k).to_vec(); - let (result, _iter, convergence) = demeaner.solve(&xk); + let col_view = x.column(k); + // Zero-copy if column is contiguous (F-order), otherwise copy + let (result, _iter, convergence) = if let Some(slice) = col_view.as_slice() { + demeaner.solve(slice) + } else { + let xk: Vec = col_view.to_vec(); + demeaner.solve(&xk) + }; if convergence == ConvergenceState::NotConverged { not_converged.fetch_add(1, Ordering::SeqCst); @@ -172,10 +176,7 @@ mod tests { let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let (result, iter, convergence) = demeaner.solve(&input); - assert!( - convergence == ConvergenceState::Converged, - "Should converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "Should converge"); assert!(iter < 100, "Should converge quickly"); assert!(result.iter().all(|&v| v.is_finite())); } @@ -201,7 +202,7 @@ mod tests { let mut demeaner = MultiFEDemeaner::new(&ctx, &config); let (result, _iter, convergence) = demeaner.solve(&input); - assert!(convergence == ConvergenceState::Converged); + assert_eq!(convergence, ConvergenceState::Converged); assert!(result.iter().all(|&v| v.is_finite())); } @@ -223,10 +224,7 @@ mod tests { let mut demeaner = SingleFEDemeaner::new(&ctx); let (result, iter, convergence) = demeaner.solve(&input); - assert!( - convergence == ConvergenceState::Converged, - "Single FE should always 
converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "Single FE should always converge"); assert_eq!(iter, 0, "Single FE should be closed-form (0 iterations)"); // Verify demeaning: each group's sum should be approximately 0 @@ -271,10 +269,7 @@ mod tests { let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let (result, _iter, convergence) = demeaner.solve(&input); - assert!( - convergence == ConvergenceState::Converged, - "Weighted regression should converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "Weighted regression should converge"); assert!( result.iter().all(|&v| v.is_finite()), "All results should be finite" @@ -300,10 +295,7 @@ mod tests { let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let (result, _iter, convergence) = demeaner.solve(&input); - assert!( - convergence == ConvergenceState::Converged, - "Singleton groups should converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "Singleton groups should converge"); // With singleton groups in FE 0, each observation's own mean is subtracted, // then adjusted for FE 1. The result should be all zeros since each @@ -333,10 +325,7 @@ mod tests { let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let (result, _iter, convergence) = demeaner.solve(&input); - assert!( - convergence == ConvergenceState::Converged, - "Small groups should converge" - ); + assert_eq!(convergence, ConvergenceState::Converged, "Small groups should converge"); assert!( result.iter().all(|&v| v.is_finite()), "All results should be finite" diff --git a/src/demean_accelerated/projection.rs b/src/demean_accelerated/projection.rs index 89113316f..8c27ee8d8 100644 --- a/src/demean_accelerated/projection.rs +++ b/src/demean_accelerated/projection.rs @@ -52,10 +52,10 @@ pub trait Projector { /// Project coefficients: coef_in → coef_out. fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]); - /// Compute sum of squared residuals for the given coefficients. 
+ /// Compute the sum of squared residuals for the given coefficients. fn compute_ssr(&mut self, coef: &[f64]) -> f64; - /// Length of coefficient slice to use for convergence checking. + /// Length of the coefficient slice to use for convergence checking. /// /// This may be smaller than `coef_len()` when not all coefficients /// need to be checked (e.g., for 2-FE only alpha is checked). @@ -99,7 +99,7 @@ impl<'a> TwoFEProjector<'a> { /// /// For each group g1 in FE1: /// beta[g1] = (in_out[g1] - Σ alpha[g0] * w) / group_weight[g1] - #[inline] + #[inline(always)] fn compute_beta_from_alpha(&mut self, alpha: &[f64]) { let n0 = self.ctx.index.n_groups[0]; let n1 = self.ctx.index.n_groups[1]; @@ -129,7 +129,7 @@ impl<'a> TwoFEProjector<'a> { /// /// For each group g0 in FE0: /// alpha[g0] = (in_out[g0] - Σ beta[g1] * w) / group_weight[g0] - #[inline] + #[inline(always)] fn compute_alpha_from_beta(&self, alpha_out: &mut [f64]) { let n0 = self.ctx.index.n_groups[0]; let fe0 = self.ctx.index.group_ids_for_fe(0); @@ -242,7 +242,7 @@ impl<'a> MultiFEProjector<'a> { /// Accumulate coefficient contributions from one FE into the scratch buffer. /// /// For each observation i: scratch[i] += coef[start + fe[i]] - #[inline] + #[inline(always)] fn accumulate_fe_contributions(&mut self, fe_idx: usize, coef: &[f64]) { let start = self.ctx.index.coef_start[fe_idx]; let fe = self.ctx.index.group_ids_for_fe(fe_idx); @@ -256,7 +256,7 @@ impl<'a> MultiFEProjector<'a> { /// /// For each group g in FE q: /// coef_out[g] = (in_out[g] - Σ scratch[i] * w) / group_weight[g] - #[inline] + #[inline(always)] fn update_fe_coefficients(&self, fe_idx: usize, coef_out: &mut [f64]) { let start = self.ctx.index.coef_start[fe_idx]; let n_groups = self.ctx.index.n_groups[fe_idx]; @@ -337,7 +337,7 @@ impl Projector for MultiFEProjector<'_> { // This improves cache locality because: // 1. group_ids_for_fe(q) returns a contiguous slice for FE q // 2. We access the scratch buffer sequentially - // 3. 
The coefficient array (typically small) stays in cache + // 3. The coefficient array (typically small) stays in the cache // Accumulate coefficient sums per observation using the scratch buffer self.scratch.fill(0.0); diff --git a/src/demean_accelerated/types.rs b/src/demean_accelerated/types.rs index 8cd429697..9e2125d3b 100644 --- a/src/demean_accelerated/types.rs +++ b/src/demean_accelerated/types.rs @@ -3,7 +3,7 @@ //! # Overview //! //! Fixed effects demeaning removes group means from data. For example, with -//! individual and time fixed effects, we remove both individual-specific and +//! individual and time-fixed effects, we remove both individual-specific and //! time-specific means from each observation. //! //! # Two Spaces @@ -29,7 +29,7 @@ //! //! - [`FixedEffectsIndex`]: Maps observations to their group IDs for each FE //! - [`ObservationWeights`]: Per-observation and per-group weight sums -//! - [`DemeanContext`]: Combines index + weights, provides scatter/gather operations +//! - [`DemeanContext`]: Combines index and weights, provides scatter/gather operations //! - [`FixestConfig`]: Algorithm parameters (tolerance, max iterations, etc.) use ndarray::{ArrayView1, ArrayView2}; @@ -68,7 +68,7 @@ pub struct FixedEffectsIndex { /// Number of observations (N). pub n_obs: usize, - /// Number of fixed effects (e.g., 2 for individual + time). + /// Number of fixed effects (e.g., 2 for individual and time). pub n_fe: usize, /// Flat group IDs in column-major order. @@ -204,7 +204,7 @@ pub struct ObservationWeights { /// Layout matches coefficient space: `[fe0_group0, ..., fe0_groupK, fe1_group0, ...]`. pub per_group: Vec, - /// True if all observation weights are 1.0 (enables fast path). + /// True if all observation weights are 1.0 (enables the fast path). pub is_uniform: bool, } @@ -223,7 +223,7 @@ impl ObservationWeights { pub fn new(weights: &ArrayView1, index: &FixedEffectsIndex) -> Self { // Tolerance for detecting uniform weights (all 1.0). 
// Using 1e-10 to account for floating-point representation errors - // while being strict enough to catch intentionally non-uniform weights. + // while being strict enough to intentionally catch non-uniform weights. const UNIFORM_WEIGHT_TOL: f64 = 1e-10; let is_uniform = weights.iter().all(|&w| (w - 1.0).abs() < UNIFORM_WEIGHT_TOL); @@ -280,8 +280,8 @@ impl ObservationWeights { /// ```ignore /// let ctx = DemeanContext::new(&flist, &weights); /// -/// // Scatter input to coefficient space -/// let coef_sums = ctx.scatter_to_coefficients(&input); +/// // Apply Dᵀ to get coefficient-space sums +/// let coef_sums = ctx.apply_design_matrix_t(&input); /// /// // Compute group means: coef[g] = coef_sums[g] / group_weight[g] /// // ... (done in solver) @@ -326,37 +326,38 @@ impl DemeanContext { } // ========================================================================= - // Scatter/Gather Operations + // Design Matrix Operations (D and Dᵀ) // ========================================================================= - /// Scatter values from observation space to coefficient space. + /// Apply transpose of design matrix: Dᵀ · values. /// /// Computes weighted sums of `values` for each group in each FE. /// Returns a vector of length `n_coef` with the aggregated sums. #[inline] - pub fn scatter_to_coefficients(&self, values: &[f64]) -> Vec { + pub fn apply_design_matrix_t(&self, values: &[f64]) -> Vec { let mut result = vec![0.0; self.index.n_coef]; - self.scatter_inner(values, None, &mut result); - result - } - - /// Scatter residuals from observation space to coefficient space. - /// - /// Like [`scatter_to_coefficients`], but first subtracts `baseline` from `values`. - /// Computes: `Σ (values[i] - baseline[i]) * weight[i]` for each group. 
- #[inline] - pub fn scatter_residuals(&self, values: &[f64], baseline: &[f64]) -> Vec { - let mut result = vec![0.0; self.index.n_coef]; - self.scatter_inner(values, Some(baseline), &mut result); + for q in 0..self.index.n_fe { + let offset = self.index.coef_start[q]; + let fe_ids = self.index.group_ids_for_fe(q); + if self.weights.is_uniform { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += values[i]; + } + } else { + for (i, &g) in fe_ids.iter().enumerate() { + result[offset + g] += values[i] * self.weights.per_obs[i]; + } + } + } result } - /// Gather coefficients to observation space and add to output. + /// Apply design matrix and add to output: output += D · coef. /// /// For each observation, looks up its coefficient for each FE and adds to output. /// Computes: `output[i] += Σ_q coef[offset_q + fe_q[i]]` #[inline] - pub fn gather_and_add(&self, coef: &[f64], output: &mut [f64]) { + pub fn apply_design_matrix(&self, coef: &[f64], output: &mut [f64]) { for q in 0..self.index.n_fe { let offset = self.index.coef_start[q]; let fe_ids = self.index.group_ids_for_fe(q); @@ -365,40 +366,6 @@ impl DemeanContext { } } } - - /// Inner scatter implementation with optional baseline subtraction. - /// - /// Handles both uniform and non-uniform weights with optimized code paths. 
- #[inline(always)] - fn scatter_inner(&self, values: &[f64], baseline: Option<&[f64]>, result: &mut [f64]) { - for q in 0..self.index.n_fe { - let offset = self.index.coef_start[q]; - let fe_ids = self.index.group_ids_for_fe(q); - - match (self.weights.is_uniform, baseline) { - (true, None) => { - for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += values[i]; - } - } - (true, Some(base)) => { - for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += values[i] - base[i]; - } - } - (false, None) => { - for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += values[i] * self.weights.per_obs[i]; - } - } - (false, Some(base)) => { - for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += (values[i] - base[i]) * self.weights.per_obs[i]; - } - } - } - } - } } // ============================================================================= @@ -457,9 +424,9 @@ impl Default for FixestConfig { /// Whether the iterative algorithm has converged. /// -/// Used throughout the demeaning module to represent convergence state +/// Used throughout the demeaning module to represent the convergence state /// in a self-documenting way, avoiding ambiguous boolean returns. -#[derive(Clone, Copy, PartialEq, Eq)] +#[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum ConvergenceState { /// Algorithm has converged; iteration can stop. Converged, diff --git a/src/detect_singletons.rs b/src/detect_singletons.rs index 1abcff335..bd4b380ff 100644 --- a/src/detect_singletons.rs +++ b/src/detect_singletons.rs @@ -65,7 +65,7 @@ pub fn _detect_singletons_rs(py: Python<'_>, ids: PyReadonlyArray2) -> Py

Date: Sun, 4 Jan 2026 23:59:03 +0100 Subject: [PATCH 08/24] Remove old demean implementation, use accelerated version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the simple alternating projections implementation with the accelerated Irons-Tuck algorithm as the sole Rust demean backend. Changes: - Remove src/demean.rs (old simple implementation) - Update demean.py to call _demean_accelerated_rs - Remove demean_accelerated.py (was only needed during development) - Update backends.py and demean_.py imports - Clean up tests to remove redundant fixtures The public Python API is unchanged - users calling demean() or using the "rust" backend get the accelerated implementation transparently. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- pyfixest/core/_core_impl.pyi | 7 - pyfixest/core/demean.py | 4 +- pyfixest/core/demean_accelerated.py | 79 ------- pyfixest/estimation/backends.py | 4 +- pyfixest/estimation/demean_.py | 4 +- src/demean.rs | 219 ------------------ .../accelerator.rs | 8 +- .../demeaner.rs | 6 +- src/{demean_accelerated => demean}/mod.rs | 6 +- .../projection.rs | 4 +- src/{demean_accelerated => demean}/types.rs | 0 src/lib.rs | 4 +- tests/test_demean.py | 23 +- 13 files changed, 23 insertions(+), 345 deletions(-) delete mode 100644 pyfixest/core/demean_accelerated.py delete mode 100644 src/demean.rs rename src/{demean_accelerated => demean}/accelerator.rs (98%) rename src/{demean_accelerated => demean}/demeaner.rs (98%) rename src/{demean_accelerated => demean}/mod.rs (98%) rename src/{demean_accelerated => demean}/projection.rs (98%) rename src/{demean_accelerated => demean}/types.rs (100%) diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index 8e4bed02d..6bb849ec5 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -20,11 +20,4 @@ def _count_fixef_fully_nested_all_rs( cluster_data: 
NDArray[np.uint64], fe_data: NDArray[np.uint64], ) -> tuple[np.ndarray, int]: ... -def _demean_accelerated_rs( - x: NDArray[np.float64], - flist: NDArray[np.uint64], - weights: NDArray[np.float64], - tol: float = 1e-08, - maxiter: int = 100_000, -) -> tuple[np.ndarray, bool]: ... def _detect_singletons_rs(ids: NDArray[np.uint32]) -> NDArray[np.bool_]: ... diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 95cd97e88..8af8c8bbe 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -8,7 +8,7 @@ def demean( x: NDArray[np.float64], flist: NDArray[np.uint64], weights: NDArray[np.float64], - tol: float = 1e-06, + tol: float = 1e-08, maxiter: int = 100_000, ) -> tuple[NDArray, bool]: """ @@ -27,7 +27,7 @@ def demean( weights : numpy.ndarray Array of shape (n_samples,) specifying the weights. tol : float, optional - Tolerance criterion for convergence. Defaults to 1e-06 (matching fixest). + Tolerance criterion for convergence. Defaults to 1e-08. maxiter : int, optional Maximum number of iterations. Defaults to 100_000. diff --git a/pyfixest/core/demean_accelerated.py b/pyfixest/core/demean_accelerated.py deleted file mode 100644 index a55dda72d..000000000 --- a/pyfixest/core/demean_accelerated.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -from numpy.typing import NDArray - -from ._core_impl import _demean_accelerated_rs - - -def demean_accelerated( - x: NDArray[np.float64], - flist: NDArray[np.uint64], - weights: NDArray[np.float64], - tol: float = 1e-08, - maxiter: int = 100_000, -) -> tuple[NDArray, bool]: - """ - Demean an array. - - Workhorse for demeaning an input array `x` based on the specified fixed - effects and weights via the alternating projections algorithm. - - Parameters - ---------- - x : numpy.ndarray - Input array of shape (n_samples, n_features). Needs to be of type float. - flist : numpy.ndarray - Array of shape (n_samples, n_factors) specifying the fixed effects. - Needs to already be converted to integers. 
- weights : numpy.ndarray - Array of shape (n_samples,) specifying the weights. - tol : float, optional - Tolerance criterion for convergence. Defaults to 1e-08. - maxiter : int, optional - Maximum number of iterations. Defaults to 100_000. - - Returns - ------- - tuple[numpy.ndarray, bool] - A tuple containing the demeaned array of shape (n_samples, n_features) - and a boolean indicating whether the algorithm converged successfully. - - Examples - -------- - ```{python} - import numpy as np - import pyfixest as pf - from pyfixest.utils.dgps import get_blw - from pyfixest.estimation.demean_ import demean - from formulaic import model_matrix - - fml = "y ~ treat | state + year" - - data = get_blw() - data.head() - - Y, rhs = model_matrix(fml, data) - X = rhs[0].drop(columns="Intercept") - fe = rhs[1].drop(columns="Intercept") - YX = np.concatenate([Y, X], axis=1) - - # to numpy - Y = Y.to_numpy() - X = X.to_numpy() - YX = np.concatenate([Y, X], axis=1) - fe = fe.to_numpy().astype(int) # demean requires fixed effects as ints! 
- - YX_demeaned, success = demean(YX, fe, weights = np.ones(YX.shape[0])) - Y_demeaned = YX_demeaned[:, 0] - X_demeaned = YX_demeaned[:, 1:] - - print(np.linalg.lstsq(X_demeaned, Y_demeaned, rcond=None)[0]) - print(pf.feols(fml, data).coef()) - ``` - """ - return _demean_accelerated_rs( - x.astype(np.float64, copy=False), - flist.astype(np.uint64, copy=False), - weights.astype(np.float64, copy=False), - tol, - maxiter, - ) diff --git a/pyfixest/estimation/backends.py b/pyfixest/estimation/backends.py index 51f9891b5..ad650310b 100644 --- a/pyfixest/estimation/backends.py +++ b/pyfixest/estimation/backends.py @@ -1,6 +1,6 @@ from pyfixest.core.collinear import find_collinear_variables from pyfixest.core.crv1 import crv1_meat_loop -from pyfixest.core.demean_accelerated import demean_accelerated +from pyfixest.core.demean import demean as demean_rust from pyfixest.core.nested_fixed_effects import count_fixef_fully_nested_all from pyfixest.estimation.demean_ import demean as demean_nb from pyfixest.estimation.numba.find_collinear_variables_nb import ( @@ -53,7 +53,7 @@ "nonnested": count_fixef_fully_nested_all_nb, }, "rust": { - "demean": demean_accelerated, + "demean": demean_rust, "collinear": find_collinear_variables, "crv1_meat": crv1_meat_loop, "nonnested": count_fixef_fully_nested_all, diff --git a/pyfixest/estimation/demean_.py b/pyfixest/estimation/demean_.py index 0354e454a..84c94e548 100644 --- a/pyfixest/estimation/demean_.py +++ b/pyfixest/estimation/demean_.py @@ -346,9 +346,9 @@ def _set_demeaner_backend( If the demeaning backend is not supported. 
""" if demeaner_backend == "rust": - from pyfixest.core.demean_accelerated import demean_accelerated + from pyfixest.core.demean import demean as demean_rust - return demean_accelerated + return demean_rust elif demeaner_backend == "numba": return demean elif demeaner_backend == "jax": diff --git a/src/demean.rs b/src/demean.rs deleted file mode 100644 index 22098bade..000000000 --- a/src/demean.rs +++ /dev/null @@ -1,219 +0,0 @@ -use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; -use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; -use pyo3::prelude::*; -use rayon::prelude::*; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; - -mod internal { - pub(super) fn sad_converged(a: &[f64], b: &[f64], tol: f64) -> bool { - a.iter().zip(b).all(|(&x, &y)| (x - y).abs() < tol) - } - - pub(super) fn subtract_weighted_group_mean( - x: &mut [f64], - sample_weights: &[f64], - group_ids: &[usize], - group_weights: &[f64], - group_weighted_sums: &mut [f64], - ) { - group_weighted_sums.fill(0.0); - - // Accumulate weighted sums per group - x.iter() - .zip(sample_weights) - .zip(group_ids) - .for_each(|((&xi, &wi), &gid)| { - group_weighted_sums[gid] += wi * xi; - }); - - // Compute group means - let group_means: Vec = group_weighted_sums - .iter() - .zip(group_weights) - .map(|(&sum, &weight)| sum / weight) - .collect(); - - // Subtract means from each sample - x.iter_mut().zip(group_ids).for_each(|(xi, &gid)| { - *xi -= group_means[gid]; - }); - } - - pub(super) fn calc_group_weights( - sample_weights: &[f64], - group_ids: &[usize], - n_samples: usize, - n_factors: usize, - n_groups: usize, - ) -> Vec { - let mut group_weights = vec![0.0; n_factors * n_groups]; - for i in 0..n_samples { - let weight = sample_weights[i]; - for j in 0..n_factors { - let id = group_ids[i * n_factors + j]; - group_weights[j * n_groups + id] += weight; - } - } - group_weights - } -} - -fn demean_impl( - x: &ArrayView2, - flist: &ArrayView2, - weights: &ArrayView1, - tol: 
f64, - maxiter: usize, -) -> (Array2, bool) { - let (n_samples, n_features) = x.dim(); - let n_factors = flist.ncols(); - let n_groups = flist.iter().cloned().max().unwrap() + 1; - - let sample_weights: Vec = weights.iter().cloned().collect(); - let group_ids: Vec = flist.iter().cloned().collect(); - let group_weights = - internal::calc_group_weights(&sample_weights, &group_ids, n_samples, n_factors, n_groups); - - let not_converged = Arc::new(AtomicUsize::new(0)); - - // Precompute slices of group_ids for each factor - let group_ids_by_factor: Vec> = (0..n_factors) - .map(|j| { - (0..n_samples) - .map(|i| group_ids[i * n_factors + j]) - .collect() - }) - .collect(); - - // Precompute group weight slices - let group_weight_slices: Vec<&[f64]> = (0..n_factors) - .map(|j| &group_weights[j * n_groups..(j + 1) * n_groups]) - .collect(); - - let process_column = |(k, mut col): (usize, ndarray::ArrayViewMut1)| { - let mut xk_curr: Vec = (0..n_samples).map(|i| x[[i, k]]).collect(); - let mut xk_prev: Vec = xk_curr.iter().map(|&v| v - 1.0).collect(); - let mut gw_sums = vec![0.0; n_groups]; - - let mut converged = false; - for _ in 0..maxiter { - for j in 0..n_factors { - internal::subtract_weighted_group_mean( - &mut xk_curr, - &sample_weights, - &group_ids_by_factor[j], - group_weight_slices[j], - &mut gw_sums, - ); - } - - if internal::sad_converged(&xk_curr, &xk_prev, tol) { - converged = true; - break; - } - xk_prev.copy_from_slice(&xk_curr); - } - - if !converged { - not_converged.fetch_add(1, Ordering::SeqCst); - } - Zip::from(&mut col).and(&xk_curr).for_each(|col_elm, &val| { - *col_elm = val; - }); - }; - - let mut res = Array2::::zeros((n_samples, n_features)); - - res.axis_iter_mut(ndarray::Axis(1)) - .into_par_iter() - .enumerate() - .for_each(process_column); - - let success = not_converged.load(Ordering::SeqCst) == 0; - (res, success) -} - - -/// Demean a 2D array x by a set of fixed effects using the alternating -/// projection algorithm. 
-/// -/// Parameters -/// ---------- -/// x : np.ndarray[float64] -/// 2D array of data to be demeaned (shape: observations x variables). -/// flist : np.ndarray[usize] -/// 2D array of group indicators (shape: observations x the number of fixed effects), must be integer-encoded. -/// weights : np.ndarray[float64] -/// 1D array of observation weights (length: observations). -/// tol : float, optional -/// Convergence tolerance (default: 1e-8). -/// maxiter : int, optional -/// Maximum number of iterations (default: 100000). -/// -/// Returns -/// ------- -/// (np.ndarray[float64], bool) -/// Tuple with: -/// - demeaned array (same shape as `x`) -/// - success flag (True if converged, False if maxiter was reached) -/// -/// Notes -/// ----- -/// This function performs iterative demeaning to remove all group means specified by -/// `flist` from the data `x`, optionally using observation weights. Convergence is -/// determined when the change between iterations falls below `tol`. -/// Note that flist must be a 2D array of integers. NaNs are not allowed in -/// either `x` or `flist`. 
-/// -/// Example -/// ------- -/// ```python -/// import numpy as np -/// from pyfixest.core.demean import _demean_rs -/// -/// # Sample data: 5 observations, 2 variables -/// x = np.array([[10.0, 2.0], -/// [11.0, 3.0], -/// [12.0, 4.0], -/// [20.0, 5.0], -/// [21.0, 6.0]]) -/// -/// # Grouping by two categorical variables, integer-encoded -/// flist = np.array([[0, 1], -/// [0, 2], -/// [0, 2], -/// [1, 1], -/// [1, 2]]) -/// -/// # All observations equally weighted -/// weights = np.ones(5) -/// -/// # Call the function -/// x_demeaned, converged = _demean_rs(x, flist, weights) -/// -/// print("Demeaned x:") -/// print(x_demeaned) -/// print("Converged:", converged) -/// ``` - -#[pyfunction] -#[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] -pub fn _demean_rs( - py: Python<'_>, - x: PyReadonlyArray2, - flist: PyReadonlyArray2, - weights: PyReadonlyArray1, - tol: f64, - maxiter: usize, -) -> PyResult<(Py>, bool)> { - let x_arr = x.as_array(); - let flist_arr = flist.as_array(); - let weights_arr = weights.as_array(); - - let (out, success) = - py.detach(|| demean_impl(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); - - let pyarray = PyArray2::from_owned_array(py, out); - Ok((pyarray.into(), success)) -} diff --git a/src/demean_accelerated/accelerator.rs b/src/demean/accelerator.rs similarity index 98% rename from src/demean_accelerated/accelerator.rs rename to src/demean/accelerator.rs index f259be8a7..4ca5aca50 100644 --- a/src/demean_accelerated/accelerator.rs +++ b/src/demean/accelerator.rs @@ -3,8 +3,8 @@ //! This module provides [`IronsTuckGrand`], the acceleration strategy matching //! fixest's implementation. 
-use crate::demean_accelerated::projection::Projector; -use crate::demean_accelerated::types::{ConvergenceState, FixestConfig}; +use crate::demean::projection::Projector; +use crate::demean::types::{ConvergenceState, FixestConfig}; // ============================================================================= // Internal Types @@ -408,8 +408,8 @@ impl IronsTuckGrand { #[cfg(test)] mod tests { use super::*; - use crate::demean_accelerated::projection::TwoFEProjector; - use crate::demean_accelerated::types::DemeanContext; + use crate::demean::projection::TwoFEProjector; + use crate::demean::types::DemeanContext; use ndarray::{Array1, Array2}; /// Create a test problem with 2 fixed effects diff --git a/src/demean_accelerated/demeaner.rs b/src/demean/demeaner.rs similarity index 98% rename from src/demean_accelerated/demeaner.rs rename to src/demean/demeaner.rs index 7508c3283..8291ec63f 100644 --- a/src/demean_accelerated/demeaner.rs +++ b/src/demean/demeaner.rs @@ -13,9 +13,9 @@ //! This is important for parallel processing where each thread can have its own //! demeaner instance that reuses buffers across columns. 
-use crate::demean_accelerated::accelerator::IronsTuckGrand; -use crate::demean_accelerated::projection::{MultiFEProjector, TwoFEProjector}; -use crate::demean_accelerated::types::{ConvergenceState, DemeanContext, FixestConfig}; +use crate::demean::accelerator::IronsTuckGrand; +use crate::demean::projection::{MultiFEProjector, TwoFEProjector}; +use crate::demean::types::{ConvergenceState, DemeanContext, FixestConfig}; // ============================================================================= // Demeaner Trait diff --git a/src/demean_accelerated/mod.rs b/src/demean/mod.rs similarity index 98% rename from src/demean_accelerated/mod.rs rename to src/demean/mod.rs index 689776030..778399443 100644 --- a/src/demean_accelerated/mod.rs +++ b/src/demean/mod.rs @@ -77,7 +77,7 @@ impl<'a> ThreadLocalDemeaner<'a> { /// /// Uses `for_each_init` to create one demeaner per thread, reusing buffers /// across all columns processed by that thread. -pub(crate) fn demean_accelerated( +pub(crate) fn demean( x: &ArrayView2, flist: &ArrayView2, weights: &ArrayView1, @@ -131,7 +131,7 @@ pub(crate) fn demean_accelerated( /// Python-exposed function for accelerated demeaning. 
#[pyfunction] #[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] -pub fn _demean_accelerated_rs( +pub fn _demean_rs( py: Python<'_>, x: PyReadonlyArray2, flist: PyReadonlyArray2, @@ -144,7 +144,7 @@ pub fn _demean_accelerated_rs( let weights_arr = weights.as_array(); let (out, success) = - py.detach(|| demean_accelerated(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); let pyarray = PyArray2::from_owned_array(py, out); Ok((pyarray.into(), success)) diff --git a/src/demean_accelerated/projection.rs b/src/demean/projection.rs similarity index 98% rename from src/demean_accelerated/projection.rs rename to src/demean/projection.rs index 8c27ee8d8..9ad985635 100644 --- a/src/demean_accelerated/projection.rs +++ b/src/demean/projection.rs @@ -19,10 +19,10 @@ //! //! # Usage with Accelerators //! -//! Projectors are used with [`IronsTuckGrand`](crate::demean_accelerated::accelerator::IronsTuckGrand) +//! Projectors are used with [`IronsTuckGrand`](crate::demean::accelerator::IronsTuckGrand) //! which handles the iteration strategy. 
-use crate::demean_accelerated::types::DemeanContext; +use crate::demean::types::DemeanContext; // ============================================================================= // Projector Trait diff --git a/src/demean_accelerated/types.rs b/src/demean/types.rs similarity index 100% rename from src/demean_accelerated/types.rs rename to src/demean/types.rs diff --git a/src/lib.rs b/src/lib.rs index d1cf3b5c7..9551b0bab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,17 +5,15 @@ mod crv1; mod demean; mod detect_singletons; mod nested_fixed_effects; -mod demean_accelerated; #[pymodule] fn _core_impl(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(collinear::_find_collinear_variables_rs))?; m.add_wrapped(wrap_pyfunction!(crv1::_crv1_meat_loop_rs))?; m.add_wrapped(wrap_pyfunction!(demean::_demean_rs))?; + m.add_wrapped(wrap_pyfunction!(detect_singletons::_detect_singletons_rs))?; m.add_wrapped(wrap_pyfunction!( nested_fixed_effects::_count_fixef_fully_nested_all_rs ))?; - m.add_wrapped(wrap_pyfunction!(demean_accelerated::_demean_accelerated_rs))?; - m.add_wrapped(wrap_pyfunction!(detect_singletons::_detect_singletons_rs))?; Ok(()) } diff --git a/tests/test_demean.py b/tests/test_demean.py index ef5814e0b..5f20a60ed 100644 --- a/tests/test_demean.py +++ b/tests/test_demean.py @@ -3,8 +3,7 @@ import pyhdfe import pytest -from pyfixest.core import demean as demean_rs -from pyfixest.core.demean_accelerated import demean_accelerated as demean_accelerated_rs +from pyfixest.core.demean import demean as demean_rs from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32, demean_cupy64 from pyfixest.estimation.demean_ import _set_demeaner_backend, demean, demean_model from pyfixest.estimation.jax.demean_jax_ import demean_jax @@ -16,7 +15,6 @@ demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -24,7 +22,6 @@ "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", 
"demean_cupy64", ], @@ -65,7 +62,7 @@ def test_set_demeaner_backend(): assert demean_func == demean_jax demean_func = _set_demeaner_backend("rust") - assert demean_func == demean_accelerated_rs + assert demean_func == demean_rs demean_func = _set_demeaner_backend("cupy32") assert demean_func == demean_cupy32 @@ -84,7 +81,6 @@ def test_set_demeaner_backend(): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -92,7 +88,6 @@ def test_set_demeaner_backend(): "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -133,7 +128,6 @@ def test_demean_model_no_fixed_effects(benchmark, demean_func): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -141,7 +135,6 @@ def test_demean_model_no_fixed_effects(benchmark, demean_func): "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -193,7 +186,6 @@ def test_demean_model_with_fixed_effects(benchmark, demean_func): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -201,7 +193,6 @@ def test_demean_model_with_fixed_effects(benchmark, demean_func): "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -255,7 +246,6 @@ def test_demean_model_with_weights(benchmark, demean_func): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -263,7 +253,6 @@ def test_demean_model_with_weights(benchmark, demean_func): "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -338,7 +327,6 @@ def test_demean_model_caching(benchmark, demean_func): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -346,7 +334,6 @@ def test_demean_model_caching(benchmark, demean_func): "demean_numba", "demean_jax", "demean_rs", - 
"demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -386,7 +373,6 @@ def test_demean_model_maxiter_convergence_failure(demean_func): demean, demean_jax, demean_rs, - demean_accelerated_rs, demean_cupy32, demean_cupy64, ], @@ -394,7 +380,6 @@ def test_demean_model_maxiter_convergence_failure(demean_func): "demean_numba", "demean_jax", "demean_rs", - "demean_accelerated_rs", "demean_cupy32", "demean_cupy64", ], @@ -476,8 +461,8 @@ def test_feols_integration_maxiter(): @pytest.mark.parametrize( argnames="demean_func", - argvalues=[demean_rs, demean_accelerated_rs, demean_cupy32, demean_cupy64], - ids=["demean_rs", "demean_accelerated_rs", "demean_cupy32", "demean_cupy64"], + argvalues=[demean_rs, demean_cupy32, demean_cupy64], + ids=["demean_rs", "demean_cupy32", "demean_cupy64"], ) def test_demean_complex_fixed_effects(benchmark, demean_func): """Benchmark demean functions with complex multi-level fixed effects.""" From 420d7fc3127cdb0208861873f3be7e7271c477c7 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Mon, 5 Jan 2026 11:49:23 +0100 Subject: [PATCH 09/24] Add FE reordering by size for faster convergence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorder fixed effects by number of groups (largest first) to match fixest's default `fixef.reorder = TRUE` behavior. This improves convergence for 3+ FE cases by making the 2-FE sub-convergence phase work on the largest FEs first. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/demean/demeaner.rs | 59 ++++++++++++++++---------- src/demean/mod.rs | 85 ++++++++++++++++++++++--------------- src/demean/types.rs | 96 ++++++++++++++++++++++++++++++++++++++---- 3 files changed, 174 insertions(+), 66 deletions(-) diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs index 8291ec63f..90bca66ab 100644 --- a/src/demean/demeaner.rs +++ b/src/demean/demeaner.rs @@ -15,7 +15,7 @@ use crate::demean::accelerator::IronsTuckGrand; use crate::demean::projection::{MultiFEProjector, TwoFEProjector}; -use crate::demean::types::{ConvergenceState, DemeanContext, FixestConfig}; +use crate::demean::types::{ConvergenceState, DemeanContext, DemeanResult, FixestConfig}; // ============================================================================= // Demeaner Trait @@ -26,12 +26,16 @@ use crate::demean::types::{ConvergenceState, DemeanContext, FixestConfig}; /// Demeaners own references to their context and configuration, as well as /// working buffers that are reused across multiple `solve()` calls. pub trait Demeaner { - /// Solve the demeaning problem. + /// Solve the demeaning problem for a single column. 
/// /// # Returns /// - /// Tuple of (demeaned_output, iterations_used, convergence_state) - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState); + /// A `DemeanResult` containing: + /// - `demeaned`: The input with fixed effects removed + /// - `success`: Whether the algorithm converged + /// - `iterations`: Number of iterations (0 for closed-form solutions) + /// - `coefficients`: FE coefficients (`None` for 3+ FE case) + fn solve(&mut self, input: &[f64]) -> DemeanResult; } // ============================================================================= @@ -54,7 +58,7 @@ impl<'a> SingleFEDemeaner<'a> { } impl Demeaner for SingleFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { + fn solve(&mut self, input: &[f64]) -> DemeanResult { let n_obs = self.ctx.index.n_obs; // Apply Dᵀ to get coefficient-space sums @@ -63,18 +67,18 @@ impl Demeaner for SingleFEDemeaner<'_> { let fe0 = self.ctx.index.group_ids_for_fe(0); let group_weights = self.ctx.group_weights_for_fe(0); - // coef[g] = in_out[g] / group_weights[g] - let coef: Vec = in_out - .iter() - .zip(group_weights.iter()) - .map(|(&io, &sw)| io / sw) + // output[i] = input[i] - group_mean[fe0[i]] + // where group_mean[g] = in_out[g] / group_weights[g] + let demeaned: Vec = (0..n_obs) + .map(|i| input[i] - in_out[fe0[i]] / group_weights[fe0[i]]) .collect(); - // output[i] = input[i] - coef[fe0[i]] - let output: Vec = (0..n_obs).map(|i| input[i] - coef[fe0[i]]).collect(); - // Single FE is a closed-form solution, always converges in 0 iterations - (output, 0, ConvergenceState::Converged) + DemeanResult { + demeaned, + convergence: ConvergenceState::Converged, + iterations: 0, + } } } @@ -112,7 +116,7 @@ impl<'a> TwoFEDemeaner<'a> { } impl Demeaner for TwoFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { + fn solve(&mut self, input: &[f64]) -> DemeanResult { let n_obs = self.ctx.index.n_obs; let n0 = 
self.ctx.index.n_groups[0]; @@ -134,11 +138,15 @@ impl Demeaner for TwoFEDemeaner<'_> { let fe0 = self.ctx.index.group_ids_for_fe(0); let fe1 = self.ctx.index.group_ids_for_fe(1); - let result: Vec = (0..n_obs) + let demeaned: Vec = (0..n_obs) .map(|i| input[i] - self.coef[fe0[i]] - self.coef[n0 + fe1[i]]) .collect(); - (result, iter, convergence) + DemeanResult { + demeaned, + convergence, + iterations: iter, + } } } @@ -153,7 +161,7 @@ impl Demeaner for TwoFEDemeaner<'_> { struct MultiFEBuffers { /// Accumulated fixed effects per observation (observation-space) mu: Vec, - /// Coefficient array for all FEs (coefficient-space) + /// Working coefficient array for accelerator (reset each phase) coef: Vec, /// Coefficient array for 2-FE sub-convergence (coefficient-space, first 2 FEs only) coef_2fe: Vec, @@ -303,24 +311,29 @@ impl<'a> MultiFEDemeaner<'a> { } } - /// Compute final output and return result tuple. + /// Compute final output and return result. fn finalize_output( &self, input: &[f64], iter: usize, convergence: ConvergenceState, - ) -> (Vec, usize, ConvergenceState) { - let output: Vec = input + ) -> DemeanResult { + let demeaned: Vec = input .iter() .zip(self.buffers.mu.iter()) .map(|(&x, &mu)| x - mu) .collect(); - (output, iter, convergence) + + DemeanResult { + demeaned, + convergence, + iterations: iter, + } } } impl Demeaner for MultiFEDemeaner<'_> { - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { + fn solve(&mut self, input: &[f64]) -> DemeanResult { self.buffers.reset(); // Phase 1: Warmup with all FEs diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 778399443..4106a7ca0 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -32,7 +32,7 @@ pub mod projection; pub mod types; use demeaner::{Demeaner, MultiFEDemeaner, SingleFEDemeaner, TwoFEDemeaner}; -use types::{ConvergenceState, DemeanContext, FixestConfig}; +use types::{ConvergenceState, DemeanContext, DemeanResult, FixestConfig}; use ndarray::{Array2, 
ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; @@ -64,7 +64,7 @@ impl<'a> ThreadLocalDemeaner<'a> { /// Solve the demeaning problem, reusing internal buffers. #[inline] - fn solve(&mut self, input: &[f64]) -> (Vec, usize, ConvergenceState) { + fn solve(&mut self, input: &[f64]) -> DemeanResult { match self { ThreadLocalDemeaner::Single(d) => d.solve(input), ThreadLocalDemeaner::Two(d) => d.solve(input), @@ -77,6 +77,12 @@ impl<'a> ThreadLocalDemeaner<'a> { /// /// Uses `for_each_init` to create one demeaner per thread, reusing buffers /// across all columns processed by that thread. +/// +/// # Returns +/// +/// A tuple of (demeaned_data, success) where: +/// - `demeaned_data`: The demeaned data as an `Array2` +/// - `success`: True if all columns converged pub(crate) fn demean( x: &ArrayView2, flist: &ArrayView2, @@ -95,7 +101,8 @@ pub(crate) fn demean( let not_converged = Arc::new(AtomicUsize::new(0)); let mut res = Array2::::zeros((n_samples, n_features)); - let ctx = DemeanContext::new(flist, weights); + // Use reorder_fe from config (default true, matching fixest) + let ctx = DemeanContext::with_config(flist, weights, config.reorder_fe); res.axis_iter_mut(ndarray::Axis(1)) .into_par_iter() @@ -107,20 +114,22 @@ pub(crate) fn demean( |demeaner, (k, mut col)| { let col_view = x.column(k); // Zero-copy if column is contiguous (F-order), otherwise copy - let (result, _iter, convergence) = if let Some(slice) = col_view.as_slice() { + let result = if let Some(slice) = col_view.as_slice() { demeaner.solve(slice) } else { let xk: Vec = col_view.to_vec(); demeaner.solve(&xk) }; - if convergence == ConvergenceState::NotConverged { + if result.convergence == ConvergenceState::NotConverged { not_converged.fetch_add(1, Ordering::SeqCst); } - Zip::from(&mut col).and(&result).for_each(|col_elm, &val| { - *col_elm = val; - }); + Zip::from(&mut col) + .and(&result.demeaned) + .for_each(|col_elm, &val| { + *col_elm = val; + }); }, ); @@ 
-129,6 +138,8 @@ pub(crate) fn demean( } /// Python-exposed function for accelerated demeaning. +/// +/// Returns a tuple of (demeaned_array, success). #[pyfunction] #[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] pub fn _demean_rs( @@ -143,10 +154,9 @@ pub fn _demean_rs( let flist_arr = flist.as_array(); let weights_arr = weights.as_array(); - let (out, success) = - py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + let (demeaned, success) = py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); - let pyarray = PyArray2::from_owned_array(py, out); + let pyarray = PyArray2::from_owned_array(py, demeaned); Ok((pyarray.into(), success)) } @@ -174,11 +184,11 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, iter, convergence) = demeaner.solve(&input); + let result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged, "Should converge"); - assert!(iter < 100, "Should converge quickly"); - assert!(result.iter().all(|&v| v.is_finite())); + assert_eq!(result.convergence, ConvergenceState::Converged, "Should converge"); + assert!(result.iterations < 100, "Should converge quickly"); + assert!(result.demeaned.iter().all(|&v| v.is_finite())); } #[test] @@ -200,10 +210,10 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = MultiFEDemeaner::new(&ctx, &config); - let (result, _iter, convergence) = demeaner.solve(&input); + let result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged); - assert!(result.iter().all(|&v| v.is_finite())); + assert_eq!(result.convergence, ConvergenceState::Converged); + assert!(result.demeaned.iter().all(|&v| v.is_finite())); } #[test] @@ -222,14 +232,15 @@ mod tests { let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); - let (result, iter, convergence) = demeaner.solve(&input); + let 
result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged, "Single FE should always converge"); - assert_eq!(iter, 0, "Single FE should be closed-form (0 iterations)"); + assert_eq!(result.convergence, ConvergenceState::Converged, "Single FE should always converge"); + assert_eq!(result.iterations, 0, "Single FE should be closed-form (0 iterations)"); // Verify demeaning: each group's sum should be approximately 0 for g in 0..n_groups { let group_sum: f64 = result + .demeaned .iter() .enumerate() .filter(|(i, _)| i % n_groups == g) @@ -267,11 +278,11 @@ mod tests { let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, convergence) = demeaner.solve(&input); + let result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged, "Weighted regression should converge"); + assert_eq!(result.convergence, ConvergenceState::Converged, "Weighted regression should converge"); assert!( - result.iter().all(|&v| v.is_finite()), + result.demeaned.iter().all(|&v| v.is_finite()), "All results should be finite" ); } @@ -293,15 +304,15 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, convergence) = demeaner.solve(&input); + let result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged, "Singleton groups should converge"); + assert_eq!(result.convergence, ConvergenceState::Converged, "Singleton groups should converge"); // With singleton groups in FE 0, each observation's own mean is subtracted, // then adjusted for FE 1. The result should be all zeros since each // observation perfectly absorbs its own value in FE 0. 
assert!( - result.iter().all(|&v| v.abs() < 1e-10), + result.demeaned.iter().all(|&v| v.abs() < 1e-10), "Singleton groups should yield near-zero residuals" ); } @@ -323,11 +334,11 @@ mod tests { let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let (result, _iter, convergence) = demeaner.solve(&input); + let result = demeaner.solve(&input); - assert_eq!(convergence, ConvergenceState::Converged, "Small groups should converge"); + assert_eq!(result.convergence, ConvergenceState::Converged, "Small groups should converge"); assert!( - result.iter().all(|&v| v.is_finite()), + result.demeaned.iter().all(|&v| v.is_finite()), "All results should be finite" ); } @@ -382,12 +393,12 @@ mod tests { let input1: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let input2: Vec = (0..n_obs).map(|i| (i as f64) * 0.2 + 1.0).collect(); - let (result1a, _, _) = demeaner.solve(&input1); - let (result2, _, _) = demeaner.solve(&input2); - let (result1b, _, _) = demeaner.solve(&input1); + let result1a = demeaner.solve(&input1); + let result2 = demeaner.solve(&input2); + let result1b = demeaner.solve(&input1); // Results for the same input should be identical - for (a, b) in result1a.iter().zip(result1b.iter()) { + for (a, b) in result1a.demeaned.iter().zip(result1b.demeaned.iter()) { assert!( (a - b).abs() < 1e-12, "Buffer reuse should produce identical results" @@ -396,7 +407,11 @@ mod tests { // Results for different inputs should be different assert!( - result1a.iter().zip(result2.iter()).any(|(a, b)| (a - b).abs() > 0.01), + result1a + .demeaned + .iter() + .zip(result2.demeaned.iter()) + .any(|(a, b)| (a - b).abs() > 0.01), "Different inputs should produce different results" ); } diff --git a/src/demean/types.rs b/src/demean/types.rs index 9e2125d3b..bfeb03c62 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -106,18 +106,51 @@ impl FixedEffectsIndex { /// # Panics /// /// Panics in debug builds if `n_obs == 0` or `n_fe == 
0`. + #[allow(dead_code)] pub fn new(flist: &ArrayView2) -> Self { + Self::with_reorder(flist, false) + } + + /// Create a fixed effects index, optionally reordering FEs by size. + /// + /// When `reorder_fe` is true, fixed effects are sorted by number of groups + /// (largest first). This matches R's fixest behavior and improves convergence + /// for 3+ FE cases by making the 2-FE sub-convergence phase work on the + /// largest FEs first. + /// + /// # Arguments + /// + /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)`. + /// * `reorder_fe` - Whether to reorder FEs by size (largest first). + /// + /// # Returns + /// + /// A `FixedEffectsIndex` with `original_order` tracking the mapping from + /// current indices to original indices. + pub fn with_reorder(flist: &ArrayView2, reorder_fe: bool) -> Self { let (n_obs, n_fe) = flist.dim(); debug_assert!(n_obs > 0, "Cannot create FixedEffectsIndex with 0 observations"); debug_assert!(n_fe > 0, "Cannot create FixedEffectsIndex with 0 fixed effects"); - // Compute n_groups: max group_id + 1 for each FE - let n_groups: Vec = (0..n_fe) + // Compute n_groups: max group_id + 1 for each FE (in original order) + let n_groups_original: Vec = (0..n_fe) .map(|j| flist.column(j).iter().max().unwrap_or(&0) + 1) .collect(); - // Compute coefficient start indices (cumulative sum of n_groups) + // Determine the order: either sorted by size or identity + let order: Vec = if reorder_fe && n_fe > 1 { + let mut indices: Vec = (0..n_fe).collect(); + indices.sort_by_key(|&i| std::cmp::Reverse(n_groups_original[i])); + indices + } else { + (0..n_fe).collect() + }; + + // Reorder n_groups according to the sort order + let n_groups: Vec = order.iter().map(|&i| n_groups_original[i]).collect(); + + // Compute coefficient start indices (cumulative sum of reordered n_groups) let mut coef_start = vec![0usize; n_fe]; for q in 1..n_fe { coef_start[q] = coef_start[q - 1] + n_groups[q - 1]; @@ -125,11 +158,11 @@ impl FixedEffectsIndex { 
let n_coef: usize = n_groups.iter().sum(); // Transpose group_ids from row-major (obs, fe) to column-major (fe, obs) - // This layout is better for the inner loops which iterate over observations + // applying the reordering during the transpose (zero extra cost) let mut group_ids = vec![0usize; n_fe * n_obs]; - for q in 0..n_fe { - for (i, &g) in flist.column(q).iter().enumerate() { - group_ids[q * n_obs + i] = g; + for (new_q, &old_q) in order.iter().enumerate() { + for (i, &g) in flist.column(old_q).iter().enumerate() { + group_ids[new_q * n_obs + i] = g; } } @@ -305,7 +338,27 @@ impl DemeanContext { /// # Panics /// /// Panics in debug builds if `weights.len() != flist.nrows()`. + #[allow(dead_code)] pub fn new(flist: &ArrayView2, weights: &ArrayView1) -> Self { + Self::with_config(flist, weights, false) + } + + /// Create a demeaning context with configuration options. + /// + /// # Arguments + /// + /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` + /// * `weights` - Per-observation weights (length: `n_obs`) + /// * `reorder_fe` - Whether to reorder FEs by size (largest first) + /// + /// # Panics + /// + /// Panics in debug builds if `weights.len() != flist.nrows()`. + pub fn with_config( + flist: &ArrayView2, + weights: &ArrayView1, + reorder_fe: bool, + ) -> Self { debug_assert_eq!( weights.len(), flist.nrows(), @@ -314,7 +367,7 @@ impl DemeanContext { flist.nrows() ); - let index = FixedEffectsIndex::new(flist); + let index = FixedEffectsIndex::with_reorder(flist, reorder_fe); let weights = ObservationWeights::new(weights, &index); Self { index, weights } } @@ -366,6 +419,7 @@ impl DemeanContext { } } } + } // ============================================================================= @@ -396,6 +450,12 @@ pub struct FixestConfig { /// Iterations between SSR-based convergence checks. pub ssr_check_interval: usize, + + /// Whether to reorder fixed effects by size (largest first). 
+ /// This matches fixest's default behavior and improves convergence + /// for 3+ FE cases by making the 2-FE sub-convergence phase work + /// on the largest FEs first. + pub reorder_fe: bool, } impl Default for FixestConfig { @@ -414,6 +474,8 @@ impl Default for FixestConfig { iter_grand_acc: 4, // SSR convergence check frequency ssr_check_interval: 40, + // Reorder FEs by size (matches fixest's fixef.reorder = TRUE default) + reorder_fe: true, } } } @@ -433,3 +495,21 @@ pub enum ConvergenceState { /// Algorithm has not yet converged; continue iterating. NotConverged, } + +// ============================================================================= +// DemeanResult +// ============================================================================= + +/// Result of a demeaning operation (single column). +#[derive(Debug, Clone)] +pub struct DemeanResult { + /// Demeaned data (single column, length `n_obs`). + pub demeaned: Vec, + + /// Convergence state. + pub convergence: ConvergenceState, + + /// Number of iterations used (0 for closed-form solutions). + #[allow(dead_code)] + pub iterations: usize, +} From a39ab4b5505c4edc1ce4cdca1e79c99939c6b0e4 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Mon, 5 Jan 2026 12:36:36 +0100 Subject: [PATCH 10/24] Minor grammar and typo fixes --- src/demean/accelerator.rs | 8 ++++---- src/demean/demeaner.rs | 6 +++--- src/demean/mod.rs | 2 +- src/demean/projection.rs | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index 4ca5aca50..9ed3bd03f 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -204,7 +204,7 @@ impl IronsTuckGrand { /// Perform the core Irons-Tuck acceleration step. /// - /// Returns `Converged` if convergence detected, `NotConverged` to continue. + /// Returns `Converged` if convergence is detected, `NotConverged` to continue.
#[inline] fn acceleration_step_check( &mut self, @@ -335,16 +335,16 @@ impl IronsTuckGrand { /// /// # How it works /// - /// Every `iter_grand_acc` iterations, this function is called to advance a + /// Once every `iter_grand_acc` iterations, this function is called to advance a /// 3-phase state machine: /// /// 1. **Collect1st**: Store current `gx` as the first snapshot (`y`) /// 2. **Collect2nd**: Store current `gx` as the second snapshot (`gy`) - /// 3. **Collect3rdAndAccelerate**: Store current `gx` as third snapshot (`ggy`), + /// 3. **Collect3rdAndAccelerate**: Store current `gx` as the third snapshot (`ggy`), /// then apply Irons-Tuck to (y, gy, ggy) to extrapolate toward the fixed point /// /// After phase 3, the cycle repeats. This means actual acceleration happens - /// every `3 × iter_grand_acc` iterations. + /// once every `3 × iter_grand_acc` iterations. #[inline] fn grand_acceleration_step( &mut self, diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs index 90bca66ab..410cfe357 100644 --- a/src/demean/demeaner.rs +++ b/src/demean/demeaner.rs @@ -180,7 +180,7 @@ impl MultiFEBuffers { } } - /// Reset all buffers to zero for a new solve. + /// Reset all buffers to zero for a new call to solve. #[inline] fn reset(&mut self) { self.mu.fill(0.0); @@ -188,7 +188,7 @@ } } -/// Demeaner for 3+ fixed effects: multi-phase strategy. +/// Demeaner for 3+ fixed effects: multiphase strategy. /// /// Owns working buffers that are reused across multiple `solve()` calls. /// @@ -311,7 +311,7 @@ impl<'a> MultiFEDemeaner<'a> { } } - /// Compute final output and return result. + /// Compute the final output and return the result.
fn finalize_output( &self, input: &[f64], diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 4106a7ca0..b8a43f1b7 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -113,7 +113,7 @@ pub(crate) fn demean( // Body closure: called for each column, reusing thread-local state |demeaner, (k, mut col)| { let col_view = x.column(k); - // Zero-copy if column is contiguous (F-order), otherwise copy + // Zero-copy if the column is contiguous (F-order), otherwise copy let result = if let Some(slice) = col_view.as_slice() { demeaner.solve(slice) } else { diff --git a/src/demean/projection.rs b/src/demean/projection.rs index 9ad985635..ab4c84dbe 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -95,7 +95,7 @@ impl<'a> TwoFEProjector<'a> { } } - /// Compute beta coefficients from alpha, storing result in scratch buffer. + /// Compute beta coefficients from alpha, storing the result in the scratch buffer. /// /// For each group g1 in FE1: /// beta[g1] = (in_out[g1] - Σ alpha[g0] * w) / group_weight[g1] @@ -176,7 +176,7 @@ impl Projector for TwoFEProjector<'_> { coef_out[n0..n0 + n1].copy_from_slice(&self.scratch[..n1]); } - /// Compute sum of squared residuals for the given coefficients. + /// Compute the sum of squared residuals for the given coefficients. /// /// # Side Effects /// From 9cc4f5947c180c4e56ab2c013c387129cb96c525 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Mon, 5 Jan 2026 14:20:35 +0100 Subject: [PATCH 11/24] Reuse coefficient sum buffers to reduce allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add coef_sums_buffer to SingleFEDemeaner, TwoFEDemeaner, and MultiFEBuffers - Change apply_design_matrix_t to write to caller-provided buffer - Remove unnecessary in_out_2fe.to_vec() copy in MultiFEDemeaner - Rename in_out to coef_sums/coef_sums_buffer for clarity This eliminates per-column allocations: 1 for 2FE, 4 for 3+FE cases. 
Benchmarks show 4-12% improvement for medium-sized datasets (100K obs). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/demean/accelerator.rs | 5 +-- src/demean/demeaner.rs | 66 +++++++++++++++++++++++++-------------- src/demean/projection.rs | 28 +++++++++-------- src/demean/types.rs | 20 +++++++----- 4 files changed, 73 insertions(+), 46 deletions(-) diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index 9ed3bd03f..beb9556c6 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -436,10 +436,11 @@ mod tests { let n1 = ctx.index.n_groups[1]; let n_coef = n0 + n1; - let in_out = ctx.apply_design_matrix_t(&input); + let mut coef_sums = vec![0.0; n_coef]; + ctx.apply_design_matrix_t(&input, &mut coef_sums); let mut coef = vec![0.0; n_coef]; let mut accelerator = IronsTuckGrand::new(config, n_coef); - let mut projector = TwoFEProjector::new(&ctx, &in_out, &input); + let mut projector = TwoFEProjector::new(&ctx, &coef_sums, &input); let (iter, convergence) = accelerator.run(&mut projector, &mut coef, maxiter); diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs index 410cfe357..b1a8d17f6 100644 --- a/src/demean/demeaner.rs +++ b/src/demean/demeaner.rs @@ -44,16 +44,21 @@ pub trait Demeaner { /// Demeaner for 1 fixed effect: O(n) closed-form solution. /// -/// No iteration or buffers needed - direct computation. +/// Owns a reusable buffer for the coefficient-space sums. pub struct SingleFEDemeaner<'a> { ctx: &'a DemeanContext, + /// Weighted sums per group (Dᵀ · input), reused across solves. + coef_sums_buffer: Vec, } impl<'a> SingleFEDemeaner<'a> { /// Create a new single-FE demeaner. 
#[inline] pub fn new(ctx: &'a DemeanContext) -> Self { - Self { ctx } + Self { + ctx, + coef_sums_buffer: vec![0.0; ctx.index.n_coef], + } } } @@ -61,16 +66,16 @@ impl Demeaner for SingleFEDemeaner<'_> { fn solve(&mut self, input: &[f64]) -> DemeanResult { let n_obs = self.ctx.index.n_obs; - // Apply Dᵀ to get coefficient-space sums - let in_out = self.ctx.apply_design_matrix_t(input); + // Apply Dᵀ to get coefficient-space sums (reuses buffer) + self.ctx.apply_design_matrix_t(input, &mut self.coef_sums_buffer); let fe0 = self.ctx.index.group_ids_for_fe(0); let group_weights = self.ctx.group_weights_for_fe(0); // output[i] = input[i] - group_mean[fe0[i]] - // where group_mean[g] = in_out[g] / group_weights[g] + // where group_mean[g] = coef_sums_buffer[g] / group_weights[g] let demeaned: Vec = (0..n_obs) - .map(|i| input[i] - in_out[fe0[i]] / group_weights[fe0[i]]) + .map(|i| input[i] - self.coef_sums_buffer[fe0[i]] / group_weights[fe0[i]]) .collect(); // Single FE is a closed-form solution, always converges in 0 iterations @@ -92,7 +97,9 @@ impl Demeaner for SingleFEDemeaner<'_> { pub struct TwoFEDemeaner<'a> { ctx: &'a DemeanContext, config: &'a FixestConfig, - /// Coefficient array [alpha | beta], reused across calls to solve + /// Weighted sums per group (Dᵀ · input), reused across solves. + coef_sums_buffer: Vec, + /// Coefficient array [alpha | beta], reused across calls to solve. 
coef: Vec, /// Accelerator with internal buffers, reused across solves accelerator: IronsTuckGrand, @@ -109,6 +116,7 @@ impl<'a> TwoFEDemeaner<'a> { Self { ctx, config, + coef_sums_buffer: vec![0.0; n_coef], coef: vec![0.0; n_coef], accelerator: IronsTuckGrand::new(*config, n_coef), } @@ -120,14 +128,14 @@ impl Demeaner for TwoFEDemeaner<'_> { let n_obs = self.ctx.index.n_obs; let n0 = self.ctx.index.n_groups[0]; - // Apply Dᵀ to get coefficient-space sums - let in_out = self.ctx.apply_design_matrix_t(input); + // Apply Dᵀ to get coefficient-space sums (reuses buffer) + self.ctx.apply_design_matrix_t(input, &mut self.coef_sums_buffer); // Reset coefficient array for this call to solve self.coef.fill(0.0); - // Create the projector (lightweight, references in_out and input) - let mut projector = TwoFEProjector::new(self.ctx, &in_out, input); + // Create the projector (lightweight, references coef_sums_buffer and input) + let mut projector = TwoFEProjector::new(self.ctx, &self.coef_sums_buffer, input); // Run acceleration loop let (iter, convergence) = self @@ -165,8 +173,10 @@ struct MultiFEBuffers { coef: Vec, /// Coefficient array for 2-FE sub-convergence (coefficient-space, first 2 FEs only) coef_2fe: Vec, - /// Effective input after subtracting mu (observation-space) + /// Effective input after subtracting mu (observation-space). effective_input: Vec, + /// Weighted sums per group (Dᵀ · input), reused across phases. + coef_sums_buffer: Vec, } impl MultiFEBuffers { @@ -177,6 +187,7 @@ impl MultiFEBuffers { coef: vec![0.0; n_coef], coef_2fe: vec![0.0; n_coef_2fe], effective_input: vec![0.0; n_obs], + coef_sums_buffer: vec![0.0; n_coef], } } @@ -229,14 +240,16 @@ impl<'a> MultiFEDemeaner<'a> { /// Phase 1: Warmup with all FEs to get initial estimates. 
fn warmup_phase(&mut self, input: &[f64]) -> (usize, ConvergenceState) { - let in_out = self.ctx.apply_design_matrix_t(input); - let mut projector = MultiFEProjector::new(self.ctx, &in_out, input); + self.ctx + .apply_design_matrix_t(input, &mut self.buffers.coef_sums_buffer); + let mut projector = MultiFEProjector::new(self.ctx, &self.buffers.coef_sums_buffer, input); let (iter, convergence) = self .multi_acc .run(&mut projector, &mut self.buffers.coef, self.config.iter_warmup); - self.ctx.apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); + self.ctx + .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) } @@ -252,14 +265,17 @@ impl<'a> MultiFEDemeaner<'a> { self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; } - // Apply Dᵀ to residuals (only need first 2 FEs) - let in_out_full = self.ctx.apply_design_matrix_t(&self.buffers.effective_input); - let in_out_2fe: Vec = in_out_full[..n_coef_2fe].to_vec(); + // Apply Dᵀ to residuals (reuses buffer, only first 2 FEs used below) + self.ctx + .apply_design_matrix_t(&self.buffers.effective_input, &mut self.buffers.coef_sums_buffer); - // Run 2-FE acceleration + // Run 2-FE acceleration (use slice of coef_sums_buffer, no copy needed) self.buffers.coef_2fe.fill(0.0); - let mut projector = - TwoFEProjector::new(self.ctx, &in_out_2fe, &self.buffers.effective_input); + let mut projector = TwoFEProjector::new( + self.ctx, + &self.buffers.coef_sums_buffer[..n_coef_2fe], + &self.buffers.effective_input, + ); let (iter, convergence) = self.two_acc.run( &mut projector, &mut self.buffers.coef_2fe, @@ -287,15 +303,17 @@ impl<'a> MultiFEDemeaner<'a> { self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; } - let in_out = self.ctx.apply_design_matrix_t(&self.buffers.effective_input); + self.ctx + .apply_design_matrix_t(&self.buffers.effective_input, &mut self.buffers.coef_sums_buffer); self.buffers.coef.fill(0.0); - let mut projector = MultiFEProjector::new(self.ctx, 
&in_out, input); + let mut projector = MultiFEProjector::new(self.ctx, &self.buffers.coef_sums_buffer, input); let (iter, convergence) = self.multi_acc .run(&mut projector, &mut self.buffers.coef, remaining); - self.ctx.apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); + self.ctx + .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) } diff --git a/src/demean/projection.rs b/src/demean/projection.rs index ab4c84dbe..6ccaa4d90 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -77,7 +77,8 @@ pub trait Projector { /// where alpha are the coefficients for FE 0 and beta for FE 1. pub struct TwoFEProjector<'a> { ctx: &'a DemeanContext, - in_out: &'a [f64], + /// Weighted sums per group (Dᵀ · input). + coef_sums: &'a [f64], input: &'a [f64], scratch: Vec, } @@ -85,11 +86,11 @@ pub struct TwoFEProjector<'a> { impl<'a> TwoFEProjector<'a> { /// Create a new 2-FE projector. #[inline] - pub fn new(ctx: &'a DemeanContext, in_out: &'a [f64], input: &'a [f64]) -> Self { + pub fn new(ctx: &'a DemeanContext, coef_sums: &'a [f64], input: &'a [f64]) -> Self { let n1 = ctx.index.n_groups[1]; Self { ctx, - in_out, + coef_sums, input, scratch: vec![0.0; n1], } @@ -98,7 +99,7 @@ impl<'a> TwoFEProjector<'a> { /// Compute beta coefficients from alpha, storing the result in the scratch buffer. 
/// /// For each group g1 in FE1: - /// beta[g1] = (in_out[g1] - Σ alpha[g0] * w) / group_weight[g1] + /// beta[g1] = (coef_sums[g1] - Σ alpha[g0] * w) / group_weight[g1] #[inline(always)] fn compute_beta_from_alpha(&mut self, alpha: &[f64]) { let n0 = self.ctx.index.n_groups[0]; @@ -107,7 +108,7 @@ impl<'a> TwoFEProjector<'a> { let fe1 = self.ctx.index.group_ids_for_fe(1); let sw1 = self.ctx.group_weights_for_fe(1); - self.scratch[..n1].copy_from_slice(&self.in_out[n0..n0 + n1]); + self.scratch[..n1].copy_from_slice(&self.coef_sums[n0..n0 + n1]); if self.ctx.weights.is_uniform { for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { @@ -128,7 +129,7 @@ impl<'a> TwoFEProjector<'a> { /// Compute alpha coefficients from beta (stored in scratch), writing to alpha_out. /// /// For each group g0 in FE0: - /// alpha[g0] = (in_out[g0] - Σ beta[g1] * w) / group_weight[g0] + /// alpha[g0] = (coef_sums[g0] - Σ beta[g1] * w) / group_weight[g0] #[inline(always)] fn compute_alpha_from_beta(&self, alpha_out: &mut [f64]) { let n0 = self.ctx.index.n_groups[0]; @@ -136,7 +137,7 @@ impl<'a> TwoFEProjector<'a> { let fe1 = self.ctx.index.group_ids_for_fe(1); let sw0 = self.ctx.group_weights_for_fe(0); - alpha_out[..n0].copy_from_slice(&self.in_out[..n0]); + alpha_out[..n0].copy_from_slice(&self.coef_sums[..n0]); if self.ctx.weights.is_uniform { for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { @@ -221,7 +222,8 @@ impl Projector for TwoFEProjector<'_> { /// matching fixest's algorithm. pub struct MultiFEProjector<'a> { ctx: &'a DemeanContext, - in_out: &'a [f64], + /// Weighted sums per group (Dᵀ · input). + coef_sums: &'a [f64], input: &'a [f64], scratch: Vec, } @@ -229,11 +231,11 @@ pub struct MultiFEProjector<'a> { impl<'a> MultiFEProjector<'a> { /// Create a new multi-FE projector. 
#[inline] - pub fn new(ctx: &'a DemeanContext, in_out: &'a [f64], input: &'a [f64]) -> Self { + pub fn new(ctx: &'a DemeanContext, coef_sums: &'a [f64], input: &'a [f64]) -> Self { let n_obs = ctx.index.n_obs; Self { ctx, - in_out, + coef_sums, input, scratch: vec![0.0; n_obs], } @@ -255,7 +257,7 @@ impl<'a> MultiFEProjector<'a> { /// Update coefficients for a single FE given the accumulated other-FE sums. /// /// For each group g in FE q: - /// coef_out[g] = (in_out[g] - Σ scratch[i] * w) / group_weight[g] + /// coef_out[g] = (coef_sums[g] - Σ scratch[i] * w) / group_weight[g] #[inline(always)] fn update_fe_coefficients(&self, fe_idx: usize, coef_out: &mut [f64]) { let start = self.ctx.index.coef_start[fe_idx]; @@ -263,9 +265,9 @@ impl<'a> MultiFEProjector<'a> { let fe = self.ctx.index.group_ids_for_fe(fe_idx); let group_weights = self.ctx.group_weights_for_fe(fe_idx); - // Initialize from in_out + // Initialize from coef_sums coef_out[start..start + n_groups] - .copy_from_slice(&self.in_out[start..start + n_groups]); + .copy_from_slice(&self.coef_sums[start..start + n_groups]); // Subtract accumulated other-FE contributions if self.ctx.weights.is_uniform { diff --git a/src/demean/types.rs b/src/demean/types.rs index bfeb03c62..12f05a743 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -384,25 +384,31 @@ impl DemeanContext { /// Apply transpose of design matrix: Dᵀ · values. /// - /// Computes weighted sums of `values` for each group in each FE. - /// Returns a vector of length `n_coef` with the aggregated sums. + /// Computes weighted sums of `values` for each group in each FE, + /// writing the result to `out`. The buffer is zeroed before accumulation. 
#[inline] - pub fn apply_design_matrix_t(&self, values: &[f64]) -> Vec { - let mut result = vec![0.0; self.index.n_coef]; + pub fn apply_design_matrix_t(&self, values: &[f64], out: &mut [f64]) { + debug_assert_eq!( + out.len(), + self.index.n_coef, + "output buffer length ({}) must match n_coef ({})", + out.len(), + self.index.n_coef + ); + out.fill(0.0); for q in 0..self.index.n_fe { let offset = self.index.coef_start[q]; let fe_ids = self.index.group_ids_for_fe(q); if self.weights.is_uniform { for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += values[i]; + out[offset + g] += values[i]; } } else { for (i, &g) in fe_ids.iter().enumerate() { - result[offset + g] += values[i] * self.weights.per_obs[i]; + out[offset + g] += values[i] * self.weights.per_obs[i]; } } } - result } /// Apply design matrix and add to output: output += D · coef. From 903ae07d119c33ab8437d020b83c38e066d65b4a Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Tue, 6 Jan 2026 00:33:12 +0100 Subject: [PATCH 12/24] Add manual loop unrolling for gather operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unroll the accumulate_fe_contributions loop 4x to enable better instruction-level parallelism. This produces paired loads (ldp) and reduces loop overhead, providing ~7% speedup on large 3FE demeaning workloads. Also refactor compute_ssr to reuse the optimized accumulate method. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/demean/projection.rs | 45 ++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/src/demean/projection.rs b/src/demean/projection.rs index 6ccaa4d90..a2a8efbd1 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -248,9 +248,35 @@ impl<'a> MultiFEProjector<'a> { fn accumulate_fe_contributions(&mut self, fe_idx: usize, coef: &[f64]) { let start = self.ctx.index.coef_start[fe_idx]; let fe = self.ctx.index.group_ids_for_fe(fe_idx); + let n = self.scratch.len().min(fe.len()); - for (sum, &g) in self.scratch.iter_mut().zip(fe.iter()) { - *sum += coef[start + g]; + // Manual 4x unrolling for better instruction-level parallelism. + unsafe { + let scratch_ptr = self.scratch.as_mut_ptr(); + let fe_ptr = fe.as_ptr(); + let coef_ptr = coef.as_ptr().add(start); + + let chunks = n / 4; + let mut i = 0; + + for _ in 0..chunks { + let g0 = *fe_ptr.add(i); + let g1 = *fe_ptr.add(i + 1); + let g2 = *fe_ptr.add(i + 2); + let g3 = *fe_ptr.add(i + 3); + + *scratch_ptr.add(i) += *coef_ptr.add(g0); + *scratch_ptr.add(i + 1) += *coef_ptr.add(g1); + *scratch_ptr.add(i + 2) += *coef_ptr.add(g2); + *scratch_ptr.add(i + 3) += *coef_ptr.add(g3); + + i += 4; + } + + // Handle remainder + for j in i..n { + *scratch_ptr.add(j) += *coef_ptr.add(*fe_ptr.add(j)); + } } } @@ -333,22 +359,11 @@ impl Projector for MultiFEProjector<'_> { fn compute_ssr(&mut self, coef: &[f64]) -> f64 { let n_fe = self.ctx.index.n_fe; - // Compute SSR: Σ (input[i] - Σ_q coef[fe_q[i]])² - // - // We iterate over FEs in the outer loop and observations in the inner loop. - // This improves cache locality because: - // 1. group_ids_for_fe(q) returns a contiguous slice for FE q - // 2. We access the scratch buffer sequentially - // 3. 
The coefficient array (typically small) stays in the cache - // Accumulate coefficient sums per observation using the scratch buffer + // (reuses the optimized unrolled gather loop) self.scratch.fill(0.0); for q in 0..n_fe { - let offset = self.ctx.index.coef_start[q]; - let fe_ids = self.ctx.index.group_ids_for_fe(q); - for (sum, &g) in self.scratch.iter_mut().zip(fe_ids.iter()) { - *sum += coef[offset + g]; - } + self.accumulate_fe_contributions(q, coef); } // Compute SSR from residuals From 28eaf8362f86bcc281637f9f190ab6384b817286 Mon Sep 17 00:00:00 2001 From: Alexander Fischer Date: Tue, 6 Jan 2026 23:15:32 +0100 Subject: [PATCH 13/24] documentation clarifications in types.rs --- src/demean/types.rs | 65 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 9 deletions(-) diff --git a/src/demean/types.rs b/src/demean/types.rs index 12f05a743..58703ca88 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -48,22 +48,47 @@ use std::ops::Range; /// /// # Memory Layout /// -/// Group IDs are stored in column-major order for cache efficiency during iteration: +/// Two key arrays with different purposes and sizes: +/// +/// ## 1. Group IDs Array (`group_ids`) +/// +/// Maps each observation to its group index for each fixed effect. +/// - **Size**: `N × Q` (observations × fixed effects) +/// - **Layout**: Column-major (all FE0 IDs first, then all FE1 IDs, etc.) +/// /// ```text -/// group_ids = [fe0_obs0, fe0_obs1, ..., fe0_obsN, fe1_obs0, fe1_obs1, ..., fe1_obsN, ...] -/// |-------- FE 0 ----------| |-------- FE 1 ----------| +/// group_ids = [fe0_obs0, fe0_obs1, ..., fe0_obsN, fe1_obs0, fe1_obs1, ..., fe1_obsN, ...] +/// |-------- N entries ---------| |-------- N entries ---------| /// ``` /// -/// Access pattern: `group_ids[fe_index * n_obs + obs_index]` +/// Access: `group_ids[fe_index * n_obs + obs_index]` /// -/// # Example +/// ## 2. 
Coefficient Array (`coef`) +/// +/// Stores the actual FE coefficient values being solved for. +/// - **Size**: `n_coef` = sum of all group counts +/// - **Layout**: FE0 coefficients first, then FE1, etc. +/// - **Indexing**: `coef_start[q]` gives the offset for FE q /// /// ```text -/// 1000 observations, 2 fixed effects (individual, year): -/// - n_groups = [100, 10] // 100 individuals, 10 years -/// - coef_start = [0, 100] // individuals at 0..100, years at 100..110 -/// - n_coef = 110 // total coefficients +/// coef = [α₀, α₁, ..., α_{n0-1}, γ₀, γ₁, ..., γ_{n1-1}, ...] +/// |---- n_groups[0] ----| |---- n_groups[1] ----| +/// coef_start[0]=0 coef_start[1]=n0 /// ``` +/// +/// ## Example: 1000 obs, 100 individuals, 10 years +/// +/// | Array | Size | Contents | +/// |------------|-------|-------------------------------------| +/// | group_ids | 2000 | Which individual/year each obs is | +/// | coef | 110 | The 100 α + 10 γ coefficient values| +/// +/// To get coefficient for observation i in FE q: +/// ```rust +/// let group = group_ids[q * n_obs + i]; +/// let coef_value = coef[coef_start[q] + group]; +/// ``` + pub struct FixedEffectsIndex { /// Number of observations (N). pub n_obs: usize, @@ -386,6 +411,24 @@ impl DemeanContext { /// /// Computes weighted sums of `values` for each group in each FE, /// writing the result to `out`. The buffer is zeroed before accumulation. + /// + /// # Example + /// + /// With 4 observations, 2 firms (FE0), 2 years (FE1): + /// + /// ```text + /// values = [10, 20, 30, 40] (e.g., y values) + /// firm = [ 0, 0, 1, 1] (obs 0,1 → firm 0; obs 2,3 → firm 1) + /// year = [ 0, 1, 0, 1] (obs 0,2 → year 0; obs 1,3 → year 1) + /// + /// out = [S₀[0], S₀[1], S₁[0], S₁[1]] + /// = [10+20, 30+40, 10+30, 20+40] + /// = [ 30, 70, 40, 60 ] + /// ├─ FE0 ─┤ ├─ FE1 ─┤ + /// ``` + /// + /// Used to precompute per-group sums of y (coefficient sums S) + /// and per-group sums of weights (group weights W). 
     #[inline]
     pub fn apply_design_matrix_t(&self, values: &[f64], out: &mut [f64]) {
         debug_assert_eq!(
             out.len(),
             self.index.n_coef,
             "output buffer length ({}) must match n_coef ({})",
             out.len(),
             self.index.n_coef
         );
@@ -415,6 +458,10 @@ impl DemeanContext {
     ///
     /// For each observation, looks up its coefficient for each FE and adds to output.
     /// Computes: `output[i] += Σ_q coef[offset_q + fe_q[i]]`
+    ///
+    /// Used for: final residuals (r = y - D·coef), periodic SSR convergence checks,
+    /// and 3+ FE projector scratch computation (every iteration). The 2-FE projector
+    /// avoids calling this in its inner loop by working entirely in coefficient space.
     #[inline]
     pub fn apply_design_matrix(&self, coef: &[f64], output: &mut [f64]) {
         for q in 0..self.index.n_fe {

From 1610a7026af8af7415583f02cf14bce63d05e02a Mon Sep 17 00:00:00 2001
From: Alexander Fischer
Date: Wed, 7 Jan 2026 23:20:30 +0100
Subject: [PATCH 14/24] document ssq = 0 convergence reason

---
 src/demean/accelerator.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs
index beb9556c6..41ec13379 100644
--- a/src/demean/accelerator.rs
+++ b/src/demean/accelerator.rs
@@ -313,6 +313,8 @@ impl IronsTuckGrand {
             })
             .fold((0.0, 0.0), |(vp, sq), (dvp, dsq)| (vp + dvp, sq + dsq));

+        // ssq = Σ(δ²x)² where δ²x = ggx - 2·gx + x.
+        // At fixed point x*, all three equal x*, so δ²x = 0.
         if ssq == 0.0 {
             return ConvergenceState::Converged;
         }

From 06ef560a8ee2908e1cc870dd89db378d69ee8d16 Mon Sep 17 00:00:00 2001
From: Alexander Fischer
Date: Wed, 7 Jan 2026 23:22:47 +0100
Subject: [PATCH 15/24] Rename coef to omega in Irons-Tuck accelerate for
 clarity

---
 src/demean/accelerator.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs
index 41ec13379..ebc32ef79 100644
--- a/src/demean/accelerator.rs
+++ b/src/demean/accelerator.rs
@@ -319,12 +319,12 @@ impl IronsTuckGrand {
             return ConvergenceState::Converged;
         }

-        let coef = vprod / ssq;
+        let omega = vprod / ssq;
         x.iter_mut()
             .zip(gx.iter())
             .zip(ggx.iter())
             .for_each(|((x_i, &gx_i), &ggx_i)| {
-                *x_i = ggx_i - coef * (ggx_i - gx_i);
+                *x_i = ggx_i - omega * (ggx_i - gx_i);
             });

         ConvergenceState::NotConverged

From de60290acfefca494938a9a249c390be5ccdb91b Mon Sep 17 00:00:00 2001
From: Alexander Fischer
Date: Wed, 7 Jan 2026 23:44:01 +0100
Subject: [PATCH 16/24] DemeanResult struct does not contain coefficients
 (though it would be nice to have)

---
 src/demean/demeaner.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs
index b1a8d17f6..3f1ec63df 100644
--- a/src/demean/demeaner.rs
+++ b/src/demean/demeaner.rs
@@ -34,7 +34,6 @@ pub trait Demeaner {
     /// - `demeaned`: The input with fixed effects removed
     /// - `success`: Whether the algorithm converged
     /// - `iterations`: Number of iterations (0 for closed-form solutions)
-    /// - `coefficients`: FE coefficients (`None` for 3+ FE case)
     fn solve(&mut self, input: &[f64]) -> DemeanResult;
 }

From 35ba573fd2c6e4f262ddd89ec4cd3465e2409b3b Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 9 Jan 2026 15:37:49 +0100
Subject: [PATCH 17/24] Always reorder FEs by size (remove reorder_fe config
 option)

Fixed effects are now always sorted by number of groups (largest
first), matching fixest's default behavior.
This simplifies the API and ensures optimal convergence properties. Changes: - Remove `reorder_fe` field from `FixestConfig` - Remove `with_reorder` method from `FixedEffectsIndex` - Remove `with_config` method from `DemeanContext` - Simplify `FixedEffectsIndex::new()` to always reorder Co-Authored-By: Claude Opus 4.5 --- src/demean/mod.rs | 4 +-- src/demean/types.rs | 61 +++++++-------------------------------------- 2 files changed, 11 insertions(+), 54 deletions(-) diff --git a/src/demean/mod.rs b/src/demean/mod.rs index b8a43f1b7..569bb8777 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -101,8 +101,8 @@ pub(crate) fn demean( let not_converged = Arc::new(AtomicUsize::new(0)); let mut res = Array2::::zeros((n_samples, n_features)); - // Use reorder_fe from config (default true, matching fixest) - let ctx = DemeanContext::with_config(flist, weights, config.reorder_fe); + // FEs are automatically reordered by size (largest first) for optimal convergence + let ctx = DemeanContext::new(flist, weights); res.axis_iter_mut(ndarray::Axis(1)) .into_par_iter() diff --git a/src/demean/types.rs b/src/demean/types.rs index 58703ca88..3bdd92ced 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -131,28 +131,7 @@ impl FixedEffectsIndex { /// # Panics /// /// Panics in debug builds if `n_obs == 0` or `n_fe == 0`. - #[allow(dead_code)] pub fn new(flist: &ArrayView2) -> Self { - Self::with_reorder(flist, false) - } - - /// Create a fixed effects index, optionally reordering FEs by size. - /// - /// When `reorder_fe` is true, fixed effects are sorted by number of groups - /// (largest first). This matches R's fixest behavior and improves convergence - /// for 3+ FE cases by making the 2-FE sub-convergence phase work on the - /// largest FEs first. - /// - /// # Arguments - /// - /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)`. - /// * `reorder_fe` - Whether to reorder FEs by size (largest first). 
- /// - /// # Returns - /// - /// A `FixedEffectsIndex` with `original_order` tracking the mapping from - /// current indices to original indices. - pub fn with_reorder(flist: &ArrayView2, reorder_fe: bool) -> Self { let (n_obs, n_fe) = flist.dim(); debug_assert!(n_obs > 0, "Cannot create FixedEffectsIndex with 0 observations"); @@ -163,8 +142,11 @@ impl FixedEffectsIndex { .map(|j| flist.column(j).iter().max().unwrap_or(&0) + 1) .collect(); - // Determine the order: either sorted by size or identity - let order: Vec = if reorder_fe && n_fe > 1 { + // Sort FEs by size (largest first) for optimal convergence. + // This matches fixest's default behavior and allows excluding the largest + // FE from convergence checking (since FE 0 will be at the start of the + // coefficient array, we can efficiently check just the suffix). + let order: Vec = if n_fe > 1 { let mut indices: Vec = (0..n_fe).collect(); indices.sort_by_key(|&i| std::cmp::Reverse(n_groups_original[i])); indices @@ -355,35 +337,18 @@ pub struct DemeanContext { impl DemeanContext { /// Create a demeaning context from input arrays. /// - /// # Arguments - /// - /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` - /// * `weights` - Per-observation weights (length: `n_obs`) - /// - /// # Panics - /// - /// Panics in debug builds if `weights.len() != flist.nrows()`. - #[allow(dead_code)] - pub fn new(flist: &ArrayView2, weights: &ArrayView1) -> Self { - Self::with_config(flist, weights, false) - } - - /// Create a demeaning context with configuration options. + /// Fixed effects are automatically reordered by size (largest first) for + /// optimal convergence. This matches fixest's default behavior. 
/// /// # Arguments /// /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` /// * `weights` - Per-observation weights (length: `n_obs`) - /// * `reorder_fe` - Whether to reorder FEs by size (largest first) /// /// # Panics /// /// Panics in debug builds if `weights.len() != flist.nrows()`. - pub fn with_config( - flist: &ArrayView2, - weights: &ArrayView1, - reorder_fe: bool, - ) -> Self { + pub fn new(flist: &ArrayView2, weights: &ArrayView1) -> Self { debug_assert_eq!( weights.len(), flist.nrows(), @@ -392,7 +357,7 @@ impl DemeanContext { flist.nrows() ); - let index = FixedEffectsIndex::with_reorder(flist, reorder_fe); + let index = FixedEffectsIndex::new(flist); let weights = ObservationWeights::new(weights, &index); Self { index, weights } } @@ -503,12 +468,6 @@ pub struct FixestConfig { /// Iterations between SSR-based convergence checks. pub ssr_check_interval: usize, - - /// Whether to reorder fixed effects by size (largest first). - /// This matches fixest's default behavior and improves convergence - /// for 3+ FE cases by making the 2-FE sub-convergence phase work - /// on the largest FEs first. - pub reorder_fe: bool, } impl Default for FixestConfig { @@ -527,8 +486,6 @@ impl Default for FixestConfig { iter_grand_acc: 4, // SSR convergence check frequency ssr_check_interval: 40, - // Reorder FEs by size (matches fixest's fixef.reorder = TRUE default) - reorder_fe: true, } } } From c277282ad734023f7e1b9865d7c00c44e327ae36 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 9 Jan 2026 15:38:03 +0100 Subject: [PATCH 18/24] Change convergence_len to convergence_range for generic Projector Replace `convergence_len() -> usize` with `convergence_range() -> Range` in the Projector trait. This makes the accelerator fully generic over any Projector implementation, not just FE-specific ones that check a prefix. The accelerator extracts (start, end) from the range to avoid cloning overhead. 
Following fixest's approach, FE projectors exclude the last FE (smallest after reordering) from convergence checking. At a fixed point, if (n_fe - 1) FEs have converged, the remaining one must also have converged. Co-Authored-By: Claude Opus 4.5 --- src/demean/accelerator.rs | 35 +++++++++++++++++------------------ src/demean/projection.rs | 28 ++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index ebc32ef79..498d6d0af 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -212,16 +212,17 @@ impl IronsTuckGrand { coef: &mut [f64], iter: usize, ) -> ConvergenceState { - let conv_len = projector.convergence_len(); + let conv_range = projector.convergence_range(); + let (cs, ce) = (conv_range.start, conv_range.end); // Double projection for Irons-Tuck: G(G(x)) projector.project(&self.buffers.gx, &mut self.buffers.ggx); // Irons-Tuck acceleration if Self::accelerate( - &mut coef[..conv_len], - &self.buffers.gx[..conv_len], - &self.buffers.ggx[..conv_len], + &mut coef[cs..ce], + &self.buffers.gx[cs..ce], + &self.buffers.ggx[cs..ce], ) == ConvergenceState::Converged { return ConvergenceState::Converged; @@ -229,7 +230,7 @@ impl IronsTuckGrand { // Post-acceleration projection (after warmup) if iter >= self.config.iter_proj_after_acc { - self.buffers.temp[..conv_len].copy_from_slice(&coef[..conv_len]); + self.buffers.temp[cs..ce].copy_from_slice(&coef[cs..ce]); projector.project(&self.buffers.temp, coef); } @@ -280,12 +281,9 @@ impl IronsTuckGrand { coef: &[f64], ) -> ConvergenceState { projector.project(coef, &mut self.buffers.gx); - let conv_len = projector.convergence_len(); - if Self::should_continue( - &coef[..conv_len], - &self.buffers.gx[..conv_len], - self.config.tol, - ) { + let conv_range = projector.convergence_range(); + let (cs, ce) = (conv_range.start, conv_range.end); + if Self::should_continue(&coef[cs..ce], &self.buffers.gx[cs..ce], 
self.config.tol) { ConvergenceState::NotConverged } else { ConvergenceState::Converged @@ -353,22 +351,23 @@ impl IronsTuckGrand { projector: &mut P, phase: GrandPhase, ) -> GrandStepResult { - let conv_len = projector.convergence_len(); + let conv_range = projector.convergence_range(); + let (cs, ce) = (conv_range.start, conv_range.end); match phase { GrandPhase::Collect1st => { - self.buffers.y[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + self.buffers.y[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); GrandStepResult::Continue(GrandPhase::Collect2nd) } GrandPhase::Collect2nd => { - self.buffers.gy[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + self.buffers.gy[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); GrandStepResult::Continue(GrandPhase::Collect3rdAndAccelerate) } GrandPhase::Collect3rdAndAccelerate => { - self.buffers.ggy[..conv_len].copy_from_slice(&self.buffers.gx[..conv_len]); + self.buffers.ggy[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); let convergence = Self::accelerate( - &mut self.buffers.y[..conv_len], - &self.buffers.gy[..conv_len], - &self.buffers.ggy[..conv_len], + &mut self.buffers.y[cs..ce], + &self.buffers.gy[cs..ce], + &self.buffers.ggy[cs..ce], ); if convergence == ConvergenceState::Converged { return GrandStepResult::Done(ConvergenceState::Converged); diff --git a/src/demean/projection.rs b/src/demean/projection.rs index a2a8efbd1..fefc6f5b1 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -23,6 +23,7 @@ //! which handles the iteration strategy. use crate::demean::types::DemeanContext; +use std::ops::Range; // ============================================================================= // Projector Trait @@ -55,11 +56,20 @@ pub trait Projector { /// Compute the sum of squared residuals for the given coefficients. fn compute_ssr(&mut self, coef: &[f64]) -> f64; - /// Length of the coefficient slice to use for convergence checking. 
+    /// Range of coefficients to use for convergence checking.
     ///
-    /// This may be smaller than `coef_len()` when not all coefficients
-    /// need to be checked (e.g., for 2-FE only alpha is checked).
-    fn convergence_len(&self) -> usize;
+    ///
+    /// # Why not all coefficients?
+    ///
+    /// At a fixed point, if any (n_fe - 1) fixed effects have converged,
+    /// the remaining one must also have converged (its inputs are stable,
+    /// so its output is stable). This allows us to skip checking one FE.
+    ///
+    /// # Which FE to exclude?
+    ///
+    /// Following fixest's approach, we exclude the **last FE** (smallest after
+    /// reordering). In the reverse sweep, this FE is processed first using
+    /// stale data from the previous iteration. Returns `0..n_coef - n_groups[n_fe-1]`.
+    fn convergence_range(&self) -> Range<usize>;
 }

 // =============================================================================
@@ -207,8 +217,9 @@ impl Projector for TwoFEProjector<'_> {
     }

     #[inline(always)]
-    fn convergence_len(&self) -> usize {
-        self.ctx.index.n_groups[0]
+    fn convergence_range(&self) -> Range<usize> {
+        // Exclude FE 1 (last/smallest), check only FE 0
+        0..self.ctx.index.n_groups[0]
     }
 }

@@ -378,7 +389,8 @@ impl Projector for MultiFEProjector<'_> {
     }

     #[inline(always)]
-    fn convergence_len(&self) -> usize {
-        self.ctx.index.n_coef - self.ctx.index.n_groups[self.ctx.index.n_fe - 1]
+    fn convergence_range(&self) -> Range<usize> {
+        // Exclude last FE (smallest), check FEs 0 through n_fe-2
+        0..self.ctx.index.n_coef - self.ctx.index.n_groups[self.ctx.index.n_fe - 1]
     }
 }

From 7a5089ba3f3575f16b406e95b2ec4c1dba2aa24d Mon Sep 17 00:00:00 2001
From: Kristof Schroeder
Date: Fri, 9 Jan 2026 16:19:17 +0100
Subject: [PATCH 19/24] Add FE coefficient tracking with original order
 restoration

- Add original_to_reordered mapping to FixedEffectsIndex for tracking
  how FEs are reordered internally (by size for optimal convergence)
- Add fe_coefficients field to DemeanResult
- Add reorder_coefficients_to_original() method to
restore coefficients to the user's original FE order - Add total_coef buffer to MultiFEBuffers for accumulating coefficients across all demeaning phases (warmup, two_fe_convergence, reacceleration) - Update all demeaners to populate and return FE coefficients Co-Authored-By: Claude Opus 4.5 --- pyfixest/core/demean.py | 9 +- src/demean/demeaner.rs | 37 +++- src/demean/mod.rs | 396 +++++++++++++++++++++++++++++++++++++--- src/demean/types.rs | 76 ++++++++ 4 files changed, 485 insertions(+), 33 deletions(-) diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 8af8c8bbe..4ce3982d0 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -1,8 +1,13 @@ +from typing import Any + import numpy as np from numpy.typing import NDArray from ._core_impl import _demean_rs +# Type alias for the dict returned by _demean_rs +DemeanResultDict = dict[str, Any] + def demean( x: NDArray[np.float64], @@ -70,10 +75,12 @@ def demean( print(pf.feols(fml, data).coef()) ``` """ - return _demean_rs( + # _demean_rs now returns a dict with demeaned, fe_coefficients, success + result: DemeanResultDict = _demean_rs( # type: ignore[assignment] x.astype(np.float64, copy=False), flist.astype(np.uint64, copy=False), weights.astype(np.float64, copy=False), tol, maxiter, ) + return result["demeaned"], result["success"] diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs index 3f1ec63df..6ce36b4fa 100644 --- a/src/demean/demeaner.rs +++ b/src/demean/demeaner.rs @@ -64,6 +64,7 @@ impl<'a> SingleFEDemeaner<'a> { impl Demeaner for SingleFEDemeaner<'_> { fn solve(&mut self, input: &[f64]) -> DemeanResult { let n_obs = self.ctx.index.n_obs; + let n_coef = self.ctx.index.n_coef; // Apply Dᵀ to get coefficient-space sums (reuses buffer) self.ctx.apply_design_matrix_t(input, &mut self.coef_sums_buffer); @@ -71,15 +72,21 @@ impl Demeaner for SingleFEDemeaner<'_> { let fe0 = self.ctx.index.group_ids_for_fe(0); let group_weights = self.ctx.group_weights_for_fe(0); - // 
output[i] = input[i] - group_mean[fe0[i]] - // where group_mean[g] = coef_sums_buffer[g] / group_weights[g] + // Compute FE coefficients: coef[g] = sum[g] / weight[g] + let fe_coefficients: Vec = (0..n_coef) + .map(|g| self.coef_sums_buffer[g] / group_weights[g]) + .collect(); + + // output[i] = input[i] - coef[fe0[i]] let demeaned: Vec = (0..n_obs) - .map(|i| input[i] - self.coef_sums_buffer[fe0[i]] / group_weights[fe0[i]]) + .map(|i| input[i] - fe_coefficients[fe0[i]]) .collect(); // Single FE is a closed-form solution, always converges in 0 iterations + // No reordering needed for 1 FE DemeanResult { demeaned, + fe_coefficients, convergence: ConvergenceState::Converged, iterations: 0, } @@ -149,8 +156,12 @@ impl Demeaner for TwoFEDemeaner<'_> { .map(|i| input[i] - self.coef[fe0[i]] - self.coef[n0 + fe1[i]]) .collect(); + // Reorder coefficients back to original FE order + let fe_coefficients = self.ctx.index.reorder_coefficients_to_original(&self.coef); + DemeanResult { demeaned, + fe_coefficients, convergence, iterations: iter, } @@ -168,6 +179,8 @@ impl Demeaner for TwoFEDemeaner<'_> { struct MultiFEBuffers { /// Accumulated fixed effects per observation (observation-space) mu: Vec, + /// Accumulated coefficients across all phases (coefficient-space) + total_coef: Vec, /// Working coefficient array for accelerator (reset each phase) coef: Vec, /// Coefficient array for 2-FE sub-convergence (coefficient-space, first 2 FEs only) @@ -183,6 +196,7 @@ impl MultiFEBuffers { fn new(n_obs: usize, n_coef: usize, n_coef_2fe: usize) -> Self { Self { mu: vec![0.0; n_obs], + total_coef: vec![0.0; n_coef], coef: vec![0.0; n_coef], coef_2fe: vec![0.0; n_coef_2fe], effective_input: vec![0.0; n_obs], @@ -194,6 +208,7 @@ impl MultiFEBuffers { #[inline] fn reset(&mut self) { self.mu.fill(0.0); + self.total_coef.fill(0.0); self.coef.fill(0.0); } } @@ -247,6 +262,10 @@ impl<'a> MultiFEDemeaner<'a> { .multi_acc .run(&mut projector, &mut self.buffers.coef, self.config.iter_warmup); 
+ // Accumulate coefficients and apply to mu + for (tc, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { + *tc += c; + } self.ctx .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) @@ -281,6 +300,10 @@ impl<'a> MultiFEDemeaner<'a> { self.config.maxiter / 2, ); + // Accumulate 2-FE coefficients to total_coef (first 2 FEs only) + for (tc, &c) in self.buffers.total_coef[..n_coef_2fe].iter_mut().zip(self.buffers.coef_2fe.iter()) { + *tc += c; + } // Add 2-FE coefficients to mu self.add_2fe_coefficients_to_mu(); (iter, convergence) @@ -311,6 +334,10 @@ impl<'a> MultiFEDemeaner<'a> { self.multi_acc .run(&mut projector, &mut self.buffers.coef, remaining); + // Accumulate coefficients and apply to mu + for (tc, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { + *tc += c; + } self.ctx .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) @@ -341,8 +368,12 @@ impl<'a> MultiFEDemeaner<'a> { .map(|(&x, &mu)| x - mu) .collect(); + // Reorder coefficients back to original FE order + let fe_coefficients = self.ctx.index.reorder_coefficients_to_original(&self.buffers.total_coef); + DemeanResult { demeaned, + fe_coefficients, convergence, iterations: iter, } diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 569bb8777..2b96e2088 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -34,9 +34,10 @@ pub mod types; use demeaner::{Demeaner, MultiFEDemeaner, SingleFEDemeaner, TwoFEDemeaner}; use types::{ConvergenceState, DemeanContext, DemeanResult, FixestConfig}; -use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; +use ndarray::{Array2, ArrayView1, ArrayView2}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; +use pyo3::types::PyDict; use rayon::prelude::*; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -73,6 +74,13 @@ impl<'a> ThreadLocalDemeaner<'a> { } } +/// Result of batch demeaning operation. 
+pub(crate) struct DemeanBatchResult { + pub demeaned: Array2, + pub fe_coefficients: Array2, + pub success: bool, +} + /// Demean using accelerated coefficient-space iteration. /// /// Uses `for_each_init` to create one demeaner per thread, reusing buffers @@ -80,8 +88,9 @@ impl<'a> ThreadLocalDemeaner<'a> { /// /// # Returns /// -/// A tuple of (demeaned_data, success) where: -/// - `demeaned_data`: The demeaned data as an `Array2` +/// A `DemeanBatchResult` containing: +/// - `demeaned`: The demeaned data as an `Array2` +/// - `fe_coefficients`: FE coefficients as an `Array2` /// - `success`: True if all columns converged pub(crate) fn demean( x: &ArrayView2, @@ -89,7 +98,7 @@ pub(crate) fn demean( weights: &ArrayView1, tol: f64, maxiter: usize, -) -> (Array2, bool) { +) -> DemeanBatchResult { let (n_samples, n_features) = x.dim(); let config = FixestConfig { @@ -99,65 +108,89 @@ pub(crate) fn demean( }; let not_converged = Arc::new(AtomicUsize::new(0)); - let mut res = Array2::::zeros((n_samples, n_features)); + let mut demeaned = Array2::::zeros((n_samples, n_features)); // FEs are automatically reordered by size (largest first) for optimal convergence let ctx = DemeanContext::new(flist, weights); + let n_coef = ctx.index.n_coef; + + let mut fe_coefficients = Array2::::zeros((n_coef, n_features)); - res.axis_iter_mut(ndarray::Axis(1)) + // Process columns in parallel, collecting both demeaned values and FE coefficients + let results: Vec<(usize, DemeanResult)> = demeaned + .axis_iter_mut(ndarray::Axis(1)) .into_par_iter() .enumerate() - .for_each_init( - // Init closure: called once per thread to create the thread-local state + .map_init( || ThreadLocalDemeaner::new(&ctx, &config), - // Body closure: called for each column, reusing thread-local state - |demeaner, (k, mut col)| { + |demeaner, (k, _)| { let col_view = x.column(k); - // Zero-copy if the column is contiguous (F-order), otherwise copy let result = if let Some(slice) = col_view.as_slice() { 
demeaner.solve(slice) } else { let xk: Vec = col_view.to_vec(); demeaner.solve(&xk) }; + (k, result) + }, + ) + .collect(); - if result.convergence == ConvergenceState::NotConverged { - not_converged.fetch_add(1, Ordering::SeqCst); - } + // Copy results back (sequential, but fast) + for (k, result) in results { + if result.convergence == ConvergenceState::NotConverged { + not_converged.fetch_add(1, Ordering::SeqCst); + } - Zip::from(&mut col) - .and(&result.demeaned) - .for_each(|col_elm, &val| { - *col_elm = val; - }); - }, - ); + // Copy demeaned values + for (i, &val) in result.demeaned.iter().enumerate() { + demeaned[[i, k]] = val; + } + + // Copy FE coefficients + for (i, &val) in result.fe_coefficients.iter().enumerate() { + fe_coefficients[[i, k]] = val; + } + } let success = not_converged.load(Ordering::SeqCst) == 0; - (res, success) + DemeanBatchResult { + demeaned, + fe_coefficients, + success, + } } /// Python-exposed function for accelerated demeaning. /// -/// Returns a tuple of (demeaned_array, success). 
+/// Returns a dict with:
+/// - "demeaned": Array of demeaned values (n_samples, n_features)
+/// - "fe_coefficients": Array of FE coefficients (n_coef, n_features)
+/// - "success": Boolean indicating convergence
 #[pyfunction]
 #[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))]
-pub fn _demean_rs(
-    py: Python<'_>,
+pub fn _demean_rs<'py>(
+    py: Python<'py>,
     x: PyReadonlyArray2<f64>,
     flist: PyReadonlyArray2<usize>,
     weights: PyReadonlyArray1<f64>,
     tol: f64,
     maxiter: usize,
-) -> PyResult<(Py<PyArray2<f64>>, bool)> {
+) -> PyResult<Bound<'py, PyDict>> {
     let x_arr = x.as_array();
     let flist_arr = flist.as_array();
     let weights_arr = weights.as_array();

-    let (demeaned, success) = py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter));
+    let result = py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter));

-    let pyarray = PyArray2::from_owned_array(py, demeaned);
-    Ok((pyarray.into(), success))
+    let dict = PyDict::new(py);
+    dict.set_item("demeaned", PyArray2::from_owned_array(py, result.demeaned))?;
+    dict.set_item(
+        "fe_coefficients",
+        PyArray2::from_owned_array(py, result.fe_coefficients),
+    )?;
+    dict.set_item("success", result.success)?;
+    Ok(dict)
 }

 #[cfg(test)]
@@ -415,4 +448,309 @@ mod tests {
             "Different inputs should produce different results"
         );
     }
+
+    // =========================================================================
+    // FE Coefficient Tests
+    // =========================================================================
+
+    /// Helper: compute residuals by applying FE coefficients to observations.
+    /// Returns input[i] - sum_q(coef[fe_q[i]]) for each observation.
+ fn apply_coefficients( + input: &[f64], + flist: &Array2, + fe_coefficients: &[f64], + n_groups: &[usize], + ) -> Vec { + let n_obs = input.len(); + let n_fe = flist.ncols(); + + // Compute coefficient offsets for each FE + let mut coef_offsets = vec![0usize; n_fe]; + for q in 1..n_fe { + coef_offsets[q] = coef_offsets[q - 1] + n_groups[q - 1]; + } + + (0..n_obs) + .map(|i| { + let mut fe_sum = 0.0; + for q in 0..n_fe { + let g = flist[[i, q]]; + fe_sum += fe_coefficients[coef_offsets[q] + g]; + } + input[i] - fe_sum + }) + .collect() + } + + #[test] + fn test_single_fe_coefficients() { + let n_obs = 100; + let n_groups = 10; + + let mut flist = Array2::::zeros((n_obs, 1)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let mut demeaner = SingleFEDemeaner::new(&ctx); + let result = demeaner.solve(&input); + + // Verify coefficients are correct: applying them should give same residuals + let reconstructed = apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups]); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-10, + "Obs {}: demeaned ({}) != reconstructed ({})", + i, + demeaned, + reconstructed + ); + } + + // Verify coefficient count + assert_eq!( + result.fe_coefficients.len(), + n_groups, + "Should have {} coefficients", + n_groups + ); + } + + #[test] + fn test_two_fe_coefficients_correct() { + let n_obs = 100; + let n_groups_0 = 10; + let n_groups_1 = 5; + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups_0; + flist[[i, 1]] = i % n_groups_1; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 
0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let result = demeaner.solve(&input); + + // Verify coefficients are correct: applying them should give same residuals + let reconstructed = + apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-8, + "Obs {}: demeaned ({}) != reconstructed ({})", + i, + demeaned, + reconstructed + ); + } + + // Verify coefficient count + assert_eq!( + result.fe_coefficients.len(), + n_groups_0 + n_groups_1, + "Should have {} coefficients", + n_groups_0 + n_groups_1 + ); + } + + #[test] + fn test_two_fe_coefficients_ordering() { + // Test that coefficients are returned in ORIGINAL FE order, not reordered + let n_obs = 100; + + // FE 0: 5 groups (smaller), FE 1: 20 groups (larger) + // Internally, FEs get reordered by size (largest first), so FE 1 becomes internal FE 0 + // But the coefficients should be returned in original order: [FE0 coeffs | FE1 coeffs] + let n_groups_0 = 5; // smaller + let n_groups_1 = 20; // larger + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups_0; + flist[[i, 1]] = i % n_groups_1; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let result = demeaner.solve(&input); + + // Verify coefficient count matches original ordering + assert_eq!( + result.fe_coefficients.len(), + n_groups_0 + n_groups_1, + "Should have {} coefficients", + n_groups_0 + n_groups_1 + ); + + // Verify coefficients are in original order by reconstructing residuals + // using the ORIGINAL flist (not reordered) 
+ let reconstructed = + apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-8, + "Obs {}: demeaned ({}) != reconstructed ({}) - coefficients may be in wrong order", + i, + demeaned, + reconstructed + ); + } + } + + #[test] + fn test_three_fe_coefficients_correct() { + let n_obs = 120; + let n_groups_0 = 10; + let n_groups_1 = 6; + let n_groups_2 = 4; + + let mut flist = Array2::::zeros((n_obs, 3)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups_0; + flist[[i, 1]] = i % n_groups_1; + flist[[i, 2]] = i % n_groups_2; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = MultiFEDemeaner::new(&ctx, &config); + let result = demeaner.solve(&input); + + // Verify coefficients are correct + let reconstructed = apply_coefficients( + &input, + &flist, + &result.fe_coefficients, + &[n_groups_0, n_groups_1, n_groups_2], + ); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-6, + "Obs {}: demeaned ({}) != reconstructed ({})", + i, + demeaned, + reconstructed + ); + } + + // Verify coefficient count + assert_eq!( + result.fe_coefficients.len(), + n_groups_0 + n_groups_1 + n_groups_2, + ); + } + + #[test] + fn test_three_fe_coefficients_ordering() { + // Test that 3-FE coefficients are returned in original order + let n_obs = 120; + + // Create FEs with different sizes to trigger reordering + // Original: FE0=3 groups (smallest), FE1=15 groups (largest), FE2=8 groups (middle) + // Reordered internally: FE1, FE2, FE0 + let n_groups_0 = 3; // smallest + let n_groups_1 = 15; // largest + let 
n_groups_2 = 8; // middle + + let mut flist = Array2::::zeros((n_obs, 3)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups_0; + flist[[i, 1]] = i % n_groups_1; + flist[[i, 2]] = i % n_groups_2; + } + + let weights = Array1::::ones(n_obs); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = MultiFEDemeaner::new(&ctx, &config); + let result = demeaner.solve(&input); + + // Verify coefficients work with ORIGINAL flist ordering + let reconstructed = apply_coefficients( + &input, + &flist, + &result.fe_coefficients, + &[n_groups_0, n_groups_1, n_groups_2], + ); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-6, + "Obs {}: demeaned ({}) != reconstructed ({}) - coefficients may be in wrong order", + i, + demeaned, + reconstructed + ); + } + } + + #[test] + fn test_weighted_coefficients() { + let n_obs = 100; + let n_groups_0 = 10; + let n_groups_1 = 5; + + let mut flist = Array2::::zeros((n_obs, 2)); + for i in 0..n_obs { + flist[[i, 0]] = i % n_groups_0; + flist[[i, 1]] = i % n_groups_1; + } + + // Non-uniform weights + let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); + let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); + let result = demeaner.solve(&input); + + // Verify coefficients are correct with weighted reconstruction + let reconstructed = + apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); + + for (i, (&demeaned, &reconstructed)) in + result.demeaned.iter().zip(reconstructed.iter()).enumerate() + { + assert!( + (demeaned - reconstructed).abs() < 1e-8, + "Weighted obs {}: 
demeaned ({}) != reconstructed ({})", + i, + demeaned, + reconstructed + ); + } + } } diff --git a/src/demean/types.rs b/src/demean/types.rs index 3bdd92ced..f98c5745c 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -111,6 +111,12 @@ pub struct FixedEffectsIndex { /// Total number of coefficients (sum of `n_groups`). pub n_coef: usize, + + /// Mapping from original FE index to reordered position. + /// + /// `original_to_reordered[original_q]` gives the position of original + /// FE `original_q` in the reordered (sorted by size) layout. + original_to_reordered: Vec, } impl FixedEffectsIndex { @@ -173,6 +179,13 @@ impl FixedEffectsIndex { } } + // Compute inverse mapping: original_to_reordered[original_q] = reordered_q + // order[reordered_q] = original_q, so we invert this + let mut original_to_reordered = vec![0usize; n_fe]; + for (reordered_q, &original_q) in order.iter().enumerate() { + original_to_reordered[original_q] = reordered_q; + } + Self { n_obs, n_fe, @@ -180,6 +193,7 @@ impl FixedEffectsIndex { n_groups, coef_start, n_coef, + original_to_reordered, } } @@ -214,6 +228,56 @@ impl FixedEffectsIndex { }; start..end } + + /// Reorder coefficients from internal (sorted by FE size) to original FE order. + /// + /// During solving, FEs are reordered by size (largest first) for optimal + /// convergence. This method restores coefficients to the original FE order + /// as they appeared in the input. + /// + /// # Arguments + /// + /// * `coef` - Coefficient array in internal (reordered) layout + /// + /// # Returns + /// + /// Coefficient array in original FE order. + /// + /// # Layout + /// + /// Input layout (reordered, largest FE first): + /// ```text + /// [FE_reord_0 | FE_reord_1 | ... | FE_reord_{n_fe-1}] + /// ``` + /// + /// Output layout (original order): + /// ```text + /// [FE_orig_0 | FE_orig_1 | ... 
| FE_orig_{n_fe-1}] + /// ``` + pub fn reorder_coefficients_to_original(&self, coef: &[f64]) -> Vec { + debug_assert_eq!( + coef.len(), + self.n_coef, + "coefficient length ({}) must match n_coef ({})", + coef.len(), + self.n_coef + ); + + let mut out = vec![0.0; self.n_coef]; + let mut out_pos = 0; + + // For each FE in original order + for original_q in 0..self.n_fe { + let reordered_q = self.original_to_reordered[original_q]; + let src_start = self.coef_start[reordered_q]; + let len = self.n_groups[reordered_q]; + + out[out_pos..out_pos + len].copy_from_slice(&coef[src_start..src_start + len]); + out_pos += len; + } + + out + } } // ============================================================================= @@ -516,6 +580,18 @@ pub struct DemeanResult { /// Demeaned data (single column, length `n_obs`). pub demeaned: Vec, + /// Fixed effect coefficients in original FE order. + /// + /// The coefficients are laid out as: + /// ```text + /// [FE_0 coefficients | FE_1 coefficients | ... | FE_{n_fe-1} coefficients] + /// ``` + /// where FE indices follow the original input order (before internal reordering). + /// + /// For FE `q`, coefficients are at indices `coef_start_original[q]..coef_start_original[q+1]` + /// where `coef_start_original` is the cumulative sum of `n_groups_original`. + pub fe_coefficients: Vec, + /// Convergence state. 
pub convergence: ConvergenceState, From 940ffaf53d68e8b4014f6bf14c0a25d4ea4c3626 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 9 Jan 2026 16:21:13 +0100 Subject: [PATCH 20/24] Add Python tests for FE coefficient extraction Test cases: - Single FE coefficient correctness - Two FE coefficient correctness - Three FE coefficient correctness (random order) - Coefficient ordering preservation (verifies coefficients match original FE order, not internal reordered order) - Weighted demeaning with coefficient extraction Co-Authored-By: Claude Opus 4.5 --- pyfixest/core/_core_impl.pyi | 11 +- pyfixest/core/demean.py | 8 +- tests/test_demean.py | 203 +++++++++++++++++++++++++++++++++++ 3 files changed, 214 insertions(+), 8 deletions(-) diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index 6bb849ec5..680921e29 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -1,6 +1,15 @@ +from typing import TypedDict + import numpy as np from numpy.typing import NDArray +class DemeanResult(TypedDict): + """Result from the Rust demeaning function.""" + + demeaned: NDArray[np.float64] + fe_coefficients: NDArray[np.float64] + success: bool + def _find_collinear_variables_rs(x: NDArray[np.float64], tol: float = 1e-10): ... def _crv1_meat_loop_rs( scores: NDArray[np.float64], @@ -13,7 +22,7 @@ def _demean_rs( weights: NDArray[np.float64], tol: float = 1e-08, maxiter: int = 100_000, -) -> tuple[np.ndarray, bool]: ... +) -> DemeanResult: ... 
def _count_fixef_fully_nested_all_rs( all_fixef_array: NDArray, cluster_colnames: NDArray, diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 4ce3982d0..616cfda8f 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -1,13 +1,8 @@ -from typing import Any - import numpy as np from numpy.typing import NDArray from ._core_impl import _demean_rs -# Type alias for the dict returned by _demean_rs -DemeanResultDict = dict[str, Any] - def demean( x: NDArray[np.float64], @@ -75,8 +70,7 @@ def demean( print(pf.feols(fml, data).coef()) ``` """ - # _demean_rs now returns a dict with demeaned, fe_coefficients, success - result: DemeanResultDict = _demean_rs( # type: ignore[assignment] + result = _demean_rs( x.astype(np.float64, copy=False), flist.astype(np.uint64, copy=False), weights.astype(np.float64, copy=False), diff --git a/tests/test_demean.py b/tests/test_demean.py index 5f20a60ed..9a5c65b82 100644 --- a/tests/test_demean.py +++ b/tests/test_demean.py @@ -3,6 +3,7 @@ import pyhdfe import pytest +from pyfixest.core._core_impl import _demean_rs from pyfixest.core.demean import demean as demean_rs from pyfixest.estimation.cupy.demean_cupy_ import demean_cupy32, demean_cupy64 from pyfixest.estimation.demean_ import _set_demeaner_backend, demean, demean_model @@ -517,3 +518,205 @@ def generate_complex_fixed_effects_data(): flist = np.column_stack([id_indiv, id_firm, id_year]).astype(np.uint64) weights = rng.uniform(0.5, 2.0, n) return X, flist, weights + + +# ============================================================================= +# FE Coefficient Tests +# ============================================================================= + + +def _apply_fe_coefficients(x, flist, fe_coefficients, n_groups): + """ + Apply FE coefficients to reconstruct residuals. + + Returns x[i] - sum_q(coef[fe_q[i]]) for each observation. 
+ """ + n_obs, n_features = x.shape + n_fe = flist.shape[1] + + # Compute coefficient offsets for each FE + coef_offsets = np.zeros(n_fe, dtype=int) + for q in range(1, n_fe): + coef_offsets[q] = coef_offsets[q - 1] + n_groups[q - 1] + + reconstructed = np.zeros_like(x) + for k in range(n_features): + for i in range(n_obs): + fe_sum = 0.0 + for q in range(n_fe): + g = int(flist[i, q]) + fe_sum += fe_coefficients[coef_offsets[q] + g, k] + reconstructed[i, k] = x[i, k] - fe_sum + + return reconstructed + + +def test_fe_coefficients_single_fe(): + """Test FE coefficients are correct for single FE.""" + n_obs = 100 + n_groups = 10 + + rng = np.random.default_rng(42) + x = rng.normal(0, 1, (n_obs, 2)) + flist = (np.arange(n_obs) % n_groups).reshape(-1, 1).astype(np.uint64) + weights = np.ones(n_obs) + + result = _demean_rs(x, flist, weights) + + assert result["success"], "Should converge" + assert result["fe_coefficients"].shape == (n_groups, 2), "Wrong coefficient shape" + + # Verify coefficients: applying them should give same residuals as demeaned + reconstructed = _apply_fe_coefficients( + x, flist, result["fe_coefficients"], [n_groups] + ) + + np.testing.assert_allclose( + result["demeaned"], + reconstructed, + rtol=1e-10, + atol=1e-10, + err_msg="FE coefficients don't reconstruct demeaned values", + ) + + +def test_fe_coefficients_two_fe(): + """Test FE coefficients are correct for two FEs.""" + n_obs = 100 + n_groups_0 = 10 + n_groups_1 = 5 + + rng = np.random.default_rng(42) + x = rng.normal(0, 1, (n_obs, 3)) + flist = np.column_stack( + [np.arange(n_obs) % n_groups_0, np.arange(n_obs) % n_groups_1] + ).astype(np.uint64) + weights = np.ones(n_obs) + + result = _demean_rs(x, flist, weights) + + assert result["success"], "Should converge" + assert result["fe_coefficients"].shape == (n_groups_0 + n_groups_1, 3) + + # Verify coefficients reconstruct demeaned values + reconstructed = _apply_fe_coefficients( + x, flist, result["fe_coefficients"], [n_groups_0, 
n_groups_1] + ) + + np.testing.assert_allclose( + result["demeaned"], + reconstructed, + rtol=1e-8, + atol=1e-8, + err_msg="FE coefficients don't reconstruct demeaned values", + ) + + +def test_fe_coefficients_ordering(): + """Test that FE coefficients are in original FE order, not reordered.""" + n_obs = 100 + + # FE 0: 5 groups (smaller), FE 1: 20 groups (larger) + # Internally, FEs get reordered by size (largest first) + # But coefficients should be returned in original order + n_groups_0 = 5 # smaller + n_groups_1 = 20 # larger + + rng = np.random.default_rng(42) + x = rng.normal(0, 1, (n_obs, 2)) + flist = np.column_stack( + [np.arange(n_obs) % n_groups_0, np.arange(n_obs) % n_groups_1] + ).astype(np.uint64) + weights = np.ones(n_obs) + + result = _demean_rs(x, flist, weights) + + # Verify coefficient shape matches original order + assert result["fe_coefficients"].shape == (n_groups_0 + n_groups_1, 2) + + # Verify coefficients work with original flist ordering + reconstructed = _apply_fe_coefficients( + x, flist, result["fe_coefficients"], [n_groups_0, n_groups_1] + ) + + np.testing.assert_allclose( + result["demeaned"], + reconstructed, + rtol=1e-8, + atol=1e-8, + err_msg="Coefficients may be in wrong order", + ) + + +def test_fe_coefficients_three_fe(): + """Test FE coefficients are correct for three FEs.""" + n_obs = 120 + + # Create FEs with different sizes to trigger reordering + # Original: FE0=3 groups (smallest), FE1=15 groups (largest), FE2=8 groups (middle) + n_groups_0 = 3 + n_groups_1 = 15 + n_groups_2 = 8 + + rng = np.random.default_rng(42) + x = rng.normal(0, 1, (n_obs, 2)) + flist = np.column_stack( + [ + np.arange(n_obs) % n_groups_0, + np.arange(n_obs) % n_groups_1, + np.arange(n_obs) % n_groups_2, + ] + ).astype(np.uint64) + weights = np.ones(n_obs) + + result = _demean_rs(x, flist, weights) + + assert result["success"], "Should converge" + assert result["fe_coefficients"].shape == ( + n_groups_0 + n_groups_1 + n_groups_2, + 2, + ) + + # 
Verify coefficients reconstruct demeaned values + reconstructed = _apply_fe_coefficients( + x, flist, result["fe_coefficients"], [n_groups_0, n_groups_1, n_groups_2] + ) + + np.testing.assert_allclose( + result["demeaned"], + reconstructed, + rtol=1e-6, + atol=1e-6, + err_msg="FE coefficients don't reconstruct demeaned values", + ) + + +def test_fe_coefficients_weighted(): + """Test FE coefficients are correct with non-uniform weights.""" + n_obs = 100 + n_groups_0 = 10 + n_groups_1 = 5 + + rng = np.random.default_rng(42) + x = rng.normal(0, 1, (n_obs, 2)) + flist = np.column_stack( + [np.arange(n_obs) % n_groups_0, np.arange(n_obs) % n_groups_1] + ).astype(np.uint64) + weights = rng.uniform(0.5, 2.0, n_obs) + + result = _demean_rs(x, flist, weights) + + assert result["success"], "Should converge" + + # Verify coefficients reconstruct demeaned values + reconstructed = _apply_fe_coefficients( + x, flist, result["fe_coefficients"], [n_groups_0, n_groups_1] + ) + + np.testing.assert_allclose( + result["demeaned"], + reconstructed, + rtol=1e-8, + atol=1e-8, + err_msg="Weighted FE coefficients don't reconstruct demeaned values", + ) From 593664c3b71a1f84727c4644040bd043d6b85f14 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 9 Jan 2026 16:58:30 +0100 Subject: [PATCH 21/24] Make weights optional in demean with fast unweighted path Rust changes: - DemeanContext now has weights: Option - When None, uses group_counts for denominators (no per-obs multiplication) - _demean_rs binding takes weights=None by default Python changes: - demean() wrapper detects uniform weights (all equal) via np.allclose - Passes None to Rust when weights are uniform, enabling fast path - Public API unchanged (weights parameter still required) This saves memory (no per-obs weight storage) and computation (no weight multiplication in scatter operations) for unweighted regression. 
Co-Authored-By: Claude Opus 4.5 --- pyfixest/core/_core_impl.pyi | 2 +- pyfixest/core/demean.py | 6 +- src/demean/accelerator.rs | 2 +- src/demean/mod.rs | 60 ++++++++------- src/demean/projection.rs | 36 ++++----- src/demean/types.rs | 138 ++++++++++++++--------------------- 6 files changed, 105 insertions(+), 139 deletions(-) diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index 680921e29..fe42826ef 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -19,7 +19,7 @@ def _crv1_meat_loop_rs( def _demean_rs( x: NDArray[np.float64], flist: NDArray[np.uint64], - weights: NDArray[np.float64], + weights: NDArray[np.float64] | None = None, tol: float = 1e-08, maxiter: int = 100_000, ) -> DemeanResult: ... diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 616cfda8f..19cfa2998 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -70,10 +70,14 @@ def demean( print(pf.feols(fml, data).coef()) ``` """ + # Check if weights are uniform (all equal) - use fast unweighted path + weights_f64 = weights.astype(np.float64, copy=False) + is_uniform = np.allclose(weights_f64, weights_f64.flat[0], atol=1e-10, rtol=0) + result = _demean_rs( x.astype(np.float64, copy=False), flist.astype(np.uint64, copy=False), - weights.astype(np.float64, copy=False), + None if is_uniform else weights_f64, tol, maxiter, ) diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index 498d6d0af..c2fc48393 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -422,7 +422,7 @@ mod tests { flist[[i, 1]] = i % 5; } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); (ctx, input) } diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 2b96e2088..7ef4031e2 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ 
-95,7 +95,7 @@ pub(crate) struct DemeanBatchResult { pub(crate) fn demean( x: &ArrayView2, flist: &ArrayView2, - weights: &ArrayView1, + weights: Option<&ArrayView1>, tol: f64, maxiter: usize, ) -> DemeanBatchResult { @@ -168,20 +168,20 @@ pub(crate) fn demean( /// - "fe_coefficients": Array of FE coefficients (n_coef, n_features) /// - "success": Boolean indicating convergence #[pyfunction] -#[pyo3(signature = (x, flist, weights, tol=1e-8, maxiter=100_000))] +#[pyo3(signature = (x, flist, weights=None, tol=1e-8, maxiter=100_000))] pub fn _demean_rs<'py>( py: Python<'py>, x: PyReadonlyArray2, flist: PyReadonlyArray2, - weights: PyReadonlyArray1, + weights: Option>, tol: f64, maxiter: usize, ) -> PyResult> { let x_arr = x.as_array(); let flist_arr = flist.as_array(); - let weights_arr = weights.as_array(); + let weights_arr = weights.as_ref().map(|w| w.as_array()); - let result = py.detach(|| demean(&x_arr, &flist_arr, &weights_arr, tol, maxiter)); + let result = py.detach(|| demean(&x_arr, &flist_arr, weights_arr.as_ref(), tol, maxiter)); let dict = PyDict::new(py); dict.set_item("demeaned", PyArray2::from_owned_array(py, result.demeaned))?; @@ -212,7 +212,7 @@ mod tests { let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -238,7 +238,7 @@ mod tests { let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -261,7 +261,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = 
(0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); @@ -301,11 +301,11 @@ mod tests { // Non-uniform weights: 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, ... let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); assert!( - !ctx.weights.is_uniform, - "Weights should be detected as non-uniform" + ctx.weights.is_some(), + "Weights should be Some when provided" ); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); @@ -332,7 +332,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -362,7 +362,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -386,21 +386,19 @@ mod tests { flist[[i, 1]] = i % 3; } - // Test uniform weights (all 1.0) - let uniform_weights = Array1::::ones(n_obs); - let ctx_uniform = DemeanContext::new(&flist.view(), &uniform_weights.view()); + // Test with no weights (None) - unweighted case + let ctx_unweighted = DemeanContext::new(&flist.view(), None); assert!( - ctx_uniform.weights.is_uniform, - "All-ones weights should be detected as uniform" + ctx_unweighted.weights.is_none(), + "No weights should result in weights=None" ); - // Test non-uniform weights - let mut non_uniform_weights = Array1::::ones(n_obs); - non_uniform_weights[0] = 2.0; - let ctx_non_uniform = DemeanContext::new(&flist.view(), &non_uniform_weights.view()); + // Test with weights (Some) - weighted case + let 
weights = Array1::::ones(n_obs); + let ctx_weighted = DemeanContext::new(&flist.view(), Some(&weights.view())); assert!( - !ctx_non_uniform.weights.is_uniform, - "Varying weights should be detected as non-uniform" + ctx_weighted.weights.is_some(), + "Provided weights should result in weights=Some" ); } @@ -417,7 +415,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let config = FixestConfig::default(); // Create a single demeaner and use it multiple times @@ -493,7 +491,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); @@ -536,7 +534,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -586,7 +584,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -634,7 +632,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -688,7 +686,7 @@ mod tests { } let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = 
DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -730,7 +728,7 @@ mod tests { // Non-uniform weights let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); - let ctx = DemeanContext::new(&flist.view(), &weights.view()); + let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); diff --git a/src/demean/projection.rs b/src/demean/projection.rs index fefc6f5b1..b02b3dfa0 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -120,14 +120,13 @@ impl<'a> TwoFEProjector<'a> { self.scratch[..n1].copy_from_slice(&self.coef_sums[n0..n0 + n1]); - if self.ctx.weights.is_uniform { - for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { - self.scratch[g1] -= alpha[g0]; + if let Some(w) = &self.ctx.weights { + for ((&g0, &g1), &wo) in fe0.iter().zip(fe1.iter()).zip(w.per_obs.iter()) { + self.scratch[g1] -= alpha[g0] * wo; } } else { - for ((&g0, &g1), &w) in fe0.iter().zip(fe1.iter()).zip(self.ctx.weights.per_obs.iter()) - { - self.scratch[g1] -= alpha[g0] * w; + for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { + self.scratch[g1] -= alpha[g0]; } } @@ -149,14 +148,13 @@ impl<'a> TwoFEProjector<'a> { alpha_out[..n0].copy_from_slice(&self.coef_sums[..n0]); - if self.ctx.weights.is_uniform { - for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { - alpha_out[g0] -= self.scratch[g1]; + if let Some(w) = &self.ctx.weights { + for ((&g0, &g1), &wo) in fe0.iter().zip(fe1.iter()).zip(w.per_obs.iter()) { + alpha_out[g0] -= self.scratch[g1] * wo; } } else { - for ((&g0, &g1), &w) in fe0.iter().zip(fe1.iter()).zip(self.ctx.weights.per_obs.iter()) - { - alpha_out[g0] -= self.scratch[g1] * w; + for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { + alpha_out[g0] -= self.scratch[g1]; } } @@ -307,17 +305,13 @@ impl<'a> MultiFEProjector<'a> { 
.copy_from_slice(&self.coef_sums[start..start + n_groups]); // Subtract accumulated other-FE contributions - if self.ctx.weights.is_uniform { - for (&g, &sum) in fe.iter().zip(self.scratch.iter()) { - coef_out[start + g] -= sum; + if let Some(w) = &self.ctx.weights { + for ((&g, &sum), &wo) in fe.iter().zip(self.scratch.iter()).zip(w.per_obs.iter()) { + coef_out[start + g] -= sum * wo; } } else { - for ((&g, &sum), &w) in fe - .iter() - .zip(self.scratch.iter()) - .zip(self.ctx.weights.per_obs.iter()) - { - coef_out[start + g] -= sum * w; + for (&g, &sum) in fe.iter().zip(self.scratch.iter()) { + coef_out[start + g] -= sum; } } diff --git a/src/demean/types.rs b/src/demean/types.rs index f98c5745c..5ea9615dc 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -286,51 +286,19 @@ impl FixedEffectsIndex { /// Observation weights and their aggregation to group level. /// -/// # Purpose -/// -/// In weighted least squares, observations have different weights (e.g., inverse -/// variance weights). To compute weighted group means, we need: -/// -/// 1. Per-observation weights for the numerator: `Σ(weight[i] * value[i])` -/// 2. Per-group weight sums for the denominator: `Σ(weight[i])` for each group -/// -/// # Uniform Weights Fast Path -/// -/// When all weights are 1.0 (unweighted regression), `is_uniform = true` enables -/// optimized code paths that skip multiplication by weights. +/// Only created when weights are non-uniform. For unweighted regression, +/// `DemeanContext.weights` is `None`. pub struct ObservationWeights { /// Weight for each observation (length: `n_obs`). - /// Used when scattering values to coefficient space. pub per_obs: Vec, /// Sum of observation weights for each group (length: `n_coef`). - /// Used as denominator when computing group means. - /// Layout matches coefficient space: `[fe0_group0, ..., fe0_groupK, fe1_group0, ...]`. pub per_group: Vec, - - /// True if all observation weights are 1.0 (enables the fast path). 
- pub is_uniform: bool, } impl ObservationWeights { /// Create observation weights from the input array. - /// - /// # Arguments - /// - /// * `weights` - Per-observation weights (length: `n_obs`) - /// * `index` - Fixed effects index (needed to aggregate weights to groups) - /// - /// # Computed Fields - /// - /// - `is_uniform`: True if all weights are 1.0 (within floating-point tolerance) - /// - `per_group`: Sum of observation weights for each group pub fn new(weights: &ArrayView1, index: &FixedEffectsIndex) -> Self { - // Tolerance for detecting uniform weights (all 1.0). - // Using 1e-10 to account for floating-point representation errors - // while being strict enough to intentionally catch non-uniform weights. - const UNIFORM_WEIGHT_TOL: f64 = 1e-10; - let is_uniform = weights.iter().all(|&w| (w - 1.0).abs() < UNIFORM_WEIGHT_TOL); - // Aggregate observation weights to group level let mut per_group = vec![0.0; index.n_coef]; for q in 0..index.n_fe { @@ -352,7 +320,6 @@ impl ObservationWeights { Self { per_obs: weights.to_vec(), per_group, - is_uniform, } } } @@ -363,39 +330,17 @@ impl ObservationWeights { /// Complete context for fixed effects demeaning operations. /// -/// # Purpose -/// -/// Combines the fixed effects index (which observation belongs to which groups) -/// with observation weights. Provides the core scatter/gather operations needed -/// by the iterative demeaning algorithm. -/// -/// # Operations -/// -/// The demeaning algorithm repeatedly: -/// -/// 1. **Scatter**: Aggregate residuals from observations to group coefficients -/// 2. **Gather**: Subtract group coefficients from observations -/// -/// These operations transform data between observation space (N values) and -/// coefficient space (`n_coef` values). 
-/// -/// # Example Usage -/// -/// ```ignore -/// let ctx = DemeanContext::new(&flist, &weights); -/// -/// // Apply Dᵀ to get coefficient-space sums -/// let coef_sums = ctx.apply_design_matrix_t(&input); -/// -/// // Compute group means: coef[g] = coef_sums[g] / group_weight[g] -/// // ... (done in solver) -/// ``` +/// Combines the fixed effects index with optional observation weights. +/// When `weights` is `None`, uses the fast unweighted path. pub struct DemeanContext { /// Fixed effects index (observation → group mapping). pub index: FixedEffectsIndex, - /// Observation weights and group-level aggregations. - pub weights: ObservationWeights, + /// Group counts (length: `n_coef`). Used as denominator for unweighted case. + pub group_counts: Vec, + + /// Observation weights. `None` for unweighted regression (fast path). + pub weights: Option, } impl DemeanContext { @@ -407,29 +352,54 @@ impl DemeanContext { /// # Arguments /// /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` - /// * `weights` - Per-observation weights (length: `n_obs`) - /// - /// # Panics - /// - /// Panics in debug builds if `weights.len() != flist.nrows()`. 
- pub fn new(flist: &ArrayView2, weights: &ArrayView1) -> Self { - debug_assert_eq!( - weights.len(), - flist.nrows(), - "weights length ({}) must match number of observations ({})", - weights.len(), - flist.nrows() - ); - + /// * `weights` - Per-observation weights, or `None` for unweighted regression + pub fn new(flist: &ArrayView2, weights: Option<&ArrayView1>) -> Self { let index = FixedEffectsIndex::new(flist); - let weights = ObservationWeights::new(weights, &index); - Self { index, weights } + + // Always compute group counts (needed for unweighted case) + let mut group_counts = vec![0.0; index.n_coef]; + for q in 0..index.n_fe { + let offset = index.coef_start[q]; + let fe_offset = q * index.n_obs; + for i in 0..index.n_obs { + let g = index.group_ids[fe_offset + i]; + group_counts[offset + g] += 1.0; + } + } + // Avoid division by zero for empty groups + for c in &mut group_counts { + if *c == 0.0 { + *c = 1.0; + } + } + + let weights = weights.map(|w| { + debug_assert_eq!( + w.len(), + flist.nrows(), + "weights length ({}) must match number of observations ({})", + w.len(), + flist.nrows() + ); + ObservationWeights::new(w, &index) + }); + + Self { + index, + group_counts, + weights, + } } /// Get the weight sums for all groups in fixed effect `fe`. + /// Returns group counts for unweighted, weighted sums for weighted. 
#[inline(always)] pub fn group_weights_for_fe(&self, fe: usize) -> &[f64] { - &self.weights.per_group[self.index.coef_range_for_fe(fe)] + let range = self.index.coef_range_for_fe(fe); + match &self.weights { + Some(w) => &w.per_group[range], + None => &self.group_counts[range], + } } // ========================================================================= @@ -471,13 +441,13 @@ impl DemeanContext { for q in 0..self.index.n_fe { let offset = self.index.coef_start[q]; let fe_ids = self.index.group_ids_for_fe(q); - if self.weights.is_uniform { + if let Some(w) = &self.weights { for (i, &g) in fe_ids.iter().enumerate() { - out[offset + g] += values[i]; + out[offset + g] += values[i] * w.per_obs[i]; } } else { for (i, &g) in fe_ids.iter().enumerate() { - out[offset + g] += values[i] * self.weights.per_obs[i]; + out[offset + g] += values[i]; } } } From 1e11f97ae9a3a005280823401a32a270a473fe0e Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Fri, 9 Jan 2026 00:47:19 +0100 Subject: [PATCH 22/24] Refactor Gauss-Seidel sweeper and cache FE slices --- Cargo.lock | 7 + Cargo.toml | 1 + src/demean/accelerator.rs | 51 ++-- src/demean/demeaner.rs | 99 +++--- src/demean/mod.rs | 567 +++++++++++++---------------------- src/demean/projection.rs | 454 +++++++++++++--------------- src/demean/sweep.rs | 357 ++++++++++++++++++++++ src/demean/types.rs | 617 ++++++++++++++++---------------------- tests/test_vs_fixest.py | 3 +- 9 files changed, 1115 insertions(+), 1041 deletions(-) create mode 100644 src/demean/sweep.rs diff --git a/Cargo.lock b/Cargo.lock index 37795e7da..a672f5abc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -176,6 +176,7 @@ dependencies = [ "numpy", "pyo3", "rayon", + "smallvec", "thiserror", ] @@ -287,6 +288,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "smallvec" +version = "1.15.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "syn" version = "2.0.108" diff --git a/Cargo.toml b/Cargo.toml index 81eeb3b5e..a6adeda12 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ ndarray = { version = "0.16.1", features = ["rayon"] } rayon = "1.11.0" numpy = "0.26.0" thiserror = "2.0.16" +smallvec = "1.13" [profile.release] opt-level = 3 # Maximize performance diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index c2fc48393..d535a357f 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -194,12 +194,14 @@ impl IronsTuckGrand { /// This method should be called after `run()` has completed to retrieve /// the final coefficients from the internal `gx` buffer. #[inline] - fn finalize_output(&self, coef: &mut [f64], - iter: usize, - convergence: ConvergenceState,) -> (usize, ConvergenceState) { + fn finalize_output( + &self, + coef: &mut [f64], + iter: usize, + convergence: ConvergenceState, + ) -> (usize, ConvergenceState) { coef.copy_from_slice(&self.buffers.gx); (iter, convergence) - } /// Perform the core Irons-Tuck acceleration step. 
@@ -212,17 +214,16 @@ impl IronsTuckGrand { coef: &mut [f64], iter: usize, ) -> ConvergenceState { - let conv_range = projector.convergence_range(); - let (cs, ce) = (conv_range.start, conv_range.end); + let std::ops::Range { start, end } = projector.convergence_range(); // Double projection for Irons-Tuck: G(G(x)) projector.project(&self.buffers.gx, &mut self.buffers.ggx); // Irons-Tuck acceleration if Self::accelerate( - &mut coef[cs..ce], - &self.buffers.gx[cs..ce], - &self.buffers.ggx[cs..ce], + &mut coef[start..end], + &self.buffers.gx[start..end], + &self.buffers.ggx[start..end], ) == ConvergenceState::Converged { return ConvergenceState::Converged; @@ -230,7 +231,7 @@ impl IronsTuckGrand { // Post-acceleration projection (after warmup) if iter >= self.config.iter_proj_after_acc { - self.buffers.temp[cs..ce].copy_from_slice(&coef[cs..ce]); + self.buffers.temp[start..end].copy_from_slice(&coef[start..end]); projector.project(&self.buffers.temp, coef); } @@ -281,9 +282,9 @@ impl IronsTuckGrand { coef: &[f64], ) -> ConvergenceState { projector.project(coef, &mut self.buffers.gx); - let conv_range = projector.convergence_range(); - let (cs, ce) = (conv_range.start, conv_range.end); - if Self::should_continue(&coef[cs..ce], &self.buffers.gx[cs..ce], self.config.tol) { + let std::ops::Range { start, end } = projector.convergence_range(); + if Self::should_continue(&coef[start..end], &self.buffers.gx[start..end], self.config.tol) + { ConvergenceState::NotConverged } else { ConvergenceState::Converged @@ -351,23 +352,22 @@ impl IronsTuckGrand { projector: &mut P, phase: GrandPhase, ) -> GrandStepResult { - let conv_range = projector.convergence_range(); - let (cs, ce) = (conv_range.start, conv_range.end); + let std::ops::Range { start, end } = projector.convergence_range(); match phase { GrandPhase::Collect1st => { - self.buffers.y[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); + self.buffers.y[start..end].copy_from_slice(&self.buffers.gx[start..end]); 
GrandStepResult::Continue(GrandPhase::Collect2nd) } GrandPhase::Collect2nd => { - self.buffers.gy[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); + self.buffers.gy[start..end].copy_from_slice(&self.buffers.gx[start..end]); GrandStepResult::Continue(GrandPhase::Collect3rdAndAccelerate) } GrandPhase::Collect3rdAndAccelerate => { - self.buffers.ggy[cs..ce].copy_from_slice(&self.buffers.gx[cs..ce]); + self.buffers.ggy[start..end].copy_from_slice(&self.buffers.gx[start..end]); let convergence = Self::accelerate( - &mut self.buffers.y[cs..ce], - &self.buffers.gy[cs..ce], - &self.buffers.ggy[cs..ce], + &mut self.buffers.y[start..end], + &self.buffers.gy[start..end], + &self.buffers.ggy[start..end], ); if convergence == ConvergenceState::Converged { return GrandStepResult::Done(ConvergenceState::Converged); @@ -411,7 +411,7 @@ mod tests { use super::*; use crate::demean::projection::TwoFEProjector; use crate::demean::types::DemeanContext; - use ndarray::{Array1, Array2}; + use ndarray::Array2; /// Create a test problem with 2 fixed effects fn create_test_problem(n_obs: usize) -> (DemeanContext, Vec) { @@ -421,8 +421,7 @@ mod tests { flist[[i, 0]] = i % 10; flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); (ctx, input) } @@ -433,8 +432,8 @@ mod tests { let config = FixestConfig::default(); let maxiter = config.maxiter; - let n0 = ctx.index.n_groups[0]; - let n1 = ctx.index.n_groups[1]; + let n0 = ctx.fe_infos[0].n_groups; + let n1 = ctx.fe_infos[1].n_groups; let n_coef = n0 + n1; let mut coef_sums = vec![0.0; n_coef]; diff --git a/src/demean/demeaner.rs b/src/demean/demeaner.rs index 6ce36b4fa..0d1b5988a 100644 --- a/src/demean/demeaner.rs +++ b/src/demean/demeaner.rs @@ -56,34 +56,34 @@ impl<'a> SingleFEDemeaner<'a> { pub fn new(ctx: &'a DemeanContext) -> Self { Self { 
ctx, - coef_sums_buffer: vec![0.0; ctx.index.n_coef], + coef_sums_buffer: vec![0.0; ctx.dims.n_coef], } } } impl Demeaner for SingleFEDemeaner<'_> { fn solve(&mut self, input: &[f64]) -> DemeanResult { - let n_obs = self.ctx.index.n_obs; - let n_coef = self.ctx.index.n_coef; + let n_obs = self.ctx.dims.n_obs; // Apply Dᵀ to get coefficient-space sums (reuses buffer) self.ctx.apply_design_matrix_t(input, &mut self.coef_sums_buffer); - let fe0 = self.ctx.index.group_ids_for_fe(0); - let group_weights = self.ctx.group_weights_for_fe(0); + let fe0 = &self.ctx.fe_infos[0]; - // Compute FE coefficients: coef[g] = sum[g] / weight[g] - let fe_coefficients: Vec = (0..n_coef) - .map(|g| self.coef_sums_buffer[g] / group_weights[g]) + // Compute FE coefficients (group means) using precomputed inverse weights + let fe_coefficients: Vec = self.coef_sums_buffer[..fe0.n_groups] + .iter() + .zip(fe0.inv_group_weights.iter()) + .map(|(&sum, &inv_w)| sum * inv_w) .collect(); - // output[i] = input[i] - coef[fe0[i]] + // output[i] = input[i] - group_mean[fe0[i]] let demeaned: Vec = (0..n_obs) - .map(|i| input[i] - fe_coefficients[fe0[i]]) + .map(|i| input[i] - fe_coefficients[fe0.group_ids[i]]) .collect(); // Single FE is a closed-form solution, always converges in 0 iterations - // No reordering needed for 1 FE + // No reordering needed for single FE DemeanResult { demeaned, fe_coefficients, @@ -115,8 +115,8 @@ impl<'a> TwoFEDemeaner<'a> { /// Create a new two-FE demeaner with pre-allocated buffers. 
#[inline] pub fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { - let n0 = ctx.index.n_groups[0]; - let n1 = ctx.index.n_groups[1]; + let n0 = ctx.fe_infos[0].n_groups; + let n1 = ctx.fe_infos[1].n_groups; let n_coef = n0 + n1; Self { @@ -131,8 +131,8 @@ impl<'a> TwoFEDemeaner<'a> { impl Demeaner for TwoFEDemeaner<'_> { fn solve(&mut self, input: &[f64]) -> DemeanResult { - let n_obs = self.ctx.index.n_obs; - let n0 = self.ctx.index.n_groups[0]; + let n_obs = self.ctx.dims.n_obs; + let n0 = self.ctx.fe_infos[0].n_groups; // Apply Dᵀ to get coefficient-space sums (reuses buffer) self.ctx.apply_design_matrix_t(input, &mut self.coef_sums_buffer); @@ -149,15 +149,15 @@ impl Demeaner for TwoFEDemeaner<'_> { .run(&mut projector, &mut self.coef, self.config.maxiter); // Reconstruct output: input - alpha - beta - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); + let fe0 = &self.ctx.fe_infos[0]; + let fe1 = &self.ctx.fe_infos[1]; let demeaned: Vec = (0..n_obs) - .map(|i| input[i] - self.coef[fe0[i]] - self.coef[n0 + fe1[i]]) + .map(|i| input[i] - self.coef[fe0.group_ids[i]] - self.coef[n0 + fe1.group_ids[i]]) .collect(); - // Reorder coefficients back to original FE order - let fe_coefficients = self.ctx.index.reorder_coefficients_to_original(&self.coef); + // Reorder coefficients to original FE order + let fe_coefficients = self.ctx.reorder_coef_to_original(&self.coef); DemeanResult { demeaned, @@ -179,10 +179,10 @@ impl Demeaner for TwoFEDemeaner<'_> { struct MultiFEBuffers { /// Accumulated fixed effects per observation (observation-space) mu: Vec, - /// Accumulated coefficients across all phases (coefficient-space) - total_coef: Vec, /// Working coefficient array for accelerator (reset each phase) coef: Vec, + /// Accumulated total coefficients across all phases + total_coef: Vec, /// Coefficient array for 2-FE sub-convergence (coefficient-space, first 2 FEs only) coef_2fe: Vec, /// Effective input after 
subtracting mu (observation-space). @@ -196,8 +196,8 @@ impl MultiFEBuffers { fn new(n_obs: usize, n_coef: usize, n_coef_2fe: usize) -> Self { Self { mu: vec![0.0; n_obs], - total_coef: vec![0.0; n_coef], coef: vec![0.0; n_coef], + total_coef: vec![0.0; n_coef], coef_2fe: vec![0.0; n_coef_2fe], effective_input: vec![0.0; n_obs], coef_sums_buffer: vec![0.0; n_coef], @@ -208,8 +208,8 @@ impl MultiFEBuffers { #[inline] fn reset(&mut self) { self.mu.fill(0.0); - self.total_coef.fill(0.0); self.coef.fill(0.0); + self.total_coef.fill(0.0); } } @@ -237,10 +237,10 @@ impl<'a> MultiFEDemeaner<'a> { /// Create a new multi-FE demeaner with pre-allocated buffers. #[inline] pub fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { - let n_obs = ctx.index.n_obs; - let n_coef = ctx.index.n_coef; - let n0 = ctx.index.n_groups[0]; - let n1 = ctx.index.n_groups[1]; + let n_obs = ctx.dims.n_obs; + let n_coef = ctx.dims.n_coef; + let n0 = ctx.fe_infos[0].n_groups; + let n1 = ctx.fe_infos[1].n_groups; let n_coef_2fe = n0 + n1; Self { @@ -262,10 +262,11 @@ impl<'a> MultiFEDemeaner<'a> { .multi_acc .run(&mut projector, &mut self.buffers.coef, self.config.iter_warmup); - // Accumulate coefficients and apply to mu - for (tc, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { - *tc += c; + // Accumulate coefficients from this phase + for (total, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { + *total += c; } + self.ctx .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) @@ -273,9 +274,9 @@ impl<'a> MultiFEDemeaner<'a> { /// Phase 2: Fast 2-FE sub-convergence on the first two fixed effects. 
fn two_fe_convergence_phase(&mut self, input: &[f64]) -> (usize, ConvergenceState) { - let n_obs = self.ctx.index.n_obs; - let n0 = self.ctx.index.n_groups[0]; - let n1 = self.ctx.index.n_groups[1]; + let n_obs = self.ctx.dims.n_obs; + let n0 = self.ctx.fe_infos[0].n_groups; + let n1 = self.ctx.fe_infos[1].n_groups; let n_coef_2fe = n0 + n1; // Compute residuals: input - mu @@ -300,10 +301,11 @@ impl<'a> MultiFEDemeaner<'a> { self.config.maxiter / 2, ); - // Accumulate 2-FE coefficients to total_coef (first 2 FEs only) - for (tc, &c) in self.buffers.total_coef[..n_coef_2fe].iter_mut().zip(self.buffers.coef_2fe.iter()) { - *tc += c; + // Accumulate 2-FE coefficients (only first 2 FEs) + for (total, &c) in self.buffers.total_coef[..n_coef_2fe].iter_mut().zip(self.buffers.coef_2fe.iter()) { + *total += c; } + // Add 2-FE coefficients to mu self.add_2fe_coefficients_to_mu(); (iter, convergence) @@ -321,7 +323,7 @@ impl<'a> MultiFEDemeaner<'a> { } // Compute residuals: input - mu - for i in 0..self.ctx.index.n_obs { + for i in 0..self.ctx.dims.n_obs { self.buffers.effective_input[i] = input[i] - self.buffers.mu[i]; } @@ -334,10 +336,11 @@ impl<'a> MultiFEDemeaner<'a> { self.multi_acc .run(&mut projector, &mut self.buffers.coef, remaining); - // Accumulate coefficients and apply to mu - for (tc, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { - *tc += c; + // Accumulate coefficients from this phase + for (total, &c) in self.buffers.total_coef.iter_mut().zip(self.buffers.coef.iter()) { + *total += c; } + self.ctx .apply_design_matrix(&self.buffers.coef, &mut self.buffers.mu); (iter, convergence) @@ -345,13 +348,13 @@ impl<'a> MultiFEDemeaner<'a> { /// Add 2-FE coefficients to the accumulated mu buffer. 
fn add_2fe_coefficients_to_mu(&mut self) { - let n0 = self.ctx.index.n_groups[0]; - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); + let n0 = self.ctx.fe_infos[0].n_groups; + let fe0 = &self.ctx.fe_infos[0]; + let fe1 = &self.ctx.fe_infos[1]; - for i in 0..self.ctx.index.n_obs { + for i in 0..self.ctx.dims.n_obs { self.buffers.mu[i] += - self.buffers.coef_2fe[fe0[i]] + self.buffers.coef_2fe[n0 + fe1[i]]; + self.buffers.coef_2fe[fe0.group_ids[i]] + self.buffers.coef_2fe[n0 + fe1.group_ids[i]]; } } @@ -368,8 +371,8 @@ impl<'a> MultiFEDemeaner<'a> { .map(|(&x, &mu)| x - mu) .collect(); - // Reorder coefficients back to original FE order - let fe_coefficients = self.ctx.index.reorder_coefficients_to_original(&self.buffers.total_coef); + // Reorder coefficients to original FE order + let fe_coefficients = self.ctx.reorder_coef_to_original(&self.buffers.total_coef); DemeanResult { demeaned, diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 7ef4031e2..34f255bf7 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -7,9 +7,10 @@ //! # Module Structure //! //! - [`types`]: Core data types -//! - [`FixedEffectsIndex`](types::FixedEffectsIndex): Fixed effects indexing (which obs belongs to which group) -//! - [`ObservationWeights`](types::ObservationWeights): Observation weights and group-level aggregations -//! - [`DemeanContext`](DemeanContext): Combines index and weights for demeaning operations +//! - [`Dimensions`](types::Dimensions): Problem shape +//! - [`Weights`](types::Weights): Observation weights +//! - [`FixedEffectInfo`](types::FixedEffectInfo): Per-FE information +//! - [`DemeanContext`](DemeanContext): Combines all context for demeaning //! - [`FixestConfig`](FixestConfig): Algorithm parameters //! - [`projection`]: Projection operations with [`Projector`](projection::Projector) trait //! 
- [`TwoFEProjector`](projection::TwoFEProjector): Specialized 2-FE projection @@ -29,20 +30,20 @@ pub mod accelerator; pub mod demeaner; pub mod projection; +mod sweep; pub mod types; use demeaner::{Demeaner, MultiFEDemeaner, SingleFEDemeaner, TwoFEDemeaner}; -use types::{ConvergenceState, DemeanContext, DemeanResult, FixestConfig}; +use types::{ConvergenceState, DemeanContext, DemeanMultiResult, DemeanResult, FixestConfig}; -use ndarray::{Array2, ArrayView1, ArrayView2}; +use ndarray::{Array2, ArrayView1, ArrayView2, Zip}; use numpy::{PyArray2, PyReadonlyArray1, PyReadonlyArray2}; use pyo3::prelude::*; use pyo3::types::PyDict; use rayon::prelude::*; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::atomic::AtomicUsize; use std::sync::Arc; - /// Thread-local demeaner state that wraps the appropriate demeaner type. /// /// This enum allows `for_each_init` to create a demeaner once per thread, @@ -55,8 +56,9 @@ enum ThreadLocalDemeaner<'a> { impl<'a> ThreadLocalDemeaner<'a> { /// Create a new thread-local demeaner based on the FE count. + #[inline] fn new(ctx: &'a DemeanContext, config: &'a FixestConfig) -> Self { - match ctx.index.n_fe { + match ctx.dims.n_fe { 1 => ThreadLocalDemeaner::Single(SingleFEDemeaner::new(ctx)), 2 => ThreadLocalDemeaner::Two(TwoFEDemeaner::new(ctx, config)), _ => ThreadLocalDemeaner::Multi(MultiFEDemeaner::new(ctx, config)), @@ -64,7 +66,7 @@ impl<'a> ThreadLocalDemeaner<'a> { } /// Solve the demeaning problem, reusing internal buffers. - #[inline] + #[inline(always)] fn solve(&mut self, input: &[f64]) -> DemeanResult { match self { ThreadLocalDemeaner::Single(d) => d.solve(input), @@ -74,31 +76,29 @@ impl<'a> ThreadLocalDemeaner<'a> { } } -/// Result of batch demeaning operation. -pub(crate) struct DemeanBatchResult { - pub demeaned: Array2, - pub fe_coefficients: Array2, - pub success: bool, -} - /// Demean using accelerated coefficient-space iteration. 
/// /// Uses `for_each_init` to create one demeaner per thread, reusing buffers /// across all columns processed by that thread. /// +/// # Arguments +/// +/// * `x` - Input data array (n_samples, n_features) +/// * `flist` - Fixed effect group IDs (n_samples, n_fe) +/// * `weights` - Per-observation weights, or None for unweighted +/// * `tol` - Convergence tolerance +/// * `maxiter` - Maximum iterations +/// /// # Returns /// -/// A `DemeanBatchResult` containing: -/// - `demeaned`: The demeaned data as an `Array2` -/// - `fe_coefficients`: FE coefficients as an `Array2` -/// - `success`: True if all columns converged +/// A [`DemeanMultiResult`] containing demeaned data, FE coefficients, and convergence status. pub(crate) fn demean( x: &ArrayView2, flist: &ArrayView2, weights: Option<&ArrayView1>, tol: f64, maxiter: usize, -) -> DemeanBatchResult { +) -> DemeanMultiResult { let (n_samples, n_features) = x.dim(); let config = FixestConfig { @@ -110,51 +110,56 @@ pub(crate) fn demean( let not_converged = Arc::new(AtomicUsize::new(0)); let mut demeaned = Array2::::zeros((n_samples, n_features)); - // FEs are automatically reordered by size (largest first) for optimal convergence + // Create context (FEs are always reordered by size, matching fixest) let ctx = DemeanContext::new(flist, weights); - let n_coef = ctx.index.n_coef; + let n_coef = ctx.dims.n_coef; let mut fe_coefficients = Array2::::zeros((n_coef, n_features)); // Process columns in parallel, collecting both demeaned values and FE coefficients - let results: Vec<(usize, DemeanResult)> = demeaned + demeaned .axis_iter_mut(ndarray::Axis(1)) .into_par_iter() + .zip( + fe_coefficients + .axis_iter_mut(ndarray::Axis(1)) + .into_par_iter(), + ) .enumerate() - .map_init( + .for_each_init( + // Init closure: called once per thread to create the thread-local state || ThreadLocalDemeaner::new(&ctx, &config), - |demeaner, (k, _)| { + // Body closure: called for each column, reusing thread-local state + |demeaner, 
(k, (mut dem_col, mut coef_col))| { let col_view = x.column(k); + // Zero-copy if the column is contiguous (F-order), otherwise copy let result = if let Some(slice) = col_view.as_slice() { demeaner.solve(slice) } else { let xk: Vec = col_view.to_vec(); demeaner.solve(&xk) }; - (k, result) - }, - ) - .collect(); - // Copy results back (sequential, but fast) - for (k, result) in results { - if result.convergence == ConvergenceState::NotConverged { - not_converged.fetch_add(1, Ordering::SeqCst); - } - - // Copy demeaned values - for (i, &val) in result.demeaned.iter().enumerate() { - demeaned[[i, k]] = val; - } + if result.convergence == ConvergenceState::NotConverged { + not_converged.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + } - // Copy FE coefficients - for (i, &val) in result.fe_coefficients.iter().enumerate() { - fe_coefficients[[i, k]] = val; - } - } + Zip::from(&mut dem_col) + .and(&result.demeaned) + .for_each(|col_elm, &val| { + *col_elm = val; + }); + + Zip::from(&mut coef_col) + .and(&result.fe_coefficients) + .for_each(|col_elm, &val| { + *col_elm = val; + }); + }, + ); - let success = not_converged.load(Ordering::SeqCst) == 0; - DemeanBatchResult { + let success = not_converged.load(std::sync::atomic::Ordering::Relaxed) == 0; + DemeanMultiResult { demeaned, fe_coefficients, success, @@ -163,7 +168,17 @@ pub(crate) fn demean( /// Python-exposed function for accelerated demeaning. 
/// -/// Returns a dict with: +/// # Arguments +/// +/// * `x` - Input data array (n_samples, n_features) +/// * `flist` - Fixed effect group IDs (n_samples, n_fe) +/// * `weights` - Per-observation weights, or None for unweighted (fast path) +/// * `tol` - Convergence tolerance (default: 1e-8) +/// * `maxiter` - Maximum iterations (default: 100_000) +/// +/// # Returns +/// +/// A dict with: /// - "demeaned": Array of demeaned values (n_samples, n_features) /// - "fe_coefficients": Array of FE coefficients (n_coef, n_features) /// - "success": Boolean indicating convergence @@ -196,8 +211,8 @@ pub fn _demean_rs<'py>( #[cfg(test)] mod tests { use super::*; - use demeaner::{MultiFEDemeaner, SingleFEDemeaner}; - use ndarray::{Array1, Array2}; + use demeaner::MultiFEDemeaner; + use ndarray::Array2; #[test] fn test_2fe_convergence() { @@ -210,16 +225,19 @@ mod tests { flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + // Unweighted case + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - assert_eq!(result.convergence, ConvergenceState::Converged, "Should converge"); + assert_eq!( + result.convergence, + ConvergenceState::Converged, + "Should converge" + ); assert!(result.iterations < 100, "Should converge quickly"); assert!(result.demeaned.iter().all(|&v| v.is_finite())); } @@ -236,9 +254,8 @@ mod tests { flist[[i, 2]] = i % 3; } - let weights = Array1::::ones(n_obs); - - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + // Unweighted case + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -247,6 +264,27 @@ mod tests { assert_eq!(result.convergence, 
ConvergenceState::Converged); assert!(result.demeaned.iter().all(|&v| v.is_finite())); + + // Verify demeaning: each FE group's sum should be approximately 0 + let group_counts = [10, 5, 3]; + for q in 0..n_fe { + for g in 0..group_counts[q] { + let group_sum: f64 = result + .demeaned + .iter() + .enumerate() + .filter(|(i, _)| flist[[*i, q]] == g) + .map(|(_, &v)| v) + .sum(); + assert!( + group_sum.abs() < 1e-8, + "FE {} group {} sum should be ~0, got {}", + q, + g, + group_sum + ); + } + } } #[test] @@ -260,15 +298,21 @@ mod tests { flist[[i, 0]] = i % n_groups; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); let result = demeaner.solve(&input); - assert_eq!(result.convergence, ConvergenceState::Converged, "Single FE should always converge"); - assert_eq!(result.iterations, 0, "Single FE should be closed-form (0 iterations)"); + assert_eq!( + result.convergence, + ConvergenceState::Converged, + "Single FE should always converge" + ); + assert_eq!( + result.iterations, 0, + "Single FE should be closed-form (0 iterations)" + ); // Verify demeaning: each group's sum should be approximately 0 for g in 0..n_groups { @@ -300,8 +344,9 @@ mod tests { } // Non-uniform weights: 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, ... 
- let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let weights: ndarray::Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); + let ctx = + DemeanContext::new(&flist.view(), Some(&weights.view())); assert!( ctx.weights.is_some(), @@ -313,63 +358,11 @@ mod tests { let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - assert_eq!(result.convergence, ConvergenceState::Converged, "Weighted regression should converge"); - assert!( - result.demeaned.iter().all(|&v| v.is_finite()), - "All results should be finite" - ); - } - - #[test] - fn test_singleton_groups() { - // Each observation in its own group for FE 0 (singleton groups) - let n_obs = 20; - - let mut flist = Array2::::zeros((n_obs, 2)); - for i in 0..n_obs { - flist[[i, 0]] = i; // Singleton groups (each obs is its own group) - flist[[i, 1]] = i % 4; // 4 groups in FE 1 - } - - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); - let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - - let config = FixestConfig::default(); - let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let result = demeaner.solve(&input); - - assert_eq!(result.convergence, ConvergenceState::Converged, "Singleton groups should converge"); - - // With singleton groups in FE 0, each observation's own mean is subtracted, - // then adjusted for FE 1. The result should be all zeros since each - // observation perfectly absorbs its own value in FE 0. 
- assert!( - result.demeaned.iter().all(|&v| v.abs() < 1e-10), - "Singleton groups should yield near-zero residuals" + assert_eq!( + result.convergence, + ConvergenceState::Converged, + "Weighted regression should converge" ); - } - - #[test] - fn test_small_groups() { - // Test with very few observations per group - let n_obs = 30; - - let mut flist = Array2::::zeros((n_obs, 2)); - for i in 0..n_obs { - flist[[i, 0]] = i / 3; // 10 groups, 3 obs each - flist[[i, 1]] = i % 2; // 2 groups, 15 obs each - } - - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); - let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - - let config = FixestConfig::default(); - let mut demeaner = TwoFEDemeaner::new(&ctx, &config); - let result = demeaner.solve(&input); - - assert_eq!(result.convergence, ConvergenceState::Converged, "Small groups should converge"); assert!( result.demeaned.iter().all(|&v| v.is_finite()), "All results should be finite" @@ -394,8 +387,9 @@ mod tests { ); // Test with weights (Some) - weighted case - let weights = Array1::::ones(n_obs); - let ctx_weighted = DemeanContext::new(&flist.view(), Some(&weights.view())); + let weights: ndarray::Array1 = (0..n_obs).map(|i| 1.0 + (i % 2) as f64).collect(); + let ctx_weighted = + DemeanContext::new(&flist.view(), Some(&weights.view())); assert!( ctx_weighted.weights.is_some(), "Provided weights should result in weights=Some" @@ -414,8 +408,7 @@ mod tests { flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let config = FixestConfig::default(); // Create a single demeaner and use it multiple times @@ -448,307 +441,169 @@ mod tests { } // ========================================================================= - // FE Coefficient Tests + // Edge Case Tests // 
========================================================================= - /// Helper: compute residuals by applying FE coefficients to observations. - /// Returns input[i] - sum_q(coef[fe_q[i]]) for each observation. - fn apply_coefficients( - input: &[f64], - flist: &Array2, - fe_coefficients: &[f64], - n_groups: &[usize], - ) -> Vec { - let n_obs = input.len(); - let n_fe = flist.ncols(); - - // Compute coefficient offsets for each FE - let mut coef_offsets = vec![0usize; n_fe]; - for q in 1..n_fe { - coef_offsets[q] = coef_offsets[q - 1] + n_groups[q - 1]; - } - - (0..n_obs) - .map(|i| { - let mut fe_sum = 0.0; - for q in 0..n_fe { - let g = flist[[i, q]]; - fe_sum += fe_coefficients[coef_offsets[q] + g]; - } - input[i] - fe_sum - }) - .collect() - } - #[test] - fn test_single_fe_coefficients() { - let n_obs = 100; - let n_groups = 10; - - let mut flist = Array2::::zeros((n_obs, 1)); - for i in 0..n_obs { - flist[[i, 0]] = i % n_groups; - } - - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); - let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); + fn test_single_observation() { + // Edge case: only 1 observation + let flist = Array2::::zeros((1, 2)); + let ctx = DemeanContext::new(&flist.view(), None); - let mut demeaner = SingleFEDemeaner::new(&ctx); + let input = vec![42.0]; + let config = FixestConfig::default(); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficients are correct: applying them should give same residuals - let reconstructed = apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups]); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { - assert!( - (demeaned - reconstructed).abs() < 1e-10, - "Obs {}: demeaned ({}) != reconstructed ({})", - i, - demeaned, - reconstructed - ); - } - - // Verify coefficient count - assert_eq!( - 
result.fe_coefficients.len(), - n_groups, - "Should have {} coefficients", - n_groups + assert_eq!(result.convergence, ConvergenceState::Converged); + // With a single observation, demeaned value should be 0 (input - mean = 0) + assert!( + result.demeaned[0].abs() < 1e-10, + "Single observation should demean to 0" ); } #[test] - fn test_two_fe_coefficients_correct() { - let n_obs = 100; - let n_groups_0 = 10; - let n_groups_1 = 5; - - let mut flist = Array2::::zeros((n_obs, 2)); - for i in 0..n_obs { - flist[[i, 0]] = i % n_groups_0; - flist[[i, 1]] = i % n_groups_1; - } + fn test_single_group_per_fe() { + // Edge case: all observations in the same group for each FE + let n_obs = 50; + let flist = Array2::::zeros((n_obs, 2)); // All zeros = single group each - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficients are correct: applying them should give same residuals - let reconstructed = - apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { + assert_eq!(result.convergence, ConvergenceState::Converged); + // All in same group means demeaned = input - mean(input) + let mean: f64 = input.iter().sum::() / n_obs as f64; + for (i, &val) in result.demeaned.iter().enumerate() { + let expected = input[i] - mean; assert!( - (demeaned - reconstructed).abs() < 1e-8, - "Obs {}: demeaned ({}) != reconstructed ({})", - i, - demeaned, - reconstructed + (val - expected).abs() < 1e-10, + "Demeaned value should equal input - mean" ); } - - // Verify coefficient count - assert_eq!( - result.fe_coefficients.len(), - 
n_groups_0 + n_groups_1, - "Should have {} coefficients", - n_groups_0 + n_groups_1 - ); } #[test] - fn test_two_fe_coefficients_ordering() { - // Test that coefficients are returned in ORIGINAL FE order, not reordered - let n_obs = 100; - - // FE 0: 5 groups (smaller), FE 1: 20 groups (larger) - // Internally, FEs get reordered by size (largest first), so FE 1 becomes internal FE 0 - // But the coefficients should be returned in original order: [FE0 coeffs | FE1 coeffs] - let n_groups_0 = 5; // smaller - let n_groups_1 = 20; // larger - + fn test_many_groups() { + // Edge case: many groups (each observation in its own group for FE0) + let n_obs = 200; let mut flist = Array2::::zeros((n_obs, 2)); for i in 0..n_obs { - flist[[i, 0]] = i % n_groups_0; - flist[[i, 1]] = i % n_groups_1; + flist[[i, 0]] = i; // Each obs in its own group + flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficient count matches original ordering - assert_eq!( - result.fe_coefficients.len(), - n_groups_0 + n_groups_1, - "Should have {} coefficients", - n_groups_0 + n_groups_1 - ); - - // Verify coefficients are in original order by reconstructing residuals - // using the ORIGINAL flist (not reordered) - let reconstructed = - apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { - assert!( - (demeaned - reconstructed).abs() < 1e-8, - "Obs {}: demeaned ({}) != reconstructed ({}) - coefficients may be in wrong order", - i, - demeaned, - reconstructed - ); - } + assert_eq!(result.convergence, 
ConvergenceState::Converged); + assert!(result.demeaned.iter().all(|&v| v.is_finite())); } #[test] - fn test_three_fe_coefficients_correct() { - let n_obs = 120; - let n_groups_0 = 10; - let n_groups_1 = 6; - let n_groups_2 = 4; - - let mut flist = Array2::::zeros((n_obs, 3)); + fn test_extreme_weight_ratios() { + // Edge case: very different weights + let n_obs = 100; + let mut flist = Array2::::zeros((n_obs, 2)); for i in 0..n_obs { - flist[[i, 0]] = i % n_groups_0; - flist[[i, 1]] = i % n_groups_1; - flist[[i, 2]] = i % n_groups_2; + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + // Extreme weights: 0.001, 1000, 0.001, 1000, ... + let weights: ndarray::Array1 = (0..n_obs) + .map(|i| if i % 2 == 0 { 0.001 } else { 1000.0 }) + .collect(); + + let ctx = + DemeanContext::new(&flist.view(), Some(&weights.view())); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); - let mut demeaner = MultiFEDemeaner::new(&ctx, &config); + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficients are correct - let reconstructed = apply_coefficients( - &input, - &flist, - &result.fe_coefficients, - &[n_groups_0, n_groups_1, n_groups_2], - ); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { - assert!( - (demeaned - reconstructed).abs() < 1e-6, - "Obs {}: demeaned ({}) != reconstructed ({})", - i, - demeaned, - reconstructed - ); - } - - // Verify coefficient count assert_eq!( - result.fe_coefficients.len(), - n_groups_0 + n_groups_1 + n_groups_2, + result.convergence, + ConvergenceState::Converged, + "Should converge even with extreme weight ratios" + ); + assert!( + result.demeaned.iter().all(|&v| v.is_finite()), + "All results should be finite" ); } + // 
========================================================================= + // Convergence Failure Tests + // ========================================================================= + #[test] - fn test_three_fe_coefficients_ordering() { - // Test that 3-FE coefficients are returned in original order - let n_obs = 120; - - // Create FEs with different sizes to trigger reordering - // Original: FE0=3 groups (smallest), FE1=15 groups (largest), FE2=8 groups (middle) - // Reordered internally: FE1, FE2, FE0 - let n_groups_0 = 3; // smallest - let n_groups_1 = 15; // largest - let n_groups_2 = 8; // middle - - let mut flist = Array2::::zeros((n_obs, 3)); + fn test_small_maxiter_produces_valid_results() { + // Test that even with very limited iterations, results are valid (finite) + // The accelerated algorithm may still converge quickly for simple problems + let n_obs = 100; + let n_fe = 2; + + let mut flist = Array2::::zeros((n_obs, n_fe)); for i in 0..n_obs { - flist[[i, 0]] = i % n_groups_0; - flist[[i, 1]] = i % n_groups_1; - flist[[i, 2]] = i % n_groups_2; + flist[[i, 0]] = i % 10; + flist[[i, 1]] = i % 5; } - let weights = Array1::::ones(n_obs); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - let config = FixestConfig::default(); - let mut demeaner = MultiFEDemeaner::new(&ctx, &config); + // Use maxiter=1 - algorithm may or may not converge depending on data + let config = FixestConfig { + maxiter: 1, + ..FixestConfig::default() + }; + let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficients work with ORIGINAL flist ordering - let reconstructed = apply_coefficients( - &input, - &flist, - &result.fe_coefficients, - &[n_groups_0, n_groups_1, n_groups_2], + // Regardless of convergence, results should be finite + assert!( + result.demeaned.iter().all(|&v| 
v.is_finite()), + "Results should be finite even with limited iterations" + ); + assert!( + result.iterations <= 1, + "Should have at most 1 iteration" ); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { - assert!( - (demeaned - reconstructed).abs() < 1e-6, - "Obs {}: demeaned ({}) != reconstructed ({}) - coefficients may be in wrong order", - i, - demeaned, - reconstructed - ); - } } #[test] - fn test_weighted_coefficients() { - let n_obs = 100; - let n_groups_0 = 10; - let n_groups_1 = 5; - + fn test_convergence_failure_with_zero_maxiter() { + // Edge case: maxiter=0 + let n_obs = 50; let mut flist = Array2::::zeros((n_obs, 2)); for i in 0..n_obs { - flist[[i, 0]] = i % n_groups_0; - flist[[i, 1]] = i % n_groups_1; + flist[[i, 0]] = i % 5; + flist[[i, 1]] = i % 3; } - // Non-uniform weights - let weights: Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); - let ctx = DemeanContext::new(&flist.view(), Some(&weights.view())); + let ctx = DemeanContext::new(&flist.view(), None); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); - let config = FixestConfig::default(); + let config = FixestConfig { + maxiter: 0, + ..FixestConfig::default() + }; let mut demeaner = TwoFEDemeaner::new(&ctx, &config); let result = demeaner.solve(&input); - // Verify coefficients are correct with weighted reconstruction - let reconstructed = - apply_coefficients(&input, &flist, &result.fe_coefficients, &[n_groups_0, n_groups_1]); - - for (i, (&demeaned, &reconstructed)) in - result.demeaned.iter().zip(reconstructed.iter()).enumerate() - { - assert!( - (demeaned - reconstructed).abs() < 1e-8, - "Weighted obs {}: demeaned ({}) != reconstructed ({})", - i, - demeaned, - reconstructed - ); - } + // With maxiter=0, should not converge (unless already converged after init) + // The exact behavior depends on implementation, but results should be finite + assert!(result.demeaned.iter().all(|&v| v.is_finite())); 
} } diff --git a/src/demean/projection.rs b/src/demean/projection.rs index b02b3dfa0..567a068fa 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -22,8 +22,9 @@ //! Projectors are used with [`IronsTuckGrand`](crate::demean::accelerator::IronsTuckGrand) //! which handles the iteration strategy. +use super::sweep::{GaussSeidelSweeper, TwoFESweeper}; use crate::demean::types::DemeanContext; -use std::ops::Range; +use smallvec::SmallVec; // ============================================================================= // Projector Trait @@ -31,23 +32,11 @@ use std::ops::Range; /// A projection operation for fixed-effects demeaning. /// -/// Projectors hold all context needed for projection: the [`DemeanContext`], -/// scattered input sums, original input values, and scratch buffers. -/// This makes the projection interface simple and clear. -/// -/// Projectors are used with [`IronsTuckGrand`](crate::demean_accelerated::accelerator::IronsTuckGrand) -/// which handles the iteration strategy. -/// -/// # Performance -/// -/// All methods are called in tight loops and should be marked `#[inline(always)]`. -/// Using static dispatch (`impl Projector` or generics) ensures zero overhead. +/// Projectors hold all context needed for projection and provide the core +/// operations used by accelerators. All methods are called in tight loops +/// and should be optimized for performance. pub trait Projector { /// Total number of coefficients this projector operates on. - /// - /// This defines the required size of coefficient arrays passed to - /// `project()` and `compute_ssr()`. Accelerator buffers must be - /// sized to match this value. fn coef_len(&self) -> usize; /// Project coefficients: coef_in → coef_out. @@ -58,18 +47,8 @@ pub trait Projector { /// Range of coefficients to use for convergence checking. /// - /// # Why not all coefficients? 
- /// - /// At a fixed point, if any (n_fe - 1) fixed effects have converged, - /// the remaining one must also have converged (its inputs are stable, - /// so its output is stable). This allows us to skip checking one FE. - /// - /// # Which FE to exclude? - /// - /// Following fixest's approach, we exclude the **last FE** (smallest after - /// reordering). In the reverse sweep, this FE is processed first using - /// stale data from the previous iteration. Returns `0..n_coef - n_groups[n_fe-1]`. - fn convergence_range(&self) -> Range; + /// May be smaller than `0..coef_len()` when not all coefficients need checking. + fn convergence_range(&self) -> std::ops::Range; } // ============================================================================= @@ -86,10 +65,25 @@ pub trait Projector { /// Coefficients are stored as `[alpha_0, ..., alpha_{n0-1}, beta_0, ..., beta_{n1-1}]` /// where alpha are the coefficients for FE 0 and beta for FE 1. pub struct TwoFEProjector<'a> { - ctx: &'a DemeanContext, - /// Weighted sums per group (Dᵀ · input). - coef_sums: &'a [f64], + // Dimensions + n_obs: usize, + n0: usize, + n1: usize, + + // Sweepers for each direction + /// Computes alpha from beta + alpha_sweeper: TwoFESweeper<'a>, + /// Computes beta from alpha + beta_sweeper: TwoFESweeper<'a>, + + // Group ID pointers (needed for SSR computation) + fe0_group_ids_ptr: *const usize, + fe1_group_ids_ptr: *const usize, + + // Input data input: &'a [f64], + + // Scratch buffer for beta coefficients scratch: Vec, } @@ -97,127 +91,134 @@ impl<'a> TwoFEProjector<'a> { /// Create a new 2-FE projector. 
#[inline] pub fn new(ctx: &'a DemeanContext, coef_sums: &'a [f64], input: &'a [f64]) -> Self { - let n1 = ctx.index.n_groups[1]; + let fe0_info = &ctx.fe_infos[0]; + let fe1_info = &ctx.fe_infos[1]; + let n0 = fe0_info.n_groups; + let n1 = fe1_info.n_groups; + let weights_ptr = ctx.weights.as_ref().map(|w| w.as_ptr()); + Self { - ctx, - coef_sums, + n_obs: ctx.dims.n_obs, + n0, + n1, + // alpha_sweeper: computes alpha from beta (out=fe0, other=fe1) + alpha_sweeper: TwoFESweeper::new( + ctx.dims.n_obs, + weights_ptr, + fe0_info, + fe1_info, + coef_sums, + 0, // alpha starts at offset 0 + ), + // beta_sweeper: computes beta from alpha (out=fe1, other=fe0) + beta_sweeper: TwoFESweeper::new( + ctx.dims.n_obs, + weights_ptr, + fe1_info, + fe0_info, + coef_sums, + n0, // beta starts at offset n0 + ), + fe0_group_ids_ptr: fe0_info.group_ids.as_ptr(), + fe1_group_ids_ptr: fe1_info.group_ids.as_ptr(), input, scratch: vec![0.0; n1], } } - - /// Compute beta coefficients from alpha, storing the result in the scratch buffer. - /// - /// For each group g1 in FE1: - /// beta[g1] = (coef_sums[g1] - Σ alpha[g0] * w) / group_weight[g1] - #[inline(always)] - fn compute_beta_from_alpha(&mut self, alpha: &[f64]) { - let n0 = self.ctx.index.n_groups[0]; - let n1 = self.ctx.index.n_groups[1]; - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); - let sw1 = self.ctx.group_weights_for_fe(1); - - self.scratch[..n1].copy_from_slice(&self.coef_sums[n0..n0 + n1]); - - if let Some(w) = &self.ctx.weights { - for ((&g0, &g1), &wo) in fe0.iter().zip(fe1.iter()).zip(w.per_obs.iter()) { - self.scratch[g1] -= alpha[g0] * wo; - } - } else { - for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { - self.scratch[g1] -= alpha[g0]; - } - } - - for (b, &sw) in self.scratch[..n1].iter_mut().zip(sw1.iter()) { - *b /= sw; - } - } - - /// Compute alpha coefficients from beta (stored in scratch), writing to alpha_out. 
- /// - /// For each group g0 in FE0: - /// alpha[g0] = (coef_sums[g0] - Σ beta[g1] * w) / group_weight[g0] - #[inline(always)] - fn compute_alpha_from_beta(&self, alpha_out: &mut [f64]) { - let n0 = self.ctx.index.n_groups[0]; - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); - let sw0 = self.ctx.group_weights_for_fe(0); - - alpha_out[..n0].copy_from_slice(&self.coef_sums[..n0]); - - if let Some(w) = &self.ctx.weights { - for ((&g0, &g1), &wo) in fe0.iter().zip(fe1.iter()).zip(w.per_obs.iter()) { - alpha_out[g0] -= self.scratch[g1] * wo; - } - } else { - for (&g0, &g1) in fe0.iter().zip(fe1.iter()) { - alpha_out[g0] -= self.scratch[g1]; - } - } - - for (a, &sw) in alpha_out[..n0].iter_mut().zip(sw0.iter()) { - *a /= sw; - } - } } impl Projector for TwoFEProjector<'_> { #[inline(always)] fn coef_len(&self) -> usize { - self.ctx.index.n_groups[0] + self.ctx.index.n_groups[1] + self.n0 + self.n1 } #[inline(always)] fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]) { - let n0 = self.ctx.index.n_groups[0]; - let n1 = self.ctx.index.n_groups[1]; - - // Step 1: alpha_in -> beta - self.compute_beta_from_alpha(&coef_in[..n0]); + // Step 1: alpha_in -> beta (stored in scratch) + self.beta_sweeper.sweep(&coef_in[..self.n0], &mut self.scratch); // Step 2: beta -> alpha_out - self.compute_alpha_from_beta(coef_out); + self.alpha_sweeper.sweep(&self.scratch, &mut coef_out[..self.n0]); // Step 3: Copy beta to output - coef_out[n0..n0 + n1].copy_from_slice(&self.scratch[..n1]); + coef_out[self.n0..self.n0 + self.n1].copy_from_slice(&self.scratch); } - /// Compute the sum of squared residuals for the given coefficients. - /// - /// # Side Effects - /// - /// This method recomputes beta from alpha and stores it in `self.scratch`. - /// After this call, `self.scratch[..n1]` contains the beta coefficients - /// derived from `coef[..n0]` (the alpha coefficients). 
- /// - /// This is intentional: the SSR computation needs consistent alpha/beta pairs, - /// and recomputing beta ensures correctness even if the caller's `coef` array - /// has stale beta values. #[inline(always)] fn compute_ssr(&mut self, coef: &[f64]) -> f64 { - let n0 = self.ctx.index.n_groups[0]; - let fe0 = self.ctx.index.group_ids_for_fe(0); - let fe1 = self.ctx.index.group_ids_for_fe(1); - // Compute beta from alpha (updates self.scratch) - self.compute_beta_from_alpha(&coef[..n0]); + self.beta_sweeper.sweep(&coef[..self.n0], &mut self.scratch); // Compute SSR: Σ (input[i] - alpha[fe0[i]] - beta[fe1[i]])² + // Use 4x unrolling for better ILP + let n_obs = self.n_obs; + let chunks = n_obs / 4; + let mut i = 0usize; let mut ssr = 0.0; - for ((&g0, &g1), &x) in fe0.iter().zip(fe1.iter()).zip(self.input.iter()) { - let resid = x - coef[g0] - self.scratch[g1]; - ssr += resid * resid; + + // SAFETY: All pointer accesses are valid because: + // - i < n_obs throughout (loop bounds ensure this) + // - fe0_ptr, fe1_ptr point to arrays of length n_obs (from FixedEffectInfo) + // - input_ptr points to array of length n_obs (from caller) + // - group IDs (g0_*, g1_*) are always < n0 or < n1 respectively + // (invariant from DemeanContext construction) + // - alpha_ptr points to coef with length >= n0, beta_ptr to scratch with length n1 + unsafe { + let alpha_ptr = coef.as_ptr(); + let beta_ptr = self.scratch.as_ptr(); + let input_ptr = self.input.as_ptr(); + let fe0_ptr = self.fe0_group_ids_ptr; + let fe1_ptr = self.fe1_group_ids_ptr; + + for _ in 0..chunks { + let g0_0 = *fe0_ptr.add(i); + let g0_1 = *fe0_ptr.add(i + 1); + let g0_2 = *fe0_ptr.add(i + 2); + let g0_3 = *fe0_ptr.add(i + 3); + + let g1_0 = *fe1_ptr.add(i); + let g1_1 = *fe1_ptr.add(i + 1); + let g1_2 = *fe1_ptr.add(i + 2); + let g1_3 = *fe1_ptr.add(i + 3); + + debug_assert!(g0_0 < self.n0 && g0_1 < self.n0 && g0_2 < self.n0 && g0_3 < self.n0, + "FE0 group ID out of bounds: max({}, {}, {}, {}) >= n0 ({})", 
+ g0_0, g0_1, g0_2, g0_3, self.n0); + debug_assert!(g1_0 < self.n1 && g1_1 < self.n1 && g1_2 < self.n1 && g1_3 < self.n1, + "FE1 group ID out of bounds: max({}, {}, {}, {}) >= n1 ({})", + g1_0, g1_1, g1_2, g1_3, self.n1); + + let resid0 = + *input_ptr.add(i) - *alpha_ptr.add(g0_0) - *beta_ptr.add(g1_0); + let resid1 = + *input_ptr.add(i + 1) - *alpha_ptr.add(g0_1) - *beta_ptr.add(g1_1); + let resid2 = + *input_ptr.add(i + 2) - *alpha_ptr.add(g0_2) - *beta_ptr.add(g1_2); + let resid3 = + *input_ptr.add(i + 3) - *alpha_ptr.add(g0_3) - *beta_ptr.add(g1_3); + + ssr += resid0 * resid0 + resid1 * resid1 + resid2 * resid2 + resid3 * resid3; + i += 4; + } + + // Handle remainder + while i < n_obs { + let g0 = *fe0_ptr.add(i); + let g1 = *fe1_ptr.add(i); + debug_assert!(g0 < self.n0, "FE0 group ID ({}) >= n0 ({})", g0, self.n0); + debug_assert!(g1 < self.n1, "FE1 group ID ({}) >= n1 ({})", g1, self.n1); + let resid = *input_ptr.add(i) - *alpha_ptr.add(g0) - *beta_ptr.add(g1); + ssr += resid * resid; + i += 1; + } } ssr } #[inline(always)] - fn convergence_range(&self) -> Range { - // Exclude FE 1 (last/smallest), check only FE 0 - 0..self.ctx.index.n_groups[0] + fn convergence_range(&self) -> std::ops::Range { + 0..self.n0 } } @@ -227,100 +228,39 @@ impl Projector for TwoFEProjector<'_> { /// Projector for 3+ fixed effects. /// -/// Uses a general Q-FE projection that processes FEs in reverse order, -/// matching fixest's algorithm. +/// Uses Gauss-Seidel block updates, processing FEs in reverse order +/// to match fixest's algorithm. pub struct MultiFEProjector<'a> { ctx: &'a DemeanContext, - /// Weighted sums per group (Dᵀ · input). - coef_sums: &'a [f64], input: &'a [f64], - scratch: Vec, + /// Pre-created sweepers for each FE (stored in reverse order for iteration). + sweepers: Vec>, + /// Precomputed (group_ids_ptr, coef_start) for each FE, used in SSR computation. + /// SmallVec avoids heap allocation for typical 3-4 FE cases. 
+ fe_ptrs: SmallVec<[(*const usize, usize); 4]>, } impl<'a> MultiFEProjector<'a> { - /// Create a new multi-FE projector. #[inline] pub fn new(ctx: &'a DemeanContext, coef_sums: &'a [f64], input: &'a [f64]) -> Self { - let n_obs = ctx.index.n_obs; + // Pre-create sweepers in reverse order (how they're processed) + let sweepers: Vec<_> = (0..ctx.dims.n_fe) + .rev() + .map(|q| GaussSeidelSweeper::new(ctx, coef_sums, q)) + .collect(); + + // Precompute FE pointers for SSR computation (avoids per-call allocation) + let fe_ptrs: SmallVec<[(*const usize, usize); 4]> = ctx + .fe_infos + .iter() + .map(|fe| (fe.group_ids.as_ptr(), fe.coef_start)) + .collect(); + Self { ctx, - coef_sums, input, - scratch: vec![0.0; n_obs], - } - } - - /// Accumulate coefficient contributions from one FE into the scratch buffer. - /// - /// For each observation i: scratch[i] += coef[start + fe[i]] - #[inline(always)] - fn accumulate_fe_contributions(&mut self, fe_idx: usize, coef: &[f64]) { - let start = self.ctx.index.coef_start[fe_idx]; - let fe = self.ctx.index.group_ids_for_fe(fe_idx); - let n = self.scratch.len().min(fe.len()); - - // Manual 4x unrolling for better instruction-level parallelism. - unsafe { - let scratch_ptr = self.scratch.as_mut_ptr(); - let fe_ptr = fe.as_ptr(); - let coef_ptr = coef.as_ptr().add(start); - - let chunks = n / 4; - let mut i = 0; - - for _ in 0..chunks { - let g0 = *fe_ptr.add(i); - let g1 = *fe_ptr.add(i + 1); - let g2 = *fe_ptr.add(i + 2); - let g3 = *fe_ptr.add(i + 3); - - *scratch_ptr.add(i) += *coef_ptr.add(g0); - *scratch_ptr.add(i + 1) += *coef_ptr.add(g1); - *scratch_ptr.add(i + 2) += *coef_ptr.add(g2); - *scratch_ptr.add(i + 3) += *coef_ptr.add(g3); - - i += 4; - } - - // Handle remainder - for j in i..n { - *scratch_ptr.add(j) += *coef_ptr.add(*fe_ptr.add(j)); - } - } - } - - /// Update coefficients for a single FE given the accumulated other-FE sums. 
- /// - /// For each group g in FE q: - /// coef_out[g] = (coef_sums[g] - Σ scratch[i] * w) / group_weight[g] - #[inline(always)] - fn update_fe_coefficients(&self, fe_idx: usize, coef_out: &mut [f64]) { - let start = self.ctx.index.coef_start[fe_idx]; - let n_groups = self.ctx.index.n_groups[fe_idx]; - let fe = self.ctx.index.group_ids_for_fe(fe_idx); - let group_weights = self.ctx.group_weights_for_fe(fe_idx); - - // Initialize from coef_sums - coef_out[start..start + n_groups] - .copy_from_slice(&self.coef_sums[start..start + n_groups]); - - // Subtract accumulated other-FE contributions - if let Some(w) = &self.ctx.weights { - for ((&g, &sum), &wo) in fe.iter().zip(self.scratch.iter()).zip(w.per_obs.iter()) { - coef_out[start + g] -= sum * wo; - } - } else { - for (&g, &sum) in fe.iter().zip(self.scratch.iter()) { - coef_out[start + g] -= sum; - } - } - - // Normalize by group weights - for (coef, &sw) in coef_out[start..start + n_groups] - .iter_mut() - .zip(group_weights.iter()) - { - *coef /= sw; + sweepers, + fe_ptrs, } } } @@ -328,63 +268,83 @@ impl<'a> MultiFEProjector<'a> { impl Projector for MultiFEProjector<'_> { #[inline(always)] fn coef_len(&self) -> usize { - self.ctx.index.n_coef + self.ctx.dims.n_coef } - /// Project coefficients using reverse-order FE updates. - /// - /// For each FE q from (n_fe-1) down to 0: - /// 1. Accumulate contributions from FEs before q (from coef_in) - /// 2. Accumulate contributions from FEs after q (from coef_out, already computed) - /// 3. 
Update coef_out for FE q #[inline(always)] fn project(&mut self, coef_in: &[f64], coef_out: &mut [f64]) { - let n_fe = self.ctx.index.n_fe; - - for q in (0..n_fe).rev() { - // Reset scratch buffer - self.scratch.fill(0.0); - - // Accumulate from FEs before q (use coef_in) - for h in 0..q { - self.accumulate_fe_contributions(h, coef_in); - } - - // Accumulate from FEs after q (use coef_out, already computed) - for h in (q + 1)..n_fe { - self.accumulate_fe_contributions(h, coef_out); - } - - // Update coefficients for FE q - self.update_fe_coefficients(q, coef_out); + for sweeper in &self.sweepers { + sweeper.sweep(coef_in, coef_out); } } #[inline(always)] fn compute_ssr(&mut self, coef: &[f64]) -> f64 { - let n_fe = self.ctx.index.n_fe; + let n_obs = self.ctx.dims.n_obs; + let coef_ptr = coef.as_ptr(); + let input_ptr = self.input.as_ptr(); - // Accumulate coefficient sums per observation using the scratch buffer - // (reuses the optimized unrolled gather loop) - self.scratch.fill(0.0); - for q in 0..n_fe { - self.accumulate_fe_contributions(q, coef); + let mut ssr = 0.0; + + // SAFETY: All pointer accesses are valid because: + // - i < n_obs throughout (loop bounds ensure this) + // - group_ids_ptr for each FE points to array of length n_obs (from FixedEffectInfo) + // - input_ptr points to array of length n_obs (from caller) + // - group IDs are always < n_groups for their respective FE + // (invariant from DemeanContext construction) + // - coef_start + g < coef.len() because coef_start is the FE's offset and + // g < n_groups for that FE (DemeanContext guarantees this layout) + unsafe { + // Main loop with 4x unrolling + let chunks = n_obs / 4; + let mut i = 0usize; + + for _ in 0..chunks { + let mut sum0 = 0.0; + let mut sum1 = 0.0; + let mut sum2 = 0.0; + let mut sum3 = 0.0; + + for &(group_ids_ptr, coef_start) in &self.fe_ptrs { + let g0 = *group_ids_ptr.add(i); + let g1 = *group_ids_ptr.add(i + 1); + let g2 = *group_ids_ptr.add(i + 2); + let g3 = 
*group_ids_ptr.add(i + 3);
+
+                    sum0 += *coef_ptr.add(coef_start + g0);
+                    sum1 += *coef_ptr.add(coef_start + g1);
+                    sum2 += *coef_ptr.add(coef_start + g2);
+                    sum3 += *coef_ptr.add(coef_start + g3);
+                }
+
+                let resid0 = *input_ptr.add(i) - sum0;
+                let resid1 = *input_ptr.add(i + 1) - sum1;
+                let resid2 = *input_ptr.add(i + 2) - sum2;
+                let resid3 = *input_ptr.add(i + 3) - sum3;
+
+                ssr += resid0 * resid0 + resid1 * resid1 + resid2 * resid2 + resid3 * resid3;
+                i += 4;
+            }
+
+            // Handle remainder
+            while i < n_obs {
+                let mut sum = 0.0;
+                for &(group_ids_ptr, coef_start) in &self.fe_ptrs {
+                    let g = *group_ids_ptr.add(i);
+                    sum += *coef_ptr.add(coef_start + g);
+                }
+                let resid = *input_ptr.add(i) - sum;
+                ssr += resid * resid;
+                i += 1;
+            }
+        }

-        // Compute SSR from residuals
-        self.input
-            .iter()
-            .zip(self.scratch.iter())
-            .map(|(&x, &sum)| {
-                let resid = x - sum;
-                resid * resid
-            })
-            .sum()
+        ssr
     }

     #[inline(always)]
-    fn convergence_range(&self) -> Range<usize> {
-        // Exclude last FE (smallest), check FEs 0 through n_fe-2
-        0..self.ctx.index.n_coef - self.ctx.index.n_groups[self.ctx.index.n_fe - 1]
+    fn convergence_range(&self) -> std::ops::Range<usize> {
+        let n_fe = self.ctx.dims.n_fe;
+        0..(self.ctx.dims.n_coef - self.ctx.fe_infos[n_fe - 1].n_groups)
     }
 }
diff --git a/src/demean/sweep.rs b/src/demean/sweep.rs
new file mode 100644
index 000000000..92929febb
--- /dev/null
+++ b/src/demean/sweep.rs
@@ -0,0 +1,357 @@
+//! Block sweepers for fixed-effects demeaning.
+//!
+//! This module contains the low-level sweepers that encapsulate unsafe pointer
+//! operations for the projection algorithms:
+//!
+//! - [`TwoFESweeper`]: For 2-FE case, computes one side's coefficients from the other
+//!
- [`GaussSeidelSweeper`]: For 3+ FE case, performs one block update in the Gauss-Seidel iteration + +use crate::demean::types::{DemeanContext, FixedEffectInfo}; +use smallvec::SmallVec; + +// ============================================================================= +// TwoFESweeper +// ============================================================================= + +/// Performs a single-direction sweep for 2-FE demeaning. +/// +/// Each sweeper computes coefficients for one FE given the other FE's coefficients. +/// For a complete 2-FE iteration, use two instances: +/// - `alpha_sweeper`: computes alpha coefficients from beta +/// - `beta_sweeper`: computes beta coefficients from alpha +/// +/// All data needed for the hot loop is precomputed at construction time +/// to minimize indirection during iteration. +pub(super) struct TwoFESweeper<'a> { + n_obs: usize, + n_groups: usize, + + // Per-observation weights (None = uniform) + weights_ptr: Option<*const f64>, + + // This side's data + out_groups_ptr: *const usize, + inv_group_weights_ptr: *const f64, + coef_sums_ptr: *const f64, + + // Other side's group IDs (for reading input coefficients) + other_groups_ptr: *const usize, + + _phantom: std::marker::PhantomData<&'a ()>, +} + +impl<'a> TwoFESweeper<'a> { + /// Create a sweeper for computing `out_fe`'s coefficients from `other_fe`'s coefficients. + #[inline] + pub fn new( + n_obs: usize, + weights_ptr: Option<*const f64>, + out_fe: &'a FixedEffectInfo, + other_fe: &'a FixedEffectInfo, + coef_sums: &'a [f64], + out_coef_start: usize, + ) -> Self { + // Verify bounds before creating raw pointer + debug_assert!( + out_coef_start + out_fe.n_groups <= coef_sums.len(), + "out_coef_start ({}) + n_groups ({}) exceeds coef_sums.len() ({})", + out_coef_start, + out_fe.n_groups, + coef_sums.len() + ); + + // SAFETY: out_coef_start is the offset for this FE within coef_sums, + // verified by debug_assert above and guaranteed by DemeanContext construction. 
+ let coef_sums_ptr = unsafe { coef_sums.as_ptr().add(out_coef_start) }; + + Self { + n_obs, + n_groups: out_fe.n_groups, + weights_ptr, + out_groups_ptr: out_fe.group_ids.as_ptr(), + inv_group_weights_ptr: out_fe.inv_group_weights.as_ptr(), + coef_sums_ptr, + other_groups_ptr: other_fe.group_ids.as_ptr(), + _phantom: std::marker::PhantomData, + } + } + + /// Compute output coefficients from the other side's coefficients. + /// + /// Formula: `out[g] = (sums[g] - Σᵢ other[other_groups[i]] * w[i]) * inv_weights[g]` + #[inline(always)] + pub fn sweep(&self, other_coef: &[f64], out_coef: &mut [f64]) { + debug_assert!( + out_coef.len() >= self.n_groups, + "out_coef.len() ({}) must be >= n_groups ({})", + out_coef.len(), + self.n_groups + ); + + let other_ptr = other_coef.as_ptr(); + let out_ptr = out_coef.as_mut_ptr(); + + // SAFETY: All pointer operations are valid because: + // - coef_sums_ptr points to n_groups elements (set in constructor) + // - out_ptr has capacity n_groups (caller's responsibility, same as other_coef.len()) + // - inv_group_weights_ptr points to n_groups elements (from FixedEffectInfo) + // - scatter_* methods only access indices < n_obs (loop bounds) + // - group IDs are always < n_groups (invariant from DemeanContext construction) + unsafe { + // 1. Initialize from coef_sums + std::ptr::copy_nonoverlapping(self.coef_sums_ptr, out_ptr, self.n_groups); + + // 2. Scatter-subtract + match self.weights_ptr { + None => self.scatter_uniform(other_ptr, out_ptr), + Some(w_ptr) => self.scatter_weighted(other_ptr, out_ptr, w_ptr), + } + + // 3. Normalize by inverse group weights (slice-based for auto-vectorization) + let out_slice = std::slice::from_raw_parts_mut(out_ptr, self.n_groups); + let weights_slice = + std::slice::from_raw_parts(self.inv_group_weights_ptr, self.n_groups); + for (o, &w) in out_slice.iter_mut().zip(weights_slice.iter()) { + *o *= w; + } + } + } + + /// Scatter-subtract for uniform weights. 
+ #[inline(always)] + unsafe fn scatter_uniform(&self, other_ptr: *const f64, out_ptr: *mut f64) { + let out_groups = self.out_groups_ptr; + let other_groups = self.other_groups_ptr; + + for i in 0..self.n_obs { + let g_out = *out_groups.add(i); + let g_other = *other_groups.add(i); + debug_assert!(g_out < self.n_groups, "g_out ({}) >= n_groups ({})", g_out, self.n_groups); + *out_ptr.add(g_out) -= *other_ptr.add(g_other); + } + } + + /// Scatter-subtract for weighted case. + #[inline(always)] + unsafe fn scatter_weighted( + &self, + other_ptr: *const f64, + out_ptr: *mut f64, + w_ptr: *const f64, + ) { + let out_groups = self.out_groups_ptr; + let other_groups = self.other_groups_ptr; + + for i in 0..self.n_obs { + let g_out = *out_groups.add(i); + let g_other = *other_groups.add(i); + debug_assert!(g_out < self.n_groups, "g_out ({}) >= n_groups ({})", g_out, self.n_groups); + let w = *w_ptr.add(i); + *out_ptr.add(g_out) -= *other_ptr.add(g_other) * w; + } + } +} + +// ============================================================================= +// OtherFEInfo +// ============================================================================= + +/// Precomputed info for accessing another FE's coefficients. +#[derive(Clone, Copy)] +pub(super) struct OtherFEInfo { + /// Offset into coefficient array for this FE + coef_start: usize, + /// Pointer to group IDs for this FE + group_ids_ptr: *const usize, +} + +// ============================================================================= +// GaussSeidelSweeper +// ============================================================================= + +/// Performs Gauss-Seidel block sweeps for multi-FE demeaning. +/// +/// All data needed for the hot loop is precomputed at construction time +/// to minimize indirection during iteration. 
+pub(super) struct GaussSeidelSweeper<'a> { + // This FE's cached data + n_obs: usize, + coef_start: usize, + n_groups: usize, + group_ids_ptr: *const usize, + inv_group_weights_ptr: *const f64, + coef_sums_ptr: *const f64, + + // Weight info: None = uniform (unweighted), Some = weighted + weights_ptr: Option<*const f64>, + + // Other FEs' info (precomputed to avoid fe_infos lookup in hot loop) + // SmallVec avoids heap allocation for typical 2-5 FE cases (max 4 other FEs) + /// FEs processed before this one (read from coef_in) + other_before: SmallVec<[OtherFEInfo; 4]>, + /// FEs processed after this one (read from coef_out) + other_after: SmallVec<[OtherFEInfo; 4]>, + + /// Marker to tie the struct's lifetime to the borrowed data. + _phantom: std::marker::PhantomData<&'a ()>, +} + +impl<'a> GaussSeidelSweeper<'a> { + #[inline] + pub fn new(ctx: &'a DemeanContext, coef_sums: &'a [f64], q: usize) -> Self { + let fe = &ctx.fe_infos[q]; + + // Precompute other FEs' info + let other_before: SmallVec<[OtherFEInfo; 4]> = (0..q) + .map(|h| { + let fe_h = &ctx.fe_infos[h]; + OtherFEInfo { + coef_start: fe_h.coef_start, + group_ids_ptr: fe_h.group_ids.as_ptr(), + } + }) + .collect(); + + let other_after: SmallVec<[OtherFEInfo; 4]> = ((q + 1)..ctx.dims.n_fe) + .map(|h| { + let fe_h = &ctx.fe_infos[h]; + OtherFEInfo { + coef_start: fe_h.coef_start, + group_ids_ptr: fe_h.group_ids.as_ptr(), + } + }) + .collect(); + + // Verify bounds before creating raw pointer + debug_assert!( + fe.coef_start + fe.n_groups <= coef_sums.len(), + "coef_start ({}) + n_groups ({}) exceeds coef_sums.len() ({})", + fe.coef_start, + fe.n_groups, + coef_sums.len() + ); + + // SAFETY: fe.coef_start is the offset for this FE within coef_sums, + // verified by debug_assert above and guaranteed by DemeanContext construction. 
+ let coef_sums_ptr = unsafe { coef_sums.as_ptr().add(fe.coef_start) }; + + Self { + n_obs: ctx.dims.n_obs, + coef_start: fe.coef_start, + n_groups: fe.n_groups, + group_ids_ptr: fe.group_ids.as_ptr(), + inv_group_weights_ptr: fe.inv_group_weights.as_ptr(), + coef_sums_ptr, + weights_ptr: ctx.weights.as_ref().map(|w| w.as_ptr()), + other_before, + other_after, + _phantom: std::marker::PhantomData, + } + } + + /// Perform one Gauss-Seidel block update for this FE. + #[inline(always)] + pub fn sweep(&self, coef_in: &[f64], coef_out: &mut [f64]) { + debug_assert!( + coef_out.len() >= self.coef_start + self.n_groups, + "coef_out.len() ({}) must be >= coef_start + n_groups ({})", + coef_out.len(), + self.coef_start + self.n_groups + ); + + let coef_in_ptr = coef_in.as_ptr(); + let coef_out_ptr = coef_out.as_mut_ptr(); + + // SAFETY: All pointer operations are valid because: + // - coef_start + n_groups <= coef_out.len() (caller provides full coefficient array) + // - coef_sums_ptr points to n_groups elements (set in constructor) + // - inv_group_weights_ptr points to n_groups elements (from FixedEffectInfo) + // - scatter_* methods only access indices < n_obs (loop bounds) + // - group IDs are always < n_groups (invariant from DemeanContext construction) + // - other_before/other_after coef_starts are valid offsets into coef arrays + unsafe { + // 1. Initialize from coef_sums + let out_start = coef_out_ptr.add(self.coef_start); + std::ptr::copy_nonoverlapping(self.coef_sums_ptr, out_start, self.n_groups); + + // 2. Scatter-subtract + match self.weights_ptr { + None => self.scatter_uniform(coef_in_ptr, coef_out_ptr, out_start), + Some(w_ptr) => self.scatter_weighted(coef_in_ptr, coef_out_ptr, out_start, w_ptr), + } + + // 3. 
Normalize by inverse group weights (slice-based for auto-vectorization) + let out_slice = std::slice::from_raw_parts_mut(out_start, self.n_groups); + let weights_slice = + std::slice::from_raw_parts(self.inv_group_weights_ptr, self.n_groups); + for (o, &w) in out_slice.iter_mut().zip(weights_slice.iter()) { + *o *= w; + } + } + } + + /// Scatter-subtract for uniform weights. + #[inline(always)] + unsafe fn scatter_uniform( + &self, + coef_in_ptr: *const f64, + coef_out_ptr: *mut f64, + out_start: *mut f64, + ) { + let group_ids = self.group_ids_ptr; + + for i in 0..self.n_obs { + let sum = self.accumulate_other_effects(i, coef_in_ptr, coef_out_ptr); + let g = *group_ids.add(i); + debug_assert!(g < self.n_groups, "g ({}) >= n_groups ({})", g, self.n_groups); + *out_start.add(g) -= sum; + } + } + + /// Scatter-subtract for weighted case. + #[inline(always)] + unsafe fn scatter_weighted( + &self, + coef_in_ptr: *const f64, + coef_out_ptr: *mut f64, + out_start: *mut f64, + w_ptr: *const f64, + ) { + let group_ids = self.group_ids_ptr; + + for i in 0..self.n_obs { + let sum = self.accumulate_other_effects(i, coef_in_ptr, coef_out_ptr); + let g = *group_ids.add(i); + debug_assert!(g < self.n_groups, "g ({}) >= n_groups ({})", g, self.n_groups); + let w = *w_ptr.add(i); + *out_start.add(g) -= sum * w; + } + } + + /// Accumulate coefficient contributions from all other FEs. + /// + /// This is the innermost hot loop - kept minimal for best inlining. 
+ #[inline(always)] + unsafe fn accumulate_other_effects( + &self, + i: usize, + coef_in_ptr: *const f64, + coef_out_ptr: *mut f64, + ) -> f64 { + let mut sum = 0.0; + + // FEs before this one: read from coef_in + for other in &self.other_before { + let g = *other.group_ids_ptr.add(i); + sum += *coef_in_ptr.add(other.coef_start + g); + } + + // FEs after this one: read from coef_out (already updated) + for other in &self.other_after { + let g = *other.group_ids_ptr.add(i); + sum += *coef_out_ptr.add(other.coef_start + g); + } + + sum + } +} diff --git a/src/demean/types.rs b/src/demean/types.rs index 5ea9615dc..e1ba04aff 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -3,7 +3,7 @@ //! # Overview //! //! Fixed effects demeaning removes group means from data. For example, with -//! individual and time-fixed effects, we remove both individual-specific and +//! individual and time fixed effects, we remove both individual-specific and //! time-specific means from each observation. //! //! # Two Spaces @@ -27,378 +27,237 @@ //! //! # Main Types //! -//! - [`FixedEffectsIndex`]: Maps observations to their group IDs for each FE -//! - [`ObservationWeights`]: Per-observation and per-group weight sums -//! - [`DemeanContext`]: Combines index and weights, provides scatter/gather operations +//! - [`Dimensions`]: Problem shape (n_obs, n_fe, n_coef) +//! - [`Weights`]: Observation-level weights (None = uniform weights) +//! - [`FixedEffectInfo`]: Per-FE group IDs and weights +//! - [`DemeanContext`]: Combines all of the above, provides scatter/gather operations //! - [`FixestConfig`]: Algorithm parameters (tolerance, max iterations, etc.) 
-use ndarray::{ArrayView1, ArrayView2}; -use std::ops::Range; +use ndarray::{Array2, ArrayView1, ArrayView2}; // ============================================================================= -// FixedEffectsIndex +// Dimensions // ============================================================================= -/// Index mapping observations to fixed effect groups. +/// Problem dimensions for fixed effects demeaning. /// -/// # Purpose +/// The algorithm operates in two spaces: +/// - **Observation space**: length `n_obs` (input/output data) +/// - **Coefficient space**: length `n_coef` (one coefficient per group per FE) /// -/// Maps each observation to its group ID for each fixed effect. For example, -/// observation 42 might belong to individual 7 and time period 3. +/// # Example /// -/// # Memory Layout -/// -/// Two key arrays with different purposes and sizes: +/// With 10,000 observations, 500 firms, and 20 years: +/// - `n_obs = 10_000` +/// - `n_fe = 2` +/// - `n_coef = 520` (500 firm coefficients + 20 year coefficients) +#[derive(Clone, Copy, Debug)] +pub(crate) struct Dimensions { + /// Number of observations (N). + pub n_obs: usize, + /// Number of fixed effects (Q). E.g., 2 for firm + year. + pub n_fe: usize, + /// Total coefficients: sum of group counts across all FEs. + pub n_coef: usize, +} + + +// ============================================================================= +// FixedEffectInfo +// ============================================================================= + +/// Information for a single fixed effect. /// -/// ## 1. Group IDs Array (`group_ids`) +/// Each fixed effect (e.g., firm, year) has its own group structure. +/// This struct holds the mapping from observations to groups and the +/// precomputed weight sums needed for computing group means. /// -/// Maps each observation to its group index for each fixed effect. 
-/// - **Size**: `N × Q` (observations × fixed effects)
-/// - **Layout**: Column-major (all FE0 IDs first, then all FE1 IDs, etc.)
+/// Coefficients for all FEs are stored in a single flat array:
 /// ```text
-/// group_ids = [fe0_obs0, fe0_obs1, ..., fe0_obsN, fe1_obs0, fe1_obs1, ..., fe1_obsN, ...]
-///             |-------- N entries ---------|  |-------- N entries ---------|
+/// [FE0_group0, ..., FE0_groupK, FE1_group0, ..., FE1_groupM, ...]
 /// ```
+/// The `coef_start` field gives the offset where this FE's coefficients begin.
+#[derive(Clone, Debug)]
+pub(crate) struct FixedEffectInfo {
+    /// Number of groups in this FE. E.g., 500 firms.
+    pub n_groups: usize,
+    /// Starting index in coefficient arrays for this FE.
+    pub coef_start: usize,
+    /// Group ID for each observation (length: `n_obs`).
+    /// `group_ids[i]` gives the group index (0..n_groups) for observation i.
+    pub group_ids: Vec<usize>,
+    /// Inverse of group weights (length: `n_groups`).
+    /// Precomputed as `1.0 / sum_of_observation_weights_per_group` to replace
+    /// division with multiplication in hot loops. For unweighted case, this is
+    /// `1.0 / count_of_observations_per_group`.
+    pub inv_group_weights: Vec<f64>,
+}
+
+// =============================================================================
+// DemeanContext
+// =============================================================================
+
+/// Complete context for fixed effects demeaning operations.
 ///
-/// Access: `group_ids[fe_index * n_obs + obs_index]`
+/// Combines problem dimensions, observation weights, and per-FE information.
+/// Provides the core scatter/gather operations used by the iterative algorithm.
 ///
-/// ## 2. Coefficient Array (`coef`)
+/// # Construction
 ///
-/// Stores the actual FE coefficient values being solved for.
-/// - **Size**: `n_coef` = sum of all group counts
-/// - **Layout**: FE0 coefficients first, then FE1, etc.
-/// - **Indexing**: `coef_start[q]` gives the offset for FE q +/// Use [`DemeanContext::new`] to create a context from input arrays. The context +/// is reused across multiple columns being demeaned. /// -/// ```text -/// coef = [α₀, α₁, ..., α_{n0-1}, γ₀, γ₁, ..., γ_{n1-1}, ...] -/// |---- n_groups[0] ----| |---- n_groups[1] ----| -/// coef_start[0]=0 coef_start[1]=n0 -/// ``` +/// # FE Ordering /// -/// ## Example: 1000 obs, 100 individuals, 10 years +/// Fixed effects are always reordered by size (largest first) to match fixest's +/// behavior and ensure optimal convergence properties. /// -/// | Array | Size | Contents | -/// |------------|-------|-------------------------------------| -/// | group_ids | 2000 | Which individual/year each obs is | -/// | coef | 110 | The 100 α + 10 γ coefficient values| +/// # Uniform Weights Fast Path /// -/// To get coefficient for observation i in FE q: -/// ```rust -/// let group = group_ids[q * n_obs + i]; -/// let coef_value = coef[coef_start[q] + group]; -/// ``` - -pub struct FixedEffectsIndex { - /// Number of observations (N). - pub n_obs: usize, - - /// Number of fixed effects (e.g., 2 for individual and time). - pub n_fe: usize, - - /// Flat group IDs in column-major order. - /// Index with `fe * n_obs + obs` to get the group ID for observation `obs` in FE `fe`. - pub group_ids: Vec, - - /// Number of groups in each fixed effect. - /// Example: `[100, 10]` means FE 0 has 100 groups, FE 1 has 10 groups. - pub n_groups: Vec, - - /// Starting index in coefficient arrays for each FE. - /// Example: `[0, 100]` means FE 0 coefficients are at indices 0..100, - /// FE 1 coefficients are at indices 100..110. - pub coef_start: Vec, - - /// Total number of coefficients (sum of `n_groups`). - pub n_coef: usize, - - /// Mapping from original FE index to reordered position. - /// - /// `original_to_reordered[original_q]` gives the position of original - /// FE `original_q` in the reordered (sorted by size) layout. 
-    original_to_reordered: Vec<usize>,
+/// When `weights` is `None`, all observations are equally weighted. This enables
+/// optimized code paths that skip weight multiplication in hot loops.
+///
+/// # Operations
+///
+/// - [`apply_design_matrix_t`](Self::apply_design_matrix_t): Scatter values to coefficient space
+/// - [`apply_design_matrix`](Self::apply_design_matrix): Gather coefficients to observation space
+pub struct DemeanContext {
+    /// Problem dimensions.
+    pub(crate) dims: Dimensions,
+    /// Observation-level weights (length: `n_obs`). None means uniform weights (unweighted case).
+    pub(crate) weights: Option<Vec<f64>>,
+    /// Per-fixed-effect information (in internal/reordered order).
+    pub(crate) fe_infos: Vec<FixedEffectInfo>,
+    /// Mapping from internal FE index to original FE index.
+    /// `fe_order[q]` gives the original column index for internal FE `q`.
+    /// Used to reorder coefficients back to original order when returning.
+    pub(crate) fe_order: Vec<usize>,
 }

-impl FixedEffectsIndex {
-    /// Create a fixed effects index from the input array.
+impl DemeanContext {
+    /// Create a demeaning context from input arrays.
+    ///
+    /// Fixed effects are automatically reordered by size (largest first) to
+    /// match fixest's behavior and ensure optimal convergence.
     ///
     /// # Arguments
     ///
     /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)`.
     ///   Each row is one observation, each column is one fixed effect.
     ///   Values must be 0-indexed group IDs.
+    /// * `weights` - Per-observation weights (length: `n_obs`), or None for unweighted.
     ///
-    /// # Computed Fields
+    /// # Panics
     ///
-    /// - `n_groups`: Computed as `max(group_id) + 1` for each FE
-    /// - `coef_start`: Cumulative sum of `n_groups`
-    /// - `group_ids`: Transposed to column-major order for cache efficiency
+    /// Panics if:
+    /// - `flist` has zero rows or columns
+    /// - `weights.len() != flist.nrows()`
     ///
-    /// # Panics
+    /// # Empty Groups
     ///
-    /// Panics in debug builds if `n_obs == 0` or `n_fe == 0`.
-    pub fn new(flist: &ArrayView2<usize>) -> Self {
+    /// Groups with no observations (e.g., sparse group IDs) are handled by setting
+    /// their weight to 1, matching fixest's approach. Since no observation belongs
+    /// to these groups, their coefficients are never used in computations.
+    pub fn new(flist: &ArrayView2<usize>, weights: Option<&ArrayView1<f64>>) -> Self {
         let (n_obs, n_fe) = flist.dim();
-        debug_assert!(n_obs > 0, "Cannot create FixedEffectsIndex with 0 observations");
-        debug_assert!(n_fe > 0, "Cannot create FixedEffectsIndex with 0 fixed effects");
+        assert!(n_obs > 0, "Cannot create DemeanContext with 0 observations");
+        assert!(n_fe > 0, "Cannot create DemeanContext with 0 fixed effects");
+        if let Some(w) = weights {
+            assert_eq!(
+                w.len(),
+                n_obs,
+                "weights length ({}) must match number of observations ({})",
+                w.len(),
+                n_obs
+            );
+        }

-        // Compute n_groups: max group_id + 1 for each FE (in original order)
+        // Compute n_groups for each FE (max group_id + 1)
+        // Panics if any column is empty (which shouldn't happen with n_obs > 0)
         let n_groups_original: Vec<usize> = (0..n_fe)
-            .map(|j| flist.column(j).iter().max().unwrap_or(&0) + 1)
+            .map(|j| {
+                flist
+                    .column(j)
+                    .iter()
+                    .max()
+                    .expect("FE column should not be empty when n_obs > 0")
+                    + 1
+            })
             .collect();

-        // Sort FEs by size (largest first) for optimal convergence.
-        // This matches fixest's default behavior and allows excluding the largest
-        // FE from convergence checking (since FE 0 will be at the start of the
-        // coefficient array, we can efficiently check just the suffix).
+        // Always reorder FEs by size (largest first) - matches fixest behavior
         let order: Vec<usize> = if n_fe > 1 {
             let mut indices: Vec<usize> = (0..n_fe).collect();
             indices.sort_by_key(|&i| std::cmp::Reverse(n_groups_original[i]));
             indices
         } else {
-            (0..n_fe).collect()
+            vec![0]
         };

-        // Reorder n_groups according to the sort order
+        // Compute dimensions
         let n_groups: Vec<usize> = order.iter().map(|&i| n_groups_original[i]).collect();
-
-        // Compute coefficient start indices (cumulative sum of reordered n_groups)
-        let mut coef_start = vec![0usize; n_fe];
+        let mut coef_starts = vec![0usize; n_fe];
         for q in 1..n_fe {
-            coef_start[q] = coef_start[q - 1] + n_groups[q - 1];
+            coef_starts[q] = coef_starts[q - 1] + n_groups[q - 1];
         }
         let n_coef: usize = n_groups.iter().sum();

-        // Transpose group_ids from row-major (obs, fe) to column-major (fe, obs)
-        // applying the reordering during the transpose (zero extra cost)
-        let mut group_ids = vec![0usize; n_fe * n_obs];
-        for (new_q, &old_q) in order.iter().enumerate() {
-            for (i, &g) in flist.column(old_q).iter().enumerate() {
-                group_ids[new_q * n_obs + i] = g;
-            }
-        }
-
-        // Compute inverse mapping: original_to_reordered[original_q] = reordered_q
-        // order[reordered_q] = original_q, so we invert this
-        let mut original_to_reordered = vec![0usize; n_fe];
-        for (reordered_q, &original_q) in order.iter().enumerate() {
-            original_to_reordered[original_q] = reordered_q;
-        }
-
-        Self {
-            n_obs,
-            n_fe,
-            group_ids,
-            n_groups,
-            coef_start,
-            n_coef,
-            original_to_reordered,
-        }
-    }
-
-    /// Get the group IDs for all observations in fixed effect `fe`.
-    ///
-    /// Returns a slice of length `n_obs` where `result[i]` is the group ID
-    /// for observation `i` in this fixed effect.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// let individual_ids = index.group_ids_for_fe(0); // [7, 3, 7, 12, ...]
-    /// let year_ids = index.group_ids_for_fe(1); // [0, 1, 0, 2, ...]
- /// ``` - #[inline(always)] - pub fn group_ids_for_fe(&self, fe: usize) -> &[usize] { - let start = fe * self.n_obs; - &self.group_ids[start..start + self.n_obs] - } - - /// Get the coefficient index range for fixed effect `fe`. - /// - /// Returns the range of indices in coefficient arrays that correspond - /// to this fixed effect's groups. - #[inline(always)] - pub fn coef_range_for_fe(&self, fe: usize) -> Range { - let start = self.coef_start[fe]; - let end = if fe + 1 < self.n_fe { - self.coef_start[fe + 1] - } else { - self.n_coef - }; - start..end - } - - /// Reorder coefficients from internal (sorted by FE size) to original FE order. - /// - /// During solving, FEs are reordered by size (largest first) for optimal - /// convergence. This method restores coefficients to the original FE order - /// as they appeared in the input. - /// - /// # Arguments - /// - /// * `coef` - Coefficient array in internal (reordered) layout - /// - /// # Returns - /// - /// Coefficient array in original FE order. - /// - /// # Layout - /// - /// Input layout (reordered, largest FE first): - /// ```text - /// [FE_reord_0 | FE_reord_1 | ... | FE_reord_{n_fe-1}] - /// ``` - /// - /// Output layout (original order): - /// ```text - /// [FE_orig_0 | FE_orig_1 | ... 
| FE_orig_{n_fe-1}] - /// ``` - pub fn reorder_coefficients_to_original(&self, coef: &[f64]) -> Vec { - debug_assert_eq!( - coef.len(), - self.n_coef, - "coefficient length ({}) must match n_coef ({})", - coef.len(), - self.n_coef - ); - - let mut out = vec![0.0; self.n_coef]; - let mut out_pos = 0; - - // For each FE in original order - for original_q in 0..self.n_fe { - let reordered_q = self.original_to_reordered[original_q]; - let src_start = self.coef_start[reordered_q]; - let len = self.n_groups[reordered_q]; - - out[out_pos..out_pos + len].copy_from_slice(&coef[src_start..src_start + len]); - out_pos += len; - } + let dims = Dimensions { n_obs, n_fe, n_coef }; - out - } -} + // Build observation weights (None if uniform) + let obs_weights = weights.map(|w| w.to_vec()); -// ============================================================================= -// ObservationWeights -// ============================================================================= + // Build per-FE info + let mut fe_infos = Vec::with_capacity(n_fe); + for q in 0..n_fe { + let original_col = order[q]; -/// Observation weights and their aggregation to group level. -/// -/// Only created when weights are non-uniform. For unweighted regression, -/// `DemeanContext.weights` is `None`. -pub struct ObservationWeights { - /// Weight for each observation (length: `n_obs`). - pub per_obs: Vec, - - /// Sum of observation weights for each group (length: `n_coef`). - pub per_group: Vec, -} + // Extract group IDs for this FE + let group_ids: Vec = flist.column(original_col).iter().copied().collect(); -impl ObservationWeights { - /// Create observation weights from the input array. 
-    pub fn new(weights: &ArrayView1<f64>, index: &FixedEffectsIndex) -> Self {
-        // Aggregate observation weights to group level
-        let mut per_group = vec![0.0; index.n_coef];
-        for q in 0..index.n_fe {
-            let offset = index.coef_start[q];
-            let fe_offset = q * index.n_obs;
-            for (i, &w) in weights.iter().enumerate() {
-                let g = index.group_ids[fe_offset + i];
-                per_group[offset + g] += w;
+            // Aggregate observation weights to group level
+            let mut group_weights = vec![0.0; n_groups[q]];
+            match &obs_weights {
+                Some(w) => {
+                    for (i, &g) in group_ids.iter().enumerate() {
+                        group_weights[g] += w[i];
+                    }
+                }
+                None => {
+                    // Unweighted: count observations per group
+                    for &g in group_ids.iter() {
+                        group_weights[g] += 1.0;
+                    }
+                }
             }
-        }

-        // Avoid division by zero for empty groups
-        for w in &mut per_group {
-            if *w == 0.0 {
-                *w = 1.0;
+            // Handle empty groups (weight=0) by setting weight to 1, matching fixest's approach.
+            // This is defensive programming - empty groups are never accessed since no
+            // observation belongs to them, but this prevents any potential division by zero.
+            for w in &mut group_weights {
+                if *w == 0.0 {
+                    *w = 1.0;
+                }
             }
-        }
-
-        Self {
-            per_obs: weights.to_vec(),
-            per_group,
-        }
-    }
-}
-
-// =============================================================================
-// DemeanContext
-// =============================================================================
-
-/// Complete context for fixed effects demeaning operations.
-///
-/// Combines the fixed effects index with optional observation weights.
-/// When `weights` is `None`, uses the fast unweighted path.
-pub struct DemeanContext {
-    /// Fixed effects index (observation → group mapping).
-    pub index: FixedEffectsIndex,
-    /// Group counts (length: `n_coef`). Used as denominator for unweighted case.
-    pub group_counts: Vec<f64>,
+            let inv_group_weights: Vec<f64> = group_weights.iter().map(|&w| 1.0 / w).collect();

-    /// Observation weights. `None` for unweighted regression (fast path).
- pub weights: Option, -} - -impl DemeanContext { - /// Create a demeaning context from input arrays. - /// - /// Fixed effects are automatically reordered by size (largest first) for - /// optimal convergence. This matches fixest's default behavior. - /// - /// # Arguments - /// - /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)` - /// * `weights` - Per-observation weights, or `None` for unweighted regression - pub fn new(flist: &ArrayView2, weights: Option<&ArrayView1>) -> Self { - let index = FixedEffectsIndex::new(flist); - - // Always compute group counts (needed for unweighted case) - let mut group_counts = vec![0.0; index.n_coef]; - for q in 0..index.n_fe { - let offset = index.coef_start[q]; - let fe_offset = q * index.n_obs; - for i in 0..index.n_obs { - let g = index.group_ids[fe_offset + i]; - group_counts[offset + g] += 1.0; - } + fe_infos.push(FixedEffectInfo { + n_groups: n_groups[q], + coef_start: coef_starts[q], + group_ids, + inv_group_weights, + }); } - // Avoid division by zero for empty groups - for c in &mut group_counts { - if *c == 0.0 { - *c = 1.0; - } - } - - let weights = weights.map(|w| { - debug_assert_eq!( - w.len(), - flist.nrows(), - "weights length ({}) must match number of observations ({})", - w.len(), - flist.nrows() - ); - ObservationWeights::new(w, &index) - }); Self { - index, - group_counts, - weights, - } - } - - /// Get the weight sums for all groups in fixed effect `fe`. - /// Returns group counts for unweighted, weighted sums for weighted. - #[inline(always)] - pub fn group_weights_for_fe(&self, fe: usize) -> &[f64] { - let range = self.index.coef_range_for_fe(fe); - match &self.weights { - Some(w) => &w.per_group[range], - None => &self.group_counts[range], + dims, + weights: obs_weights, + fe_infos, + fe_order: order, } } @@ -411,43 +270,46 @@ impl DemeanContext { /// Computes weighted sums of `values` for each group in each FE, /// writing the result to `out`. The buffer is zeroed before accumulation. 
/// + /// # Arguments + /// + /// * `values` - Input values in observation space (length: `n_obs`) + /// * `out` - Output buffer in coefficient space (length: `n_coef`) + /// /// # Example /// /// With 4 observations, 2 firms (FE0), 2 years (FE1): /// /// ```text - /// values = [10, 20, 30, 40] (e.g., y values) - /// firm = [ 0, 0, 1, 1] (obs 0,1 → firm 0; obs 2,3 → firm 1) - /// year = [ 0, 1, 0, 1] (obs 0,2 → year 0; obs 1,3 → year 1) + /// values = [10, 20, 30, 40] + /// firm = [ 0, 0, 1, 1] + /// year = [ 0, 1, 0, 1] /// - /// out = [S₀[0], S₀[1], S₁[0], S₁[1]] - /// = [10+20, 30+40, 10+30, 20+40] - /// = [ 30, 70, 40, 60 ] - /// ├─ FE0 ─┤ ├─ FE1 ─┤ + /// out = [10+20, 30+40, 10+30, 20+40] = [30, 70, 40, 60] + /// |-- FE0 --| |-- FE1 --| /// ``` - /// - /// Used to precompute per-group sums of y (coefficient sums S) - /// and per-group sums of weights (group weights W). #[inline] pub fn apply_design_matrix_t(&self, values: &[f64], out: &mut [f64]) { debug_assert_eq!( out.len(), - self.index.n_coef, + self.dims.n_coef, "output buffer length ({}) must match n_coef ({})", out.len(), - self.index.n_coef + self.dims.n_coef ); out.fill(0.0); - for q in 0..self.index.n_fe { - let offset = self.index.coef_start[q]; - let fe_ids = self.index.group_ids_for_fe(q); - if let Some(w) = &self.weights { - for (i, &g) in fe_ids.iter().enumerate() { - out[offset + g] += values[i] * w.per_obs[i]; + + for fe in &self.fe_infos { + let offset = fe.coef_start; + match &self.weights { + None => { + for (i, &g) in fe.group_ids.iter().enumerate() { + out[offset + g] += values[i]; + } } - } else { - for (i, &g) in fe_ids.iter().enumerate() { - out[offset + g] += values[i]; + Some(w) => { + for (i, &g) in fe.group_ids.iter().enumerate() { + out[offset + g] += values[i] * w[i]; + } } } } @@ -456,22 +318,46 @@ impl DemeanContext { /// Apply design matrix and add to output: output += D · coef. /// /// For each observation, looks up its coefficient for each FE and adds to output. 
- /// Computes: `output[i] += Σ_q coef[offset_q + fe_q[i]]` /// - /// Used for: final residuals (r = y - D·coef), periodic SSR convergence checks, - /// and 3+ FE projector scratch computation (every iteration). The 2-FE projector - /// avoids calling this in its inner loop by working entirely in coefficient space. + /// # Arguments + /// + /// * `coef` - Coefficients in coefficient space (length: `n_coef`) + /// * `output` - Output buffer in observation space (length: `n_obs`), accumulated into #[inline] pub fn apply_design_matrix(&self, coef: &[f64], output: &mut [f64]) { - for q in 0..self.index.n_fe { - let offset = self.index.coef_start[q]; - let fe_ids = self.index.group_ids_for_fe(q); - for (i, &g) in fe_ids.iter().enumerate() { + for fe in &self.fe_infos { + let offset = fe.coef_start; + for (i, &g) in fe.group_ids.iter().enumerate() { output[i] += coef[offset + g]; } } } + /// Reorder coefficients from internal order to original FE order. + /// + /// The input `coef` is in internal order (potentially reordered by size). + /// Returns coefficients in the original FE column order from the input flist. + #[must_use] + pub fn reorder_coef_to_original(&self, coef: &[f64]) -> Vec { + let n_fe = self.dims.n_fe; + + // Build inverse mapping: original_fe_index -> internal_fe_index + let mut internal_idx = vec![0usize; n_fe]; + for (q, &orig) in self.fe_order.iter().enumerate() { + internal_idx[orig] = q; + } + + // Reorder coefficients + let mut out = Vec::with_capacity(self.dims.n_coef); + for orig_fe in 0..n_fe { + let q = internal_idx[orig_fe]; + let fe = &self.fe_infos[q]; + let start = fe.coef_start; + let end = start + fe.n_groups; + out.extend_from_slice(&coef[start..end]); + } + out + } } // ============================================================================= @@ -483,7 +369,7 @@ impl DemeanContext { /// These parameters control the convergence behavior of the iterative /// demeaning algorithm. The defaults match R's fixest package. 
#[derive(Clone, Copy)] -pub struct FixestConfig { +pub(crate) struct FixestConfig { /// Convergence tolerance for coefficient changes. pub tol: f64, @@ -508,17 +394,11 @@ impl Default for FixestConfig { /// Default values match R's fixest package for consistency. fn default() -> Self { Self { - // Default tolerance matches fixest's `fixest_options("demean_tol")` tol: 1e-6, - // Generous iteration limit to handle difficult convergence cases maxiter: 100_000, - // Warmup iterations before 2-FE sub-convergence (fixest default) iter_warmup: 15, - // Post-acceleration projection starts after this many iterations iter_proj_after_acc: 40, - // Grand acceleration frequency (every N iterations) iter_grand_acc: 4, - // SSR convergence check frequency ssr_check_interval: 40, } } @@ -529,14 +409,12 @@ impl Default for FixestConfig { // ============================================================================= /// Whether the iterative algorithm has converged. -/// -/// Used throughout the demeaning module to represent the convergence state -/// in a self-documenting way, avoiding ambiguous boolean returns. -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum ConvergenceState { +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub(crate) enum ConvergenceState { /// Algorithm has converged; iteration can stop. Converged, /// Algorithm has not yet converged; continue iterating. + #[default] NotConverged, } @@ -546,20 +424,13 @@ pub enum ConvergenceState { /// Result of a demeaning operation (single column). #[derive(Debug, Clone)] -pub struct DemeanResult { - /// Demeaned data (single column, length `n_obs`). +pub(crate) struct DemeanResult { + /// Demeaned data (length: `n_obs`). pub demeaned: Vec, - /// Fixed effect coefficients in original FE order. - /// - /// The coefficients are laid out as: - /// ```text - /// [FE_0 coefficients | FE_1 coefficients | ... 
| FE_{n_fe-1} coefficients] - /// ``` - /// where FE indices follow the original input order (before internal reordering). - /// - /// For FE `q`, coefficients are at indices `coef_start_original[q]..coef_start_original[q+1]` - /// where `coef_start_original` is the cumulative sum of `n_groups_original`. + /// Fixed effect coefficients in original FE order (length: `n_coef`). + /// Laid out as `[FE0_coefs..., FE1_coefs..., ...]` where FE0, FE1, etc. + /// are in the original input order (not reordered). pub fe_coefficients: Vec, /// Convergence state. @@ -569,3 +440,23 @@ pub struct DemeanResult { #[allow(dead_code)] pub iterations: usize, } + +// ============================================================================= +// DemeanMultiResult +// ============================================================================= + +/// Result of demeaning multiple columns. +/// +/// Returned by the [`demean`](super::demean) function which processes +/// multiple columns in parallel. +pub(crate) struct DemeanMultiResult { + /// Demeaned data with shape `(n_samples, n_features)`. + pub demeaned: Array2, + + /// Fixed effect coefficients with shape `(n_coef, n_features)`. + /// Each column contains the FE coefficients for the corresponding input column. + pub fe_coefficients: Array2, + + /// True if all columns converged successfully. 
+ pub success: bool, +} diff --git a/tests/test_vs_fixest.py b/tests/test_vs_fixest.py index dda1f9558..2585fbbfe 100644 --- a/tests/test_vs_fixest.py +++ b/tests/test_vs_fixest.py @@ -192,7 +192,8 @@ def check_relative_diff(x1, x2, tol, msg=None): SINGLE_F3 = ALL_F3[0] BACKEND_F3 = [ *[("numba", t) for t in ALL_F3], - *[(b, SINGLE_F3) for b in ("jax", "rust", "cupy", "scipy")], + *[("rust", t) for t in ALL_F3], + *[(b, SINGLE_F3) for b in ("jax", "cupy", "scipy")], ] From 369e24a7513fefff0cc8f093ee39303ddd91319c Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Sun, 11 Jan 2026 23:38:18 +0100 Subject: [PATCH 23/24] Simplify SSR computation loops in projection.rs Remove manual 4x loop unrolling from compute_ssr methods in TwoFEProjector and MultiFEProjector. LLVM auto-vectorizes simple loops effectively, making manual unrolling unnecessary complexity. Co-Authored-By: Claude Opus 4.5 --- src/demean/projection.rs | 77 +++------------------------------------- 1 file changed, 5 insertions(+), 72 deletions(-) diff --git a/src/demean/projection.rs b/src/demean/projection.rs index 567a068fa..cf2a2864d 100644 --- a/src/demean/projection.rs +++ b/src/demean/projection.rs @@ -151,17 +151,14 @@ impl Projector for TwoFEProjector<'_> { self.beta_sweeper.sweep(&coef[..self.n0], &mut self.scratch); // Compute SSR: Σ (input[i] - alpha[fe0[i]] - beta[fe1[i]])² - // Use 4x unrolling for better ILP let n_obs = self.n_obs; - let chunks = n_obs / 4; - let mut i = 0usize; let mut ssr = 0.0; // SAFETY: All pointer accesses are valid because: // - i < n_obs throughout (loop bounds ensure this) // - fe0_ptr, fe1_ptr point to arrays of length n_obs (from FixedEffectInfo) // - input_ptr points to array of length n_obs (from caller) - // - group IDs (g0_*, g1_*) are always < n0 or < n1 respectively + // - group IDs (g0, g1) are always < n0 or < n1 respectively // (invariant from DemeanContext construction) // - alpha_ptr points to coef with length >= n0, beta_ptr to scratch with length 
n1 unsafe { @@ -171,46 +168,15 @@ impl Projector for TwoFEProjector<'_> { let fe0_ptr = self.fe0_group_ids_ptr; let fe1_ptr = self.fe1_group_ids_ptr; - for _ in 0..chunks { - let g0_0 = *fe0_ptr.add(i); - let g0_1 = *fe0_ptr.add(i + 1); - let g0_2 = *fe0_ptr.add(i + 2); - let g0_3 = *fe0_ptr.add(i + 3); - - let g1_0 = *fe1_ptr.add(i); - let g1_1 = *fe1_ptr.add(i + 1); - let g1_2 = *fe1_ptr.add(i + 2); - let g1_3 = *fe1_ptr.add(i + 3); - - debug_assert!(g0_0 < self.n0 && g0_1 < self.n0 && g0_2 < self.n0 && g0_3 < self.n0, - "FE0 group ID out of bounds: max({}, {}, {}, {}) >= n0 ({})", - g0_0, g0_1, g0_2, g0_3, self.n0); - debug_assert!(g1_0 < self.n1 && g1_1 < self.n1 && g1_2 < self.n1 && g1_3 < self.n1, - "FE1 group ID out of bounds: max({}, {}, {}, {}) >= n1 ({})", - g1_0, g1_1, g1_2, g1_3, self.n1); - - let resid0 = - *input_ptr.add(i) - *alpha_ptr.add(g0_0) - *beta_ptr.add(g1_0); - let resid1 = - *input_ptr.add(i + 1) - *alpha_ptr.add(g0_1) - *beta_ptr.add(g1_1); - let resid2 = - *input_ptr.add(i + 2) - *alpha_ptr.add(g0_2) - *beta_ptr.add(g1_2); - let resid3 = - *input_ptr.add(i + 3) - *alpha_ptr.add(g0_3) - *beta_ptr.add(g1_3); - - ssr += resid0 * resid0 + resid1 * resid1 + resid2 * resid2 + resid3 * resid3; - i += 4; - } - - // Handle remainder - while i < n_obs { + for i in 0..n_obs { let g0 = *fe0_ptr.add(i); let g1 = *fe1_ptr.add(i); + debug_assert!(g0 < self.n0, "FE0 group ID ({}) >= n0 ({})", g0, self.n0); debug_assert!(g1 < self.n1, "FE1 group ID ({}) >= n1 ({})", g1, self.n1); + let resid = *input_ptr.add(i) - *alpha_ptr.add(g0) - *beta_ptr.add(g1); ssr += resid * resid; - i += 1; } } ssr @@ -295,39 +261,7 @@ impl Projector for MultiFEProjector<'_> { // - coef_start + g < coef.len() because coef_start is the FE's offset and // g < n_groups for that FE (DemeanContext guarantees this layout) unsafe { - // Main loop with 4x unrolling - let chunks = n_obs / 4; - let mut i = 0usize; - - for _ in 0..chunks { - let mut sum0 = 0.0; - let mut sum1 = 0.0; - let 
mut sum2 = 0.0; - let mut sum3 = 0.0; - - for &(group_ids_ptr, coef_start) in &self.fe_ptrs { - let g0 = *group_ids_ptr.add(i); - let g1 = *group_ids_ptr.add(i + 1); - let g2 = *group_ids_ptr.add(i + 2); - let g3 = *group_ids_ptr.add(i + 3); - - sum0 += *coef_ptr.add(coef_start + g0); - sum1 += *coef_ptr.add(coef_start + g1); - sum2 += *coef_ptr.add(coef_start + g2); - sum3 += *coef_ptr.add(coef_start + g3); - } - - let resid0 = *input_ptr.add(i) - sum0; - let resid1 = *input_ptr.add(i + 1) - sum1; - let resid2 = *input_ptr.add(i + 2) - sum2; - let resid3 = *input_ptr.add(i + 3) - sum3; - - ssr += resid0 * resid0 + resid1 * resid1 + resid2 * resid2 + resid3 * resid3; - i += 4; - } - - // Handle remainder - while i < n_obs { + for i in 0..n_obs { let mut sum = 0.0; for &(group_ids_ptr, coef_start) in &self.fe_ptrs { let g = *group_ids_ptr.add(i); @@ -335,7 +269,6 @@ impl Projector for MultiFEProjector<'_> { } let resid = *input_ptr.add(i) - sum; ssr += resid * resid; - i += 1; } } From dc413f874f9b037d4d295f060e7cb5ba312e3302 Mon Sep 17 00:00:00 2001 From: Kristof Schroeder Date: Mon, 12 Jan 2026 11:33:29 +0100 Subject: [PATCH 24/24] Add configurable FE reordering via reorder_fe parameter Previously, fixed effects were always reordered by size (largest first) during demeaning. This adds a `reorder_fe` boolean parameter that allows users to control this behavior. Default is `false` (no reordering). 
Co-Authored-By: Claude Opus 4.5 --- pyfixest/core/_core_impl.pyi | 1 + pyfixest/core/demean.py | 5 ++++ src/demean/accelerator.rs | 2 +- src/demean/mod.rs | 48 +++++++++++++++++++++++------------- src/demean/types.rs | 28 ++++++++++++++------- 5 files changed, 57 insertions(+), 27 deletions(-) diff --git a/pyfixest/core/_core_impl.pyi b/pyfixest/core/_core_impl.pyi index fe42826ef..6e4d7be27 100644 --- a/pyfixest/core/_core_impl.pyi +++ b/pyfixest/core/_core_impl.pyi @@ -22,6 +22,7 @@ def _demean_rs( weights: NDArray[np.float64] | None = None, tol: float = 1e-08, maxiter: int = 100_000, + reorder_fe: bool = False, ) -> DemeanResult: ... def _count_fixef_fully_nested_all_rs( all_fixef_array: NDArray, diff --git a/pyfixest/core/demean.py b/pyfixest/core/demean.py index 19cfa2998..566c0ff6a 100644 --- a/pyfixest/core/demean.py +++ b/pyfixest/core/demean.py @@ -10,6 +10,7 @@ def demean( weights: NDArray[np.float64], tol: float = 1e-08, maxiter: int = 100_000, + reorder_fe: bool = False, ) -> tuple[NDArray, bool]: """ Demean an array. @@ -30,6 +31,9 @@ def demean( Tolerance criterion for convergence. Defaults to 1e-08. maxiter : int, optional Maximum number of iterations. Defaults to 100_000. + reorder_fe : bool, optional + Whether to reorder fixed effects by size (largest first) before demeaning. + This can improve convergence for some datasets. Defaults to False. 
Returns ------- @@ -80,5 +84,6 @@ def demean( None if is_uniform else weights_f64, tol, maxiter, + reorder_fe, ) return result["demeaned"], result["success"] diff --git a/src/demean/accelerator.rs b/src/demean/accelerator.rs index d535a357f..89308aa07 100644 --- a/src/demean/accelerator.rs +++ b/src/demean/accelerator.rs @@ -421,7 +421,7 @@ mod tests { flist[[i, 0]] = i % 10; flist[[i, 1]] = i % 5; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); (ctx, input) } diff --git a/src/demean/mod.rs b/src/demean/mod.rs index 34f255bf7..226cbfdba 100644 --- a/src/demean/mod.rs +++ b/src/demean/mod.rs @@ -88,6 +88,7 @@ impl<'a> ThreadLocalDemeaner<'a> { /// * `weights` - Per-observation weights, or None for unweighted /// * `tol` - Convergence tolerance /// * `maxiter` - Maximum iterations +/// * `reorder_fe` - Whether to reorder FEs by size (largest first) /// /// # Returns /// @@ -98,20 +99,22 @@ pub(crate) fn demean( weights: Option<&ArrayView1>, tol: f64, maxiter: usize, + reorder_fe: bool, ) -> DemeanMultiResult { let (n_samples, n_features) = x.dim(); let config = FixestConfig { tol, maxiter, + reorder_fe, ..FixestConfig::default() }; let not_converged = Arc::new(AtomicUsize::new(0)); let mut demeaned = Array2::::zeros((n_samples, n_features)); - // Create context (FEs are always reordered by size, matching fixest) - let ctx = DemeanContext::new(flist, weights); + // Create context with optional FE reordering + let ctx = DemeanContext::new(flist, weights, config.reorder_fe); let n_coef = ctx.dims.n_coef; let mut fe_coefficients = Array2::::zeros((n_coef, n_features)); @@ -175,6 +178,7 @@ pub(crate) fn demean( /// * `weights` - Per-observation weights, or None for unweighted (fast path) /// * `tol` - Convergence tolerance (default: 1e-8) /// * `maxiter` - Maximum iterations (default: 100_000) +/// * `reorder_fe` - Whether to reorder FEs by 
size (default: false) /// /// # Returns /// @@ -183,7 +187,7 @@ pub(crate) fn demean( /// - "fe_coefficients": Array of FE coefficients (n_coef, n_features) /// - "success": Boolean indicating convergence #[pyfunction] -#[pyo3(signature = (x, flist, weights=None, tol=1e-8, maxiter=100_000))] +#[pyo3(signature = (x, flist, weights=None, tol=1e-8, maxiter=100_000, reorder_fe=false))] pub fn _demean_rs<'py>( py: Python<'py>, x: PyReadonlyArray2, @@ -191,12 +195,22 @@ pub fn _demean_rs<'py>( weights: Option>, tol: f64, maxiter: usize, + reorder_fe: bool, ) -> PyResult> { let x_arr = x.as_array(); let flist_arr = flist.as_array(); let weights_arr = weights.as_ref().map(|w| w.as_array()); - let result = py.detach(|| demean(&x_arr, &flist_arr, weights_arr.as_ref(), tol, maxiter)); + let result = py.detach(|| { + demean( + &x_arr, + &flist_arr, + weights_arr.as_ref(), + tol, + maxiter, + reorder_fe, + ) + }); let dict = PyDict::new(py); dict.set_item("demeaned", PyArray2::from_owned_array(py, result.demeaned))?; @@ -226,7 +240,7 @@ mod tests { } // Unweighted case - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -255,7 +269,7 @@ mod tests { } // Unweighted case - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -298,7 +312,7 @@ mod tests { flist[[i, 0]] = i % n_groups; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let mut demeaner = SingleFEDemeaner::new(&ctx); @@ -346,7 +360,7 @@ mod tests { // Non-uniform weights: 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, ... 
let weights: ndarray::Array1 = (0..n_obs).map(|i| 1.0 + (i % 3) as f64).collect(); let ctx = - DemeanContext::new(&flist.view(), Some(&weights.view())); + DemeanContext::new(&flist.view(), Some(&weights.view()), false); assert!( ctx.weights.is_some(), @@ -380,7 +394,7 @@ mod tests { } // Test with no weights (None) - unweighted case - let ctx_unweighted = DemeanContext::new(&flist.view(), None); + let ctx_unweighted = DemeanContext::new(&flist.view(), None, false); assert!( ctx_unweighted.weights.is_none(), "No weights should result in weights=None" @@ -389,7 +403,7 @@ mod tests { // Test with weights (Some) - weighted case let weights: ndarray::Array1 = (0..n_obs).map(|i| 1.0 + (i % 2) as f64).collect(); let ctx_weighted = - DemeanContext::new(&flist.view(), Some(&weights.view())); + DemeanContext::new(&flist.view(), Some(&weights.view()), false); assert!( ctx_weighted.weights.is_some(), "Provided weights should result in weights=Some" @@ -408,7 +422,7 @@ mod tests { flist[[i, 1]] = i % 5; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let config = FixestConfig::default(); // Create a single demeaner and use it multiple times @@ -448,7 +462,7 @@ mod tests { fn test_single_observation() { // Edge case: only 1 observation let flist = Array2::::zeros((1, 2)); - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input = vec![42.0]; let config = FixestConfig::default(); @@ -469,7 +483,7 @@ mod tests { let n_obs = 50; let flist = Array2::::zeros((n_obs, 2)); // All zeros = single group each - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -498,7 +512,7 @@ mod tests { flist[[i, 1]] = i % 5; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = 
DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -525,7 +539,7 @@ mod tests { .collect(); let ctx = - DemeanContext::new(&flist.view(), Some(&weights.view())); + DemeanContext::new(&flist.view(), Some(&weights.view()), false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig::default(); @@ -560,7 +574,7 @@ mod tests { flist[[i, 1]] = i % 5; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); // Use maxiter=1 - algorithm may or may not converge depending on data @@ -592,7 +606,7 @@ mod tests { flist[[i, 1]] = i % 3; } - let ctx = DemeanContext::new(&flist.view(), None); + let ctx = DemeanContext::new(&flist.view(), None, false); let input: Vec = (0..n_obs).map(|i| (i as f64) * 0.1).collect(); let config = FixestConfig { diff --git a/src/demean/types.rs b/src/demean/types.rs index e1ba04aff..62f3a9eb4 100644 --- a/src/demean/types.rs +++ b/src/demean/types.rs @@ -111,8 +111,9 @@ pub(crate) struct FixedEffectInfo { /// /// # FE Ordering /// -/// Fixed effects are always reordered by size (largest first) to match fixest's -/// behavior and ensure optimal convergence properties. +/// Fixed effects can optionally be reordered by size (largest first) via the +/// `reorder_fe` parameter. When enabled, this matches fixest's behavior and +/// can improve convergence for some datasets. /// /// # Uniform Weights Fast Path /// @@ -139,15 +140,14 @@ pub struct DemeanContext { impl DemeanContext { /// Create a demeaning context from input arrays. /// - /// Fixed effects are automatically reordered by size (largest first) to - /// match fixest's behavior and ensure optimal convergence. - /// /// # Arguments /// /// * `flist` - Fixed effect group IDs with shape `(n_obs, n_fe)`. 
/// Each row is one observation, each column is one fixed effect. /// Values must be 0-indexed group IDs. /// * `weights` - Per-observation weights (length: `n_obs`), or None for unweighted. + /// * `reorder_fe` - If true, reorder FEs by size (largest first) before demeaning. + /// This can improve convergence for some datasets. /// /// # Panics /// @@ -160,7 +160,11 @@ impl DemeanContext { /// Groups with no observations (e.g., sparse group IDs) are handled by setting /// their weight to 1, matching fixest's approach. Since no observation belongs /// to these groups, their coefficients are never used in computations. - pub fn new(flist: &ArrayView2, weights: Option<&ArrayView1>) -> Self { + pub fn new( + flist: &ArrayView2, + weights: Option<&ArrayView1>, + reorder_fe: bool, + ) -> Self { let (n_obs, n_fe) = flist.dim(); assert!(n_obs > 0, "Cannot create DemeanContext with 0 observations"); @@ -188,13 +192,13 @@ impl DemeanContext { }) .collect(); - // Always reorder FEs by size (largest first) - matches fixest behavior - let order: Vec = if n_fe > 1 { + // Optionally reorder FEs by size (largest first) + let order: Vec = if reorder_fe && n_fe > 1 { let mut indices: Vec = (0..n_fe).collect(); indices.sort_by_key(|&i| std::cmp::Reverse(n_groups_original[i])); indices } else { - vec![0] + (0..n_fe).collect() }; // Compute dimensions @@ -388,6 +392,11 @@ pub(crate) struct FixestConfig { /// Iterations between SSR-based convergence checks. pub ssr_check_interval: usize, + + /// Whether to reorder fixed effects by size (largest first) before demeaning. + /// When true, FEs are processed in order of decreasing group count, which + /// can improve convergence for some datasets. Default is false. + pub reorder_fe: bool, } impl Default for FixestConfig { @@ -400,6 +409,7 @@ impl Default for FixestConfig { iter_proj_after_acc: 40, iter_grand_acc: 4, ssr_check_interval: 40, + reorder_fe: false, } } }