brontoguana · 3spky5u-oss · Apr 8, 2026
diff --git a/build.rs b/build.rs
@@ -138,36 +138,69 @@ fn compile_cuda_kernels() {
     };
 
     let out_dir = std::env::var("OUT_DIR").unwrap();
-    let ptx_path = format!("{out_dir}/decode_kernels.ptx");
 
-    if is_output_fresh(&[cu_src], &[&ptx_path]) {
+    // Compile sm_80 PTX (works on Ampere SM86/3090, Ada SM89, Hopper, Blackwell via JIT)
+    let ptx_sm80 = format!("{out_dir}/decode_kernels_sm80.ptx");
+
+    if is_output_fresh(&[cu_src], &[&ptx_sm80]) {
         println!("cargo:rustc-cfg=has_decode_kernels");
-        println!("cargo:warning=Reusing cached GPU decode kernels at {ptx_path}");
-        return;
+        println!("cargo:warning=Reusing cached GPU decode kernels at {ptx_sm80}");
+    } else {
+        let status = std::process::Command::new(&nvcc)
+            .args([
+                "-ptx",
+                "-arch=sm_80",
+                "-O3",
+                "--use_fast_math",
+                "-o", &ptx_sm80,
+                cu_src,
+            ])
+            .status();
+
+        match status {
+            Ok(s) if s.success() => {
+                println!("cargo:rustc-cfg=has_decode_kernels");
+                println!("cargo:warning=Compiled GPU decode kernels to sm_80 PTX ({ptx_sm80})");
+            }
+            Ok(s) => {
+                println!("cargo:warning=nvcc failed with status {s} — GPU decode kernels disabled");
+                return;
+            }
+            Err(e) => {
+                println!("cargo:warning=nvcc execution error: {e} — GPU decode kernels disabled");
+                return;
+            }
+        }
     }
 
-    // Compile .cu to .ptx targeting sm_80 (works on Ampere, Ada, Hopper)
-    let status = std::process::Command::new(&nvcc)
-        .args([
-            "-ptx",
-            "-arch=sm_80",
-            "-O3",
-            "--use_fast_math",
-            "-o", &ptx_path,
-            cu_src,
-        ])
-        .status();
+    // Compile sm_120 PTX (native Blackwell — better warp scheduling on RTX 50x0)
+    let ptx_sm120 = format!("{out_dir}/decode_kernels_sm120.ptx");
+    if is_output_fresh(&[cu_src], &[&ptx_sm120]) {
+        println!("cargo:rustc-cfg=has_decode_kernels_sm120");
+        println!("cargo:warning=Reusing cached sm_120 GPU decode kernels at {ptx_sm120}");
+    } else {
+        let status_120 = std::process::Command::new(&nvcc)
+            .args([
+                "-ptx",
+                "-arch=sm_120",
+                "-O3",
+                "--use_fast_math",
+                "-o", &ptx_sm120,
+                cu_src,
+            ])
+            .status();
 
-    match status {
-        Ok(s) if s.success() => {
-            println!("cargo:rustc-cfg=has_decode_kernels");
-            println!("cargo:warning=Compiled GPU decode kernels to PTX ({ptx_path})");
-        }
-        Ok(s) => {
-            println!("cargo:warning=nvcc failed with status {s} — GPU decode kernels disabled");
-        }
-        Err(e) => {
-            println!("cargo:warning=nvcc execution error: {e} — GPU decode kernels disabled");
+        match status_120 {
+            Ok(s) if s.success() => {
+                println!("cargo:rustc-cfg=has_decode_kernels_sm120");
+                println!("cargo:warning=Compiled GPU decode kernels to sm_120 PTX ({ptx_sm120})");
+            }
+            Ok(s) => {
+                println!("cargo:warning=nvcc sm_120 failed with status {s} — sm_80 PTX will be used via JIT");
+            }
+            Err(e) => {
+                println!("cargo:warning=nvcc sm_120 execution error: {e} — sm_80 PTX will be used via JIT");
+            }
         }
     }
 }

diff --git a/python/krasis/model.py b/python/krasis/model.py
@@ -1783,30 +1783,36 @@ def _init_gpu_prefill(self):
             len(self.gpu_prefill_managers), self.gpu_prefill_threshold,
         )
 
-    def _start_ram_watchdog(self, floor_pct: float = 5.0):
+    def _start_ram_watchdog(self, floor_pct: float = 0.5):
         """Start daemon thread that monitors system RAM and exits if too low.
 
-        Checks /proc/meminfo every second. If MemAvailable drops below
-        floor_pct% of MemTotal, logs an error and calls os._exit() to
+        Checks /proc/meminfo every second. If (MemAvailable + SwapFree) drops
+        below floor_pct% of MemTotal, logs an error and calls os._exit() to
         prevent a full system OOM that kills desktop processes.
 
+        Swap is counted as available headroom because NVMe swap is fast enough
+        to absorb transient spikes (e.g. during WriteCombined expert migration).
+
         Args:
-            floor_pct: Minimum % free RAM before forced exit (default 5%)
+            floor_pct: Minimum % free RAM+swap before forced exit (default 0.5%).
+                       Override via KRASIS_RAM_FLOOR_PERCENT env var.
         """
+        floor_pct = float(os.environ.get("KRASIS_RAM_FLOOR_PERCENT", str(floor_pct)))
+
         def _watchdog():
             while True:
                 time.sleep(1.0)
                 meminfo = _read_meminfo()
                 if not meminfo:
                     continue
                 total_kb = meminfo.get("MemTotal", 0)
-                avail_kb = meminfo.get("MemAvailable", 0)
+                avail_kb = meminfo.get("MemAvailable", 0) + meminfo.get("SwapFree", 0)
                 if total_kb == 0:
                     continue
                 pct_free = 100.0 * avail_kb / total_kb
                 if pct_free < floor_pct:
                     logger.error(
-                        "RAM WATCHDOG: %.1f%% free (%.1f GB available / %.1f GB total) "
+                        "RAM WATCHDOG: %.1f%% free (%.1f GB available+swap / %.1f GB total) "
                         "— below %.1f%% floor. Exiting to prevent system OOM!",
                         pct_free, avail_kb / 1024 / 1024,
                         total_kb / 1024 / 1024, floor_pct,
@@ -1815,7 +1821,7 @@ def _watchdog():
 
         t = threading.Thread(target=_watchdog, daemon=True, name="ram-watchdog")
         t.start()
-        logger.info("RAM watchdog started: will exit if < %.1f%% free", floor_pct)
+        logger.info("RAM watchdog started: will exit if < %.1f%% free (RAM+swap)", floor_pct)
 
     # ── Multi-GPU calibration: replicate weights + measure inference cost ──
 
@@ -4518,6 +4524,13 @@ def _register_attn_weight(w: torch.Tensor, layer_idx: int = -1,
         if self.krasis_engine is not None:
             store.setup_from_engine(self.krasis_engine)
 
+            # WriteCombined DMA staging: migrate expert weights from heap to WC memory.
+            # Must happen after setup_from_engine (pointers exist) but before
+            # allocate_prefill_engine (which reads the updated pointers).
+            if getattr(self, 'wc_alloc', False):
+                wc_msg = store.setup_wc_expert_memory(self.krasis_engine)
+                logger.info("WC expert memory: %s", wc_msg)
+
         # Register Nemotron MoE config (relu2, ungated, latent projections) — must come
         # after setup_from_engine which populates moe_layers[abs_layer_idx].
         if self.cfg.model_type == "nemotron_h":

diff --git a/python/krasis/server.py b/python/krasis/server.py
@@ -710,6 +710,11 @@ def _force_exit_handler(sig, frame):
                         help="Enable Session messenger bridge (default: off)")
     parser.add_argument("--test-endpoints", action="store_true", default=False,
                         help="Enable test-only endpoints (/v1/internal/prefill_logits)")
+    parser.add_argument("--wc-alloc", action="store_true", default=False,
+                        help="Use WriteCombined host memory for expert DMA staging. "
+                             "Bypasses CPU cache for ~64%% higher PCIe bandwidth "
+                             "(~46 GB/s vs ~28 GB/s on Gen5). Requires NVMe swap on "
+                             "RAM-constrained systems.")
     # Apply config file defaults, then parse CLI (CLI wins over config file)
     if config_defaults:
         parser.set_defaults(**config_defaults)
@@ -912,6 +917,11 @@ def fileno(self):
     # ── Set decode mode (GPU only) ──
     _model.decode_mode = "gpu"
 
+    # ── WriteCombined DMA staging ──
+    if getattr(args, 'wc_alloc', False):
+        _model.wc_alloc = True
+        _detail("WriteCombined expert DMA staging enabled (--wc-alloc)")
+
     # ── GPU decode store setup (before warmup so decode warmup can use it) ──
     _status("Setting up GPU decode store")
     gpu_store = _model.setup_gpu_decode_store()