Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 58 additions & 25 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,36 +138,69 @@ fn compile_cuda_kernels() {
};

let out_dir = std::env::var("OUT_DIR").unwrap();
let ptx_path = format!("{out_dir}/decode_kernels.ptx");

if is_output_fresh(&[cu_src], &[&ptx_path]) {
// Compile sm_80 PTX (works on Ampere SM86/3090, Ada SM89, Hopper, Blackwell via JIT)
let ptx_sm80 = format!("{out_dir}/decode_kernels_sm80.ptx");

if is_output_fresh(&[cu_src], &[&ptx_sm80]) {
println!("cargo:rustc-cfg=has_decode_kernels");
println!("cargo:warning=Reusing cached GPU decode kernels at {ptx_path}");
return;
println!("cargo:warning=Reusing cached GPU decode kernels at {ptx_sm80}");
} else {
let status = std::process::Command::new(&nvcc)
.args([
"-ptx",
"-arch=sm_80",
"-O3",
"--use_fast_math",
"-o", &ptx_sm80,
cu_src,
])
.status();

match status {
Ok(s) if s.success() => {
println!("cargo:rustc-cfg=has_decode_kernels");
println!("cargo:warning=Compiled GPU decode kernels to sm_80 PTX ({ptx_sm80})");
}
Ok(s) => {
println!("cargo:warning=nvcc failed with status {s} — GPU decode kernels disabled");
return;
}
Err(e) => {
println!("cargo:warning=nvcc execution error: {e} — GPU decode kernels disabled");
return;
}
}
}

// Compile .cu to .ptx targeting sm_80 (works on Ampere, Ada, Hopper)
let status = std::process::Command::new(&nvcc)
.args([
"-ptx",
"-arch=sm_80",
"-O3",
"--use_fast_math",
"-o", &ptx_path,
cu_src,
])
.status();
// Compile sm_120 PTX (native Blackwell — better warp scheduling on RTX 50x0)
let ptx_sm120 = format!("{out_dir}/decode_kernels_sm120.ptx");
if is_output_fresh(&[cu_src], &[&ptx_sm120]) {
println!("cargo:rustc-cfg=has_decode_kernels_sm120");
println!("cargo:warning=Reusing cached sm_120 GPU decode kernels at {ptx_sm120}");
} else {
let status_120 = std::process::Command::new(&nvcc)
.args([
"-ptx",
"-arch=sm_120",
"-O3",
"--use_fast_math",
"-o", &ptx_sm120,
cu_src,
])
.status();

match status {
Ok(s) if s.success() => {
println!("cargo:rustc-cfg=has_decode_kernels");
println!("cargo:warning=Compiled GPU decode kernels to PTX ({ptx_path})");
}
Ok(s) => {
println!("cargo:warning=nvcc failed with status {s} — GPU decode kernels disabled");
}
Err(e) => {
println!("cargo:warning=nvcc execution error: {e} — GPU decode kernels disabled");
match status_120 {
Ok(s) if s.success() => {
println!("cargo:rustc-cfg=has_decode_kernels_sm120");
println!("cargo:warning=Compiled GPU decode kernels to sm_120 PTX ({ptx_sm120})");
}
Ok(s) => {
println!("cargo:warning=nvcc sm_120 failed with status {s} — sm_80 PTX will be used via JIT");
}
Err(e) => {
println!("cargo:warning=nvcc sm_120 execution error: {e} — sm_80 PTX will be used via JIT");
}
}
}
}
Expand Down
27 changes: 20 additions & 7 deletions python/krasis/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1783,30 +1783,36 @@ def _init_gpu_prefill(self):
len(self.gpu_prefill_managers), self.gpu_prefill_threshold,
)

def _start_ram_watchdog(self, floor_pct: float = 5.0):
def _start_ram_watchdog(self, floor_pct: float = 0.5):
"""Start daemon thread that monitors system RAM and exits if too low.

Checks /proc/meminfo every second. If MemAvailable drops below
floor_pct% of MemTotal, logs an error and calls os._exit() to
Checks /proc/meminfo every second. If (MemAvailable + SwapFree) drops
below floor_pct% of MemTotal, logs an error and calls os._exit() to
prevent a full system OOM that kills desktop processes.

Swap is counted as available headroom because NVMe swap is fast enough
to absorb transient spikes (e.g. during WriteCombined expert migration).

Args:
floor_pct: Minimum % free RAM before forced exit (default 5%)
floor_pct: Minimum % free RAM+swap before forced exit (default 0.5%).
Override via KRASIS_RAM_FLOOR_PERCENT env var.
"""
floor_pct = float(os.environ.get("KRASIS_RAM_FLOOR_PERCENT", str(floor_pct)))

def _watchdog():
while True:
time.sleep(1.0)
meminfo = _read_meminfo()
if not meminfo:
continue
total_kb = meminfo.get("MemTotal", 0)
avail_kb = meminfo.get("MemAvailable", 0)
avail_kb = meminfo.get("MemAvailable", 0) + meminfo.get("SwapFree", 0)
if total_kb == 0:
continue
pct_free = 100.0 * avail_kb / total_kb
if pct_free < floor_pct:
logger.error(
"RAM WATCHDOG: %.1f%% free (%.1f GB available / %.1f GB total) "
"RAM WATCHDOG: %.1f%% free (%.1f GB available+swap / %.1f GB total) "
"— below %.1f%% floor. Exiting to prevent system OOM!",
pct_free, avail_kb / 1024 / 1024,
total_kb / 1024 / 1024, floor_pct,
Expand All @@ -1815,7 +1821,7 @@ def _watchdog():

t = threading.Thread(target=_watchdog, daemon=True, name="ram-watchdog")
t.start()
logger.info("RAM watchdog started: will exit if < %.1f%% free", floor_pct)
logger.info("RAM watchdog started: will exit if < %.1f%% free (RAM+swap)", floor_pct)

# ── Multi-GPU calibration: replicate weights + measure inference cost ──

Expand Down Expand Up @@ -4518,6 +4524,13 @@ def _register_attn_weight(w: torch.Tensor, layer_idx: int = -1,
if self.krasis_engine is not None:
store.setup_from_engine(self.krasis_engine)

# WriteCombined DMA staging: migrate expert weights from heap to WC memory.
# Must happen after setup_from_engine (pointers exist) but before
# allocate_prefill_engine (which reads the updated pointers).
if getattr(self, 'wc_alloc', False):
wc_msg = store.setup_wc_expert_memory(self.krasis_engine)
logger.info("WC expert memory: %s", wc_msg)

# Register Nemotron MoE config (relu2, ungated, latent projections) — must come
# after setup_from_engine which populates moe_layers[abs_layer_idx].
if self.cfg.model_type == "nemotron_h":
Expand Down
10 changes: 10 additions & 0 deletions python/krasis/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,11 @@ def _force_exit_handler(sig, frame):
help="Enable Session messenger bridge (default: off)")
parser.add_argument("--test-endpoints", action="store_true", default=False,
help="Enable test-only endpoints (/v1/internal/prefill_logits)")
parser.add_argument("--wc-alloc", action="store_true", default=False,
help="Use WriteCombined host memory for expert DMA staging. "
"Bypasses CPU cache for ~64%% higher PCIe bandwidth "
"(~46 GB/s vs ~28 GB/s on Gen5). Requires NVMe swap on "
"RAM-constrained systems.")
# Apply config file defaults, then parse CLI (CLI wins over config file)
if config_defaults:
parser.set_defaults(**config_defaults)
Expand Down Expand Up @@ -912,6 +917,11 @@ def fileno(self):
# ── Set decode mode (GPU only) ──
_model.decode_mode = "gpu"

# ── WriteCombined DMA staging ──
if getattr(args, 'wc_alloc', False):
_model.wc_alloc = True
_detail("WriteCombined expert DMA staging enabled (--wc-alloc)")

# ── GPU decode store setup (before warmup so decode warmup can use it) ──
_status("Setting up GPU decode store")
gpu_store = _model.setup_gpu_decode_store()
Expand Down
Loading