Merged
63 changes: 33 additions & 30 deletions QUICK_FIXES.md
@@ -76,47 +76,50 @@ tokio::fs::write(&global_memory_path, content).await?;

---

### 3. Fix FIM Instruction Injection
### 3. ✅ FIXED: FIM Instruction Injection

**File:** `src/tools/fim.rs` ~line 94
**File:** `src/tools/fim.rs` lines 15-77 (sanitize function), line 143 (usage)

**Current (BAD):**
```rust
let prompt = format!(
"<|fim_prefix|>{}
// Instruction: {}
<|fim_suffix|>{}
<|fim_middle|>",
prefix, instruction, suffix
);
```
**Status:** FIXED ✓ - Comprehensive sanitization implemented

**Fix:**
**Implementation:**
```rust
// Sanitize instruction to prevent prompt injection
fn sanitize_fim_instruction(s: &str) -> String {
s.replace("<|", "")
.replace("|>", "")
.replace("// Instruction:", "")
.trim()
.to_string()
/// Sanitize a user-provided FIM instruction to prevent prompt injection.
fn sanitize_fim_instruction(raw: &str) -> String {
// Step 1: Remove FIM / special model tokens
let fim_tokens: &[&str] = &[
"<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>",
"<|endoftext|>", "<|file_separator|>",
"<|im_start|>", "<|im_end|>", "<|pad|>",
];
// Case-insensitive regex replacement...

// Step 2: Remove injection patterns
let injection_patterns: &[&str] = &[
r"(?i)ignore\s+(all\s+)?previous",
r"(?i)disregard\s+(all\s+)?(previous\s+)?instructions?",
r"(?i)system\s*:",
r"(?i)new\s+instructions?\s*:",
];

// Step 3: Truncate to max length
// Safely handles multi-byte characters
}
```

let sanitized = sanitize_fim_instruction(&args.instruction);
let prompt = format!(
"<|fim_prefix|>{}
<|fim_suffix|>{}
<|fim_middle|>",
prefix, suffix
);
// Pass instruction via separate field if API supports it
**Used in execute():**
```rust
let instruction = sanitize_fim_instruction(raw_instruction);
```

**Tests:** 10+ comprehensive tests covering FIM tokens, injection patterns, truncation, and edge cases - all passing.
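
For reference, a standalone sketch of the sanitizer's three steps. Function bodies, the 500-char cap, and the phrase list are assumptions for illustration; the real implementation uses case-insensitive regexes, approximated here with ASCII-case-insensitive substring removal to stay dependency-free:

```rust
/// Remove every ASCII-case-insensitive occurrence of `needle` from `s`.
fn remove_ascii_ci(s: &str, needle: &str) -> String {
    let n = needle.len();
    let mut out = String::with_capacity(s.len());
    let mut i = 0;
    while i < s.len() {
        if i + n <= s.len()
            && s.is_char_boundary(i + n)
            && s[i..i + n].eq_ignore_ascii_case(needle)
        {
            i += n; // skip the matched token
        } else {
            let ch = s[i..].chars().next().unwrap();
            out.push(ch);
            i += ch.len_utf8();
        }
    }
    out
}

fn sanitize_fim_instruction(raw: &str) -> String {
    const FIM_TOKENS: &[&str] = &[
        "<|fim_prefix|>", "<|fim_suffix|>", "<|fim_middle|>",
        "<|endoftext|>", "<|file_separator|>",
        "<|im_start|>", "<|im_end|>", "<|pad|>",
    ];
    // Simplified stand-ins for the regex injection patterns.
    const INJECTION_PHRASES: &[&str] = &[
        "ignore previous", "ignore all previous",
        "disregard previous instructions", "system:", "new instructions:",
    ];
    const MAX_CHARS: usize = 500; // assumed cap

    // Step 1: strip FIM / special model tokens.
    let mut cleaned = raw.to_string();
    for tok in FIM_TOKENS {
        cleaned = remove_ascii_ci(&cleaned, tok);
    }
    // Step 2: strip injection phrases.
    for phrase in INJECTION_PHRASES {
        cleaned = remove_ascii_ci(&cleaned, phrase);
    }
    // Step 3: truncate on char boundaries (multi-byte safe), then trim.
    cleaned
        .chars()
        .take(MAX_CHARS)
        .collect::<String>()
        .trim()
        .to_string()
}

fn main() {
    assert_eq!(
        sanitize_fim_instruction("<|FIM_PREFIX|>add a docstring"),
        "add a docstring"
    );
    println!("ok");
}
```

Truncating with `chars().take(n)` rather than byte slicing is what makes the length cap safe for multi-byte characters.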

---


## 🟡 HIGH: Fix These Next

### 4. Add Validation for Critical Config Fields
### 4. Add Validation for Critical Config Fields

**File:** `src/config/mod.rs` in `validate()` method

@@ -269,7 +272,7 @@ binary_size_mb: measure_binary_size().await?,
- [ ] Fix blocking I/O in `execution.rs`
- [ ] Fix blocking I/O in `checkpointing.rs`
- [ ] Fix test mode bypass in `file.rs`
- [ ] Fix FIM injection in `fim.rs`
- [x] Fix FIM injection in `fim.rs` (COMPLETED - see `sanitize_fim_instruction` function)
- [ ] Add config validation
- [ ] Add API spawning limits
- [ ] Fix token cache contention
48 changes: 29 additions & 19 deletions README.md
@@ -350,32 +350,42 @@ python -m sglang.launch_server \
--host 0.0.0.0
```

**Dual RTX 4090 — Qwen3-VL-30B-A3B (vision + tool calling):**
**Dual RTX 4090 — Qwen3.5-27B-FP8 (hybrid Gated Attention + Gated DeltaNet, vLLM):**

```bash
python -m sglang.launch_server \
--model-path Qwen/Qwen3-VL-30B-A3B-Thinking-FP8 \
--trust-remote-code \
# Qwen3.5-27B-FP8 on 2x RTX 4090 (46 GB total)
# Hybrid Gated Attention + Gated DeltaNet architecture, 131K context, ~24 tok/s
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export NCCL_P2P_DISABLE=1 # WSL2 workaround
export NCCL_IB_DISABLE=1
export NCCL_SHM_DISABLE=0

vllm serve Qwen/Qwen3.5-27B-FP8 \
--tensor-parallel-size 2 \
--enable-multimodal \
--context-length 131072 \
--attention-backend flashinfer \
--mem-fraction-static 0.85 \
--max-running-requests 32 \
--chunked-prefill-size 8192 \
--max-prefill-tokens 65536 \
--kv-cache-dtype fp8_e5m2 \
--disable-custom-all-reduce \
--cuda-graph-max-bs 32 \
--kv-cache-dtype fp8 \
--gpu-memory-utilization 0.90 \
--max-model-len 131072 \
--max-num-seqs 6 \
--mamba-cache-dtype float16 \
--enable-prefix-caching \
--reasoning-parser qwen3 \
--tool-call-parser qwen \
--port 8000 \
--host 0.0.0.0
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--served-model-name qwen3.5-27b \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000
```

> **Tip:** When using SGLang with tool calling, set `native_function_calling = true` in your `selfware.toml`. This uses OpenAI-compatible function calling instead of XML parsing, which is more reliable with small models.
> **Notes:**
> - FP8 weights (~27 GB) + fp8 KV cache gives ~1.94x concurrency at 131K context
> - `--mamba-cache-dtype float16` reduces SSM state memory for the hybrid DeltaNet layers
> - For 262K context, set `--max-model-len 262144` (concurrency drops to ~1x)
> - Decode throughput: ~24 tok/s per request on 2x RTX 4090

> **Tip:** When using vLLM with tool calling, set `native_function_calling = true` in your `selfware.toml`. Selfware supports both `reasoning_content` (SGLang/llama.cpp) and `reasoning` (vLLM) response fields.

### vLLM
### vLLM (single GPU)

```bash
vllm serve Qwen/Qwen3.5-4B --port 8000 --tensor-parallel-size 1 \
86 changes: 86 additions & 0 deletions sab_results/scorecards/qwen3_5_27b_fp8.md
@@ -0,0 +1,86 @@
# SAB Scorecard: Qwen3.5-27B-FP8

## Summary

| Metric | Value |
|--------|-------|
| Date | 2026-03-15 |
| Model | Qwen/Qwen3.5-27B-FP8 |
| Backend | vLLM (v0.17.1rc1) |
| Hardware | 2x NVIDIA RTX 4090 (46 GB total) |
| Context | 262,144 tokens |
| KV Cache | FP8 |
| Mamba Cache | float16 |
| Concurrency | ~1x at 262K |
| Decode Speed | ~24 tok/s |
| Overall Score (raw) | 93/100 |
| Overall Score (weighted) | **95/100** |
| Overall Rating | **BLOOM** |
| Duration | 55m 49s (2 parallel) |

## Rating Distribution

| Rating | Count |
|--------|-------|
| BLOOM | 17 |
| GROW | 2 |
| WILT | 0 |
| FROST | 1 |

## Detailed Results

| Scenario | Difficulty | Score | Rating | Duration |
|----------|-----------|-------|--------|----------|
| `easy_calculator` | easy | 100/100 | BLOOM | 68s |
| `easy_string_ops` | easy | 100/100 | BLOOM | 116s |
| `medium_json_merge` | medium | 100/100 | BLOOM | 58s |
| `medium_bitset` | medium | 100/100 | BLOOM | 156s |
| `hard_scheduler` | hard | 100/100 | BLOOM | 78s |
| `hard_event_bus` | hard | 100/100 | BLOOM | 155s |
| `expert_async_race` | expert | 100/100 | BLOOM | 126s |
| `security_audit` | hard | 100/100 | BLOOM | 165s |
| `perf_optimization` | hard | 100/100 | BLOOM | 756s |
| `codegen_task_runner` | hard | 100/100 | BLOOM | 97s |
| `testgen_ringbuf` | medium | 80/100 | GROW | 329s |
| `refactor_monolith` | medium | 80/100 | GROW | 503s |
| `viz_svg_chart` | easy | 100/100 | BLOOM | 252s |
| `viz_ascii_table` | easy | 0/100 | FROST | 1810s (timeout) |
| `viz_histogram` | easy-medium | 100/100 | BLOOM | 126s |
| `viz_sparkline` | easy-medium | 100/100 | BLOOM | 77s |
| `viz_progress_bar` | medium | 100/100 | BLOOM | 233s |
| `viz_maze_gen` | medium | 100/100 | BLOOM | 164s |
| `unsafe_scanner` | hard | 100/100 | BLOOM | 174s |
| `actor_pdvr` | hard | 100/100 | BLOOM | 541s |

## Architecture Notes

Qwen3.5-27B uses a hybrid **Gated Attention + Gated DeltaNet** (linear attention)
architecture with 64 layers: 16 blocks of (3x DeltaNet + 1x Gated Attention).
This is similar to Mamba-style SSM hybrids but uses DeltaNet instead of Mamba for
the recurrent layers. vLLM handles this via its hybrid KV cache manager with
experimental prefix caching support (`mamba_cache_mode = "align"`).

## vLLM Serve Command

```bash
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1
export NCCL_SHM_DISABLE=0

vllm serve Qwen/Qwen3.5-27B-FP8 \
--tensor-parallel-size 2 \
--kv-cache-dtype fp8 \
--gpu-memory-utilization 0.88 \
--max-model-len 262144 \
--max-num-seqs 2 \
--mamba-cache-dtype float16 \
--enable-prefix-caching \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_coder \
--served-model-name qwen3.5-27b \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000
```
8 changes: 4 additions & 4 deletions selfware.toml
@@ -1,8 +1,8 @@
# Set this to your OpenAI-compatible API endpoint (e.g., local vLLM/sglang server, ngrok tunnel, etc.)
# You can set this via environment variable SELFWARE_ENDPOINT or in your config
endpoint = "https://crazyshit.ngrok.io/v1"
model = "txn545/Qwen3.5-122B-A10B-NVFP4"
max_tokens = 98304
endpoint = "http://localhost:8000/v1"
model = "qwen3.5-27b"
max_tokens = 32768

# Safety section is nested - that part was correct
[safety]
@@ -15,7 +15,7 @@ max_iterations = 5000
step_timeout_secs = 600
# Enable native function calling (requires backend support like sglang --tool-call-parser)
native_function_calling = true
token_budget = 224000
token_budget = 262144

[continuous_work]
enabled = true
14 changes: 7 additions & 7 deletions src/agent/checkpointing.rs
@@ -296,13 +296,13 @@ impl Agent {
);
}

// Save global episodic memory — offloaded to a background thread
// Save global episodic memory — using tokio::fs for async I/O
// to avoid blocking the Tokio executor on synchronous filesystem I/O.
let data_dir = dirs::data_local_dir()
.unwrap_or_else(|| std::path::PathBuf::from("."))
.join("selfware");

// Serialize in the main thread (cheap), write to disk in background (slow I/O)
// Serialize in the main thread (cheap), write to disk asynchronously (slow I/O)
let memory_content = serde_json::to_string_pretty(&self.cognitive_state.episodic_memory)?;

let engine_path = data_dir.join("improvement_engine.json");
@@ -313,16 +313,16 @@
info!("Saved self-improvement engine state");
}

let bg_data_dir = data_dir.clone();
std::thread::spawn(move || {
let memory_path = bg_data_dir.join("global_episodic_memory.json");
let memory_path = data_dir.join("global_episodic_memory.json");
let content = memory_content;
tokio::spawn(async move {
if let Some(parent) = memory_path.parent() {
if let Err(e) = std::fs::create_dir_all(parent) {
if let Err(e) = tokio::fs::create_dir_all(parent).await {
tracing::warn!("Failed to create episodic memory dir: {}", e);
return;
}
}
if let Err(e) = std::fs::write(&memory_path, memory_content) {
if let Err(e) = tokio::fs::write(&memory_path, content).await {
tracing::warn!("Failed to write episodic memory: {}", e);
} else {
tracing::info!("Saved global episodic memory (background)");