From 0182d497c7ecbe5e9cf5fadf38707661f26c9725 Mon Sep 17 00:00:00 2001 From: mrveiss Date: Thu, 26 Mar 2026 17:51:43 +0200 Subject: [PATCH] chore: replace stale mistral:7b-instruct refs with qwen3.5:9b (#2418) --- .../ai-ml/llm_model_investigation_summary.md | 2 +- .../ai-ml/llm_model_optimization_analysis.md | 2 +- autobot-shared/ssot_config.py | 10 +++--- docs/ROADMAP_2025.md | 10 +++--- docs/api/environment-variables.md | 6 ++-- .../EFFICIENT_INFERENCE_DESIGN.md | 10 +++--- .../SSOT_CONFIGURATION_ARCHITECTURE.md | 8 ++--- docs/developer/04-configuration.md | 2 +- docs/developer/DISTRIBUTED_TRACING.md | 2 +- docs/developer/ROLES.md | 2 +- docs/developer/SSOT_CONFIG_GUIDE.md | 2 +- .../developer/THINKING_TOOLS_CONFIGURATION.md | 6 ++-- docs/developer/TIERED_MODEL_ROUTING.md | 12 +++---- docs/guides/chat-ollama-configuration.md | 32 +++++++++---------- docs/guides/llm-middleware-telemetry.md | 2 +- .../2026-02-02-agent-llm-config-design.md | 2 +- ...6-02-02-agent-llm-config-implementation.md | 6 ++-- ...26-02-02-config-registry-implementation.md | 2 +- ...2026-02-02-phase3-client-library-design.md | 4 +-- .../plans/2026-02-02-phase3-implementation.md | 2 +- .../2026-02-02-service-discovery-design.md | 2 +- ...-02-03-tiered-model-distribution-design.md | 6 ++-- 22 files changed, 66 insertions(+), 66 deletions(-) diff --git a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md index f17ece414..70e4adc1f 100644 --- a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md +++ b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md @@ -53,7 +53,7 @@ SPECIALIZED MISSING: ❌ codellama:7b-instruct - Code analysis optimization ❌ phi3:3.8b - Fast inference model ❌ qwen2.5:7b - Enhanced reasoning -❌ mistral:7b-instruct - Alternative reasoning +❌ qwen3.5:9b - Default reasoning model ``` --- diff --git 
a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md index 199094c16..6ea2a0bda 100644 --- a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md +++ b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md @@ -120,7 +120,7 @@ ollama pull codellama:7b-instruct # For code-specific tasks # Optional advanced models ollama pull qwen2.5:7b # For general reasoning -ollama pull mistral:7b-instruct # Alternative reasoning model +ollama pull qwen3.5:9b # Default reasoning model ``` ### 2. Update Configuration Files diff --git a/autobot-shared/ssot_config.py b/autobot-shared/ssot_config.py index bf7b31cc6..6201bba87 100644 --- a/autobot-shared/ssot_config.py +++ b/autobot-shared/ssot_config.py @@ -228,7 +228,7 @@ def get_ollama_endpoint_for_model(self, model_name: str) -> str: """Route Ollama requests to GPU or CPU endpoint by model (#1070). Args: - model_name: Ollama model name (e.g. 'mistral:7b-instruct') + model_name: Ollama model name (e.g. 
'qwen3.5:9b') Returns: Ollama base URL (no /api suffix) @@ -316,7 +316,7 @@ def get_model_for_agent(self, agent_id: str) -> str: agent_id: Agent identifier (e.g., 'orchestrator', 'research', 'code_analysis') Returns: - Model name (e.g., 'gpt-4', 'claude-3-opus', 'mistral:7b-instruct') + Model name (e.g., 'gpt-4', 'claude-3-opus', 'qwen3.5:9b') Example: # In .env: @@ -898,7 +898,7 @@ class AutoBotConfig(BaseSettings): config = get_config() backend = config.backend_url # http://172.16.168.20:8001 redis = config.redis_url # redis://172.16.168.23:6379 - model = config.llm.default_model # mistral:7b-instruct + model = config.llm.default_model # qwen3.5:9b """ model_config = SettingsConfigDict( @@ -1223,7 +1223,7 @@ def get_agent_llm_config_explicit(agent_id: str) -> dict: Each agent MUST have its own provider, endpoint, and model via environment variables: - AUTOBOT_{AGENT_ID}_PROVIDER (e.g., AUTOBOT_ORCHESTRATOR_PROVIDER=ollama) - AUTOBOT_{AGENT_ID}_ENDPOINT (e.g., AUTOBOT_ORCHESTRATOR_ENDPOINT=http://127.0.0.1:11434) - - AUTOBOT_{AGENT_ID}_MODEL (e.g., AUTOBOT_ORCHESTRATOR_MODEL=mistral:7b-instruct) + - AUTOBOT_{AGENT_ID}_MODEL (e.g., AUTOBOT_ORCHESTRATOR_MODEL=qwen3.5:9b) Raises AgentConfigurationError if any setting is missing. @@ -1306,7 +1306,7 @@ def get_agent_model_explicit(agent_id: str) -> str: raise AgentConfigurationError( f"Agent '{agent_id}' requires explicit LLM model configuration. " f"Set {env_key} in .env file. 
" - f"Example: {env_key}=mistral:7b-instruct" + f"Example: {env_key}=qwen3.5:9b" ) return model diff --git a/docs/ROADMAP_2025.md b/docs/ROADMAP_2025.md index d4527ca9e..5c6051636 100644 --- a/docs/ROADMAP_2025.md +++ b/docs/ROADMAP_2025.md @@ -90,7 +90,7 @@ This section documents key architectural decisions where the original plan was r **Current Implementation** (Temporary): -- **Mistral 7B Instruct** (`mistral:7b-instruct`) - Used for ALL task types: +- **Qwen 3.5 9B** (`qwen3.5:9b`) - Used for ALL task types: - Default LLM, Embedding, Classification, Reasoning - RAG, Coding, Orchestrator, Agent tasks - Research, Analysis, Planning @@ -112,10 +112,10 @@ This section documents key architectural decisions where the original plan was r **Current Configuration** (from `.env`): ```bash -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct -AUTOBOT_EMBEDDING_MODEL=mistral:7b-instruct -AUTOBOT_CLASSIFICATION_MODEL=mistral:7b-instruct # TODO: Use 1B model -AUTOBOT_REASONING_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b +AUTOBOT_EMBEDDING_MODEL=qwen3.5:9b +AUTOBOT_CLASSIFICATION_MODEL=qwen3.5:9b # TODO: Use 1B model +AUTOBOT_REASONING_MODEL=qwen3.5:9b # Future: tiered model distribution for specialized agents ``` diff --git a/docs/api/environment-variables.md b/docs/api/environment-variables.md index 0cbc8948a..52557bc39 100644 --- a/docs/api/environment-variables.md +++ b/docs/api/environment-variables.md @@ -17,7 +17,7 @@ AutoBot supports comprehensive configuration through environment variables with | Variable | Default | Description | |----------|---------|-------------| -| `AUTOBOT_DEFAULT_LLM_MODEL` | `mistral:7b-instruct` | **Primary** - Default LLM model for all tasks | +| `AUTOBOT_DEFAULT_LLM_MODEL` | `qwen3.5:9b` | **Primary** - Default LLM model for all tasks | | `AUTOBOT_OLLAMA_HOST` | `172.16.168.24` | Ollama server host (AI Stack VM) | | `AUTOBOT_OLLAMA_PORT` | `11434` | Ollama server port | | `AUTOBOT_OLLAMA_ENDPOINT` | 
`http://${HOST}:${PORT}/api/generate` | Ollama API endpoint | @@ -119,8 +119,8 @@ The frontend uses Vite environment variables with the `VITE_` prefix: ### Setting Default LLM Model ```bash -export AUTOBOT_DEFAULT_LLM_MODEL="mistral:7b-instruct" -export AUTOBOT_ORCHESTRATOR_LLM="mistral:7b-instruct" +export AUTOBOT_DEFAULT_LLM_MODEL="qwen3.5:9b" +export AUTOBOT_ORCHESTRATOR_LLM="qwen3.5:9b" ``` ### Using Different Backend Port diff --git a/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md b/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md index 743e3ec3d..55cfb9772 100644 --- a/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md +++ b/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md @@ -43,7 +43,7 @@ This document describes a **latency-focused** inference optimization architectur AutoBot's LLM infrastructure: - **Ollama** (primary) - Local inference at `127.0.0.1:11434` - **vLLM** - High-performance inference with prefix caching -- **Default model:** `mistral:7b-instruct` +- **Default model:** `qwen3.5:9b` - **Current latency:** ~500ms first token ### Problem with AirLLM Approach @@ -547,17 +547,17 @@ QUANTIZED_MODEL_REGISTRY = { Ollama models already support quantization via GGUF format: ```bash -# Current: mistral:7b-instruct (FP16, ~14GB) +# Current: qwen3.5:9b (FP16, ~14GB) # Optimized options: -ollama pull mistral:7b-instruct-q4_K_M # 4-bit, ~4GB, slight quality loss -ollama pull mistral:7b-instruct-q8_0 # 8-bit, ~8GB, minimal quality loss +ollama pull qwen3.5:9b-q4_K_M # 4-bit, ~4GB, slight quality loss +ollama pull qwen3.5:9b-q8_0 # 8-bit, ~8GB, minimal quality loss ``` **Update `.env` for quantized Ollama models:** ```bash # Use quantized model for better performance -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct-q8_0 +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b-q8_0 ``` --- diff --git a/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md b/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md index 580383a5d..3d2605287 100644 --- 
a/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md +++ b/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md @@ -205,7 +205,7 @@ AUTOBOT_MAIN_MACHINE_IP=172.16.168.20 AUTOBOT_FRONTEND_VM_IP=172.16.168.21 AUTOBOT_REDIS_VM_IP=172.16.168.23 AUTOBOT_BACKEND_PORT=8001 -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b Layer 2: Frozen Code Defaults (Emergency Fallback) -------------------------------------------------- @@ -314,7 +314,7 @@ AUTOBOT_PORT_GRAFANA=3000 # ----------------------------------------------------------------------------- # LLM CONFIGURATION # ----------------------------------------------------------------------------- -AUTOBOT_LLM_DEFAULT_MODEL=mistral:7b-instruct +AUTOBOT_LLM_DEFAULT_MODEL=qwen3.5:9b AUTOBOT_LLM_EMBEDDING_MODEL=nomic-embed-text:latest AUTOBOT_LLM_CLASSIFICATION_MODEL=gemma2:2b AUTOBOT_LLM_PROVIDER=ollama @@ -411,7 +411,7 @@ class PortConfig(BaseSettings): class LLMConfig(BaseSettings): """LLM configuration""" - default_model: str = Field(alias="AUTOBOT_LLM_DEFAULT_MODEL", default="mistral:7b-instruct") + default_model: str = Field(alias="AUTOBOT_LLM_DEFAULT_MODEL", default="qwen3.5:9b") embedding_model: str = Field(alias="AUTOBOT_LLM_EMBEDDING_MODEL", default="nomic-embed-text:latest") provider: str = Field(alias="AUTOBOT_LLM_PROVIDER", default="ollama") timeout: int = Field(alias="AUTOBOT_LLM_TIMEOUT", default=120) @@ -589,7 +589,7 @@ export function getConfig(): AutoBotConfig { }; const llm: LLMConfig = { - defaultModel: getEnv('VITE_LLM_DEFAULT_MODEL', 'mistral:7b-instruct'), + defaultModel: getEnv('VITE_LLM_DEFAULT_MODEL', 'qwen3.5:9b'), embeddingModel: getEnv('VITE_LLM_EMBEDDING_MODEL', 'nomic-embed-text:latest'), provider: getEnv('VITE_LLM_PROVIDER', 'ollama'), timeout: getEnvNumber('VITE_LLM_TIMEOUT', 120), diff --git a/docs/developer/04-configuration.md b/docs/developer/04-configuration.md index fb78a0acf..883178b2d 100644 --- a/docs/developer/04-configuration.md +++ 
b/docs/developer/04-configuration.md @@ -268,7 +268,7 @@ AutoBot supports environment variable overrides using the `AUTOBOT_` prefix: |----------|-------------|---------|---------| | `AUTOBOT_BACKEND_PORT` | `backend.server_port` | Backend server port | `8002` | | `AUTOBOT_BACKEND_HOST` | `backend.server_host` | Backend bind address | `127.0.0.1` | -| `AUTOBOT_DEFAULT_LLM_MODEL` | `llm_config.ollama.model` | **Primary** - Default LLM model | `mistral:7b-instruct` | +| `AUTOBOT_DEFAULT_LLM_MODEL` | `llm_config.ollama.model` | **Primary** - Default LLM model | `qwen3.5:9b` | | `AUTOBOT_OLLAMA_HOST` | `llm_config.ollama.host` | Ollama server URL | `http://ollama:11434` | | `AUTOBOT_OLLAMA_PORT` | `llm_config.ollama.port` | Ollama server port | `11434` | | `AUTOBOT_ORCHESTRATOR_LLM` | `llm_config.orchestrator_llm` | Orchestrator LLM | `gpt-4` | diff --git a/docs/developer/DISTRIBUTED_TRACING.md b/docs/developer/DISTRIBUTED_TRACING.md index 2c179011c..b0b11ab12 100644 --- a/docs/developer/DISTRIBUTED_TRACING.md +++ b/docs/developer/DISTRIBUTED_TRACING.md @@ -114,7 +114,7 @@ Examples: #### LLM Spans ```python "llm.provider": "ollama", -"llm.model": "mistral:7b-instruct", +"llm.model": "qwen3.5:9b", "llm.streaming": True, "llm.temperature": 0.7, "llm.prompt_messages": 3, diff --git a/docs/developer/ROLES.md b/docs/developer/ROLES.md index 94f5be027..a1ade7de6 100644 --- a/docs/developer/ROLES.md +++ b/docs/developer/ROLES.md @@ -375,7 +375,7 @@ These conflicts drive the default fleet layout: | **External deps** | — | | **Ansible playbook** | `playbooks/deploy_role.yml` | | **Source path** | — (binary install from ollama.ai) | -| **GPU models** | mistral:7b-instruct, deepseek-r1:14b, codellama:13b | +| **GPU models** | qwen3.5:9b, deepseek-r1:14b, codellama:13b | | **Concurrency** | max_loaded=5, num_parallel=4, keep_alive=10m | | **Special hardware** | NVIDIA GPU required. Auto-detected via nvidia-smi. 
| | **Degraded without** | Large model inference — system falls back to CPU models or cloud providers | diff --git a/docs/developer/SSOT_CONFIG_GUIDE.md b/docs/developer/SSOT_CONFIG_GUIDE.md index 82dde9123..277433618 100644 --- a/docs/developer/SSOT_CONFIG_GUIDE.md +++ b/docs/developer/SSOT_CONFIG_GUIDE.md @@ -415,7 +415,7 @@ All infrastructure configuration (IPs, ports, hosts) is in `.env`: AUTOBOT_BACKEND_HOST=172.16.168.20 AUTOBOT_REDIS_HOST=172.16.168.23 AUTOBOT_OLLAMA_HOST=127.0.0.1 -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` ### What Goes Where? diff --git a/docs/developer/THINKING_TOOLS_CONFIGURATION.md b/docs/developer/THINKING_TOOLS_CONFIGURATION.md index b7750ff7e..665a2af7c 100644 --- a/docs/developer/THINKING_TOOLS_CONFIGURATION.md +++ b/docs/developer/THINKING_TOOLS_CONFIGURATION.md @@ -205,7 +205,7 @@ def query(self, ...): **Ensure Mistral is Default Model** (Required for Tool Calling): ```bash # In .env file: -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` **Why Mistral?** @@ -217,7 +217,7 @@ AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct **Verify**: ```bash grep "AUTOBOT_DEFAULT_LLM_MODEL" .env -# Should output: AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +# Should output: AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` --- @@ -324,7 +324,7 @@ Here's the implementation plan... 2. **Verify model is Mistral**: ```bash grep "AUTOBOT_DEFAULT_LLM_MODEL" .env - # Should be: mistral:7b-instruct + # Should be: qwen3.5:9b ``` 3. 
**Check system prompt loaded**: diff --git a/docs/developer/TIERED_MODEL_ROUTING.md b/docs/developer/TIERED_MODEL_ROUTING.md index 7881e6391..4e9d612b2 100644 --- a/docs/developer/TIERED_MODEL_ROUTING.md +++ b/docs/developer/TIERED_MODEL_ROUTING.md @@ -24,7 +24,7 @@ Tiered Model Routing automatically selects the most appropriate LLM model based **Default Models:** - Simple Tier: `gemma2:2b` (fast, low resource) -- Complex Tier: `mistral:7b-instruct` (capable, comprehensive) +- Complex Tier: `qwen3.5:9b` (capable, comprehensive) ### Complexity Scoring @@ -105,7 +105,7 @@ AUTOBOT_COMPLEXITY_THRESHOLD=3.0 # Model assignments AUTOBOT_MODEL_TIER_SIMPLE=gemma2:2b -AUTOBOT_MODEL_TIER_COMPLEX=mistral:7b-instruct +AUTOBOT_MODEL_TIER_COMPLEX=qwen3.5:9b # Fallback behavior (default: true) # If simple tier fails, automatically retry with complex tier @@ -131,7 +131,7 @@ enabled = tier_config.get("enabled", True) # Get models simple_model = tier_config.get("models", {}).get("simple", "gemma2:2b") -complex_model = tier_config.get("models", {}).get("complex", "mistral:7b-instruct") +complex_model = tier_config.get("models", {}).get("complex", "qwen3.5:9b") # Get threshold threshold = tier_config.get("complexity_threshold", 3.0) @@ -208,7 +208,7 @@ Get current tiered routing configuration. 
"complexity_threshold": 3.0, "models": { "simple": "gemma2:2b", - "complex": "mistral:7b-instruct" + "complex": "qwen3.5:9b" }, "fallback_to_complex": true, "logging": { @@ -321,10 +321,10 @@ curl -X POST http://localhost:8001/api/llm/tiered-routing/config \ When `log_routing_decisions` is enabled, routing decisions are logged: ``` -INFO - Tiered routing: mistral:7b-instruct -> gemma2:2b +INFO - Tiered routing: qwen3.5:9b -> gemma2:2b (score=1.8, tier=simple, reason=Low complexity request with minimal indicators) -INFO - Tiered routing: selected mistral:7b-instruct +INFO - Tiered routing: selected qwen3.5:9b (score=5.4, tier=complex) WARNING - Tiered routing fallback triggered: simple -> complex tier diff --git a/docs/guides/chat-ollama-configuration.md b/docs/guides/chat-ollama-configuration.md index 99f04625b..818f5349c 100644 --- a/docs/guides/chat-ollama-configuration.md +++ b/docs/guides/chat-ollama-configuration.md @@ -178,7 +178,7 @@ backend: # When set, models in gpu_models are routed here instead of the default. 
# gpu_endpoint: http://172.16.168.20:11434 # gpu_models: - # - "mistral:7b-instruct" + # - "qwen3.5:9b" # - "deepseek-r1:14b" # - "codellama:13b" @@ -404,7 +404,7 @@ Model selection follows its own priority chain, defined in ``` [1] config.yaml: backend.llm.local.providers.ollama.selected_model [2] Environment: AUTOBOT_DEFAULT_LLM_MODEL -[3] ModelConstants.DEFAULT_OLLAMA_MODEL (from ConfigRegistry -> "mistral:7b-instruct") +[3] ModelConstants.DEFAULT_OLLAMA_MODEL (from ConfigRegistry -> "qwen3.5:9b") ``` ### Source Code Reference @@ -527,7 +527,7 @@ The system-wide model defaults are defined in `constants/model_constants.py`: ```python from constants.model_constants import ModelConstants -ModelConstants.DEFAULT_OLLAMA_MODEL # "mistral:7b-instruct" (from ConfigRegistry) +ModelConstants.DEFAULT_OLLAMA_MODEL # "qwen3.5:9b" (from ConfigRegistry) ModelConstants.DEFAULT_OPENAI_MODEL # "gpt-4" ModelConstants.DEFAULT_ANTHROPIC_MODEL # "claude-3-5-sonnet-20241022" ModelConstants.EMBEDDING_MODEL # "nomic-embed-text:latest" @@ -825,7 +825,7 @@ Ollama listens on `http://127.0.0.1:11434` by default. 
```bash # Pull the default model -ollama pull mistral:7b-instruct +ollama pull qwen3.5:9b # Or pull a different model ollama pull llama3.2 @@ -842,7 +842,7 @@ curl -s http://127.0.0.1:11434/api/tags | python3 -m json.tool # Test generation curl -s http://127.0.0.1:11434/api/generate \ - -d '{"model": "mistral:7b-instruct", "prompt": "Hello", "stream": false}' \ + -d '{"model": "qwen3.5:9b", "prompt": "Hello", "stream": false}' \ | python3 -c "import json,sys; print(json.load(sys.stdin).get('response','')[:200])" ``` @@ -852,7 +852,7 @@ Option A -- Environment variables (quick, non-persistent): ```bash export AUTOBOT_OLLAMA_HOST=127.0.0.1 -export AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +export AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` Option B -- config.yaml (persistent, recommended): @@ -863,7 +863,7 @@ backend: llm: ollama: endpoint: http://127.0.0.1:11434 - selected_model: "mistral:7b-instruct" + selected_model: "qwen3.5:9b" infrastructure: hosts: @@ -1017,7 +1017,7 @@ backend: endpoint: http://127.0.0.1:11434 # CPU endpoint (default) gpu_endpoint: http://172.16.168.20:11434 # GPU-accelerated endpoint gpu_models: - - "mistral:7b-instruct" + - "qwen3.5:9b" - "deepseek-r1:14b" - "codellama:13b" ``` @@ -1059,7 +1059,7 @@ Expected output: ``` [ChatWorkflowManager] Making Ollama request to: http://172.16.168.20:11434/api/generate -[ChatWorkflowManager] Using model: mistral:7b-instruct +[ChatWorkflowManager] Using model: qwen3.5:9b ``` --- @@ -1125,7 +1125,7 @@ ULTIMATE_FALLBACK_CONFIG = { "OLLAMA_URL", os.getenv("OLLAMA_HOST", "http://localhost:11434"), ), - "llm_model": os.getenv("AUTOBOT_DEFAULT_LLM_MODEL", "mistral:7b-instruct"), + "llm_model": os.getenv("AUTOBOT_DEFAULT_LLM_MODEL", "qwen3.5:9b"), "llm_timeout": 30, "llm_temperature": 0.7, } @@ -1299,7 +1299,7 @@ async def main() -> None: # Step 1: Verify Ollama models = await check_ollama() if not models: - logger.error("No models found. 
Pull one with: ollama pull mistral:7b-instruct") + logger.error("No models found. Pull one with: ollama pull qwen3.5:9b") sys.exit(1) model_name = models[0] # Use the first available model @@ -1332,14 +1332,14 @@ curl -fsSL https://ollama.ai/install.sh | sh sudo systemctl enable --now ollama # 3. Pull a model -ollama pull mistral:7b-instruct +ollama pull qwen3.5:9b # 4. Verify curl -s http://127.0.0.1:11434/api/tags | python3 -m json.tool # 5. Set environment (or edit config.yaml) export AUTOBOT_OLLAMA_HOST=127.0.0.1 -export AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +export AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b # 6. Restart backend sudo systemctl restart autobot-backend @@ -1399,7 +1399,7 @@ journalctl -u autobot-backend --since "1 minute ago" | grep "Using model" **Fixes:** - Pull the model: `ollama pull ` -- Verify the model name matches exactly (including tag): `mistral:7b-instruct` vs `mistral:latest` +- Verify the model name matches exactly (including tag): `qwen3.5:9b` vs `qwen3.5:latest` - Update config to use an installed model ### Slow Responses @@ -1419,7 +1419,7 @@ free -h # Check available memory **Fixes:** -- Use a smaller model (e.g., `mistral:7b-instruct` instead of `llama3.2:70b`) +- Use a smaller model (e.g., `qwen3.5:9b` instead of `llama3.2:70b`) - Enable GPU acceleration if available (Ollama auto-detects CUDA GPUs) - Configure GPU routing in `config.yaml` (see [GPU Model Routing](#8-gpu-model-routing)) - Increase system RAM or GPU VRAM @@ -1512,7 +1512,7 @@ ls -la autobot-backend/data/conversation_transcripts/ | head -5 |---------|----------------|---------------------|---------| | Ollama endpoint | `backend.llm.ollama.endpoint` | `AUTOBOT_OLLAMA_ENDPOINT` | `http://127.0.0.1:11434` | | Ollama host | `infrastructure.hosts.ollama` | `AUTOBOT_OLLAMA_HOST` | `127.0.0.1` | -| Selected model | `backend.llm.ollama.selected_model` | `AUTOBOT_DEFAULT_LLM_MODEL` | `mistral:7b-instruct` | +| Selected model | `backend.llm.ollama.selected_model` | 
`AUTOBOT_DEFAULT_LLM_MODEL` | `qwen3.5:9b` | | GPU endpoint | `backend.llm.ollama.gpu_endpoint` | -- | (none) | | GPU models | `backend.llm.ollama.gpu_models` | -- | `[]` | | LLM provider | `backend.llm.active_provider` | `AUTOBOT_LLM_PROVIDER` | `ollama` | diff --git a/docs/guides/llm-middleware-telemetry.md b/docs/guides/llm-middleware-telemetry.md index a61b11470..9ad6186ed 100644 --- a/docs/guides/llm-middleware-telemetry.md +++ b/docs/guides/llm-middleware-telemetry.md @@ -931,7 +931,7 @@ backend: # GPU endpoint for model-to-endpoint routing (#1070) # gpu_endpoint: http://172.16.168.20:11434 # gpu_models: - # - "mistral:7b-instruct" + # - "qwen3.5:9b" # - "deepseek-r1:14b" # Fallback path for _get_ollama_endpoint_fallback() via get_host("ollama") diff --git a/docs/plans/2026-02-02-agent-llm-config-design.md b/docs/plans/2026-02-02-agent-llm-config-design.md index 6b1a9c00b..eaccba588 100644 --- a/docs/plans/2026-02-02-agent-llm-config-design.md +++ b/docs/plans/2026-02-02-agent-llm-config-design.md @@ -197,7 +197,7 @@ CREATE INDEX idx_agents_is_default ON agents(is_default); -- Seed default agent INSERT INTO agents (agent_id, name, llm_provider, llm_model, is_default) -VALUES ('default', 'Default Agent', 'ollama', 'mistral:7b-instruct', TRUE); +VALUES ('default', 'Default Agent', 'ollama', 'qwen3.5:9b', TRUE); ``` ## 9. 
Phase 2 Scope diff --git a/docs/plans/2026-02-02-agent-llm-config-implementation.md b/docs/plans/2026-02-02-agent-llm-config-implementation.md index 0006358c7..e4aae7af4 100644 --- a/docs/plans/2026-02-02-agent-llm-config-implementation.md +++ b/docs/plans/2026-02-02-agent-llm-config-implementation.md @@ -239,7 +239,7 @@ def run_migration(db_path: str = "slm.db") -> bool: cursor.execute(""" INSERT INTO agents (agent_id, name, description, llm_provider, llm_model, is_default) VALUES ('default', 'Default Agent', 'Fallback agent for unconfigured requests', - 'ollama', 'mistral:7b-instruct', TRUE) + 'ollama', 'qwen3.5:9b', TRUE) """) logger.info("Seeded default agent") @@ -678,7 +678,7 @@ Expected: Migration complete, default agent seeded **Step 2: Verify table exists** Run: `cd slm-server && sqlite3 slm.db "SELECT agent_id, name, llm_provider, llm_model, is_default FROM agents;"` -Expected: `default|Default Agent|ollama|mistral:7b-instruct|1` +Expected: `default|Default Agent|ollama|qwen3.5:9b|1` **Step 3: Test API endpoint (manual or curl)** @@ -937,7 +937,7 @@ class SLMClient: return { "llm_provider": "ollama", "llm_endpoint": "http://127.0.0.1:11434", - "llm_model": "mistral:7b-instruct", + "llm_model": "qwen3.5:9b", "llm_timeout": 30, "llm_temperature": 0.7, "llm_max_tokens": None, diff --git a/docs/plans/2026-02-02-config-registry-implementation.md b/docs/plans/2026-02-02-config-registry-implementation.md index c54574ce1..a392f7f2d 100644 --- a/docs/plans/2026-02-02-config-registry-implementation.md +++ b/docs/plans/2026-02-02-config-registry-implementation.md @@ -566,7 +566,7 @@ REGISTRY_DEFAULTS = { "browser.host": "172.16.168.25", "browser.port": "3000", # LLM defaults - "llm.default_model": "mistral:7b-instruct", + "llm.default_model": "qwen3.5:9b", "llm.embedding_model": "nomic-embed-text:latest", # Timeouts "timeout.http": "30", diff --git a/docs/plans/2026-02-02-phase3-client-library-design.md b/docs/plans/2026-02-02-phase3-client-library-design.md index 
83c5168dd..f734522a5 100644 --- a/docs/plans/2026-02-02-phase3-client-library-design.md +++ b/docs/plans/2026-02-02-phase3-client-library-design.md @@ -168,8 +168,8 @@ export async function discoverService( |------------|---------------|--------------| | TIER_1 | llama3.2:1b | ollama | | TIER_2 | llama3.2:3b | ollama | -| TIER_3 | mistral:7b-instruct | ollama | -| TIER_4 | mistral:7b-instruct | ollama | +| TIER_3 | qwen3.5:9b | ollama | +| TIER_4 | qwen3.5:9b | ollama | --- diff --git a/docs/plans/2026-02-02-phase3-implementation.md b/docs/plans/2026-02-02-phase3-implementation.md index 148415878..321bad938 100644 --- a/docs/plans/2026-02-02-phase3-implementation.md +++ b/docs/plans/2026-02-02-phase3-implementation.md @@ -423,7 +423,7 @@ async def seed_agents(): continue # Determine model from config - default_model = config.get("default_model", "mistral:7b-instruct") + default_model = config.get("default_model", "qwen3.5:9b") # Create agent agent = Agent( diff --git a/docs/plans/2026-02-02-service-discovery-design.md b/docs/plans/2026-02-02-service-discovery-design.md index 1a7dc2642..828ec5f47 100644 --- a/docs/plans/2026-02-02-service-discovery-design.md +++ b/docs/plans/2026-02-02-service-discovery-design.md @@ -167,7 +167,7 @@ async def get_config(node_id: str, key: str) -> Optional[str]: |-----|---------------|-------------| | `llm.provider` | `ollama`, `openai`, `anthropic` | Provider type | | `llm.endpoint` | `http://127.0.0.1:11434` | API base URL | -| `llm.model` | `mistral:7b-instruct`, `gpt-4` | Default model | +| `llm.model` | `qwen3.5:9b`, `gpt-4` | Default model | | `llm.api_key` | `sk-...` | API key (encrypted at rest) | | `llm.timeout` | `30` | Request timeout seconds | diff --git a/docs/plans/2026-02-03-tiered-model-distribution-design.md b/docs/plans/2026-02-03-tiered-model-distribution-design.md index 80eb8664d..c78b583f1 100644 --- a/docs/plans/2026-02-03-tiered-model-distribution-design.md +++ 
b/docs/plans/2026-02-03-tiered-model-distribution-design.md @@ -12,7 +12,7 @@ Implement tiered model distribution to achieve 50-75% reduction in resource usag ## Goals - Route simple requests (complexity < 3) to lightweight model (`gemma2:2b`) -- Route complex requests (complexity >= 3) to capable model (`mistral:7b-instruct`) +- Route complex requests (complexity >= 3) to capable model (`qwen3.5:9b`) - Achieve 50%+ reduction in compute resources for simple tasks - Maintain response quality through intelligent routing @@ -81,7 +81,7 @@ tiered_routing: complexity_threshold: 3 models: simple: "gemma2:2b" - complex: "mistral:7b-instruct" + complex: "qwen3.5:9b" fallback_to_complex: true logging: log_scores: true @@ -113,7 +113,7 @@ tiered_routing: ## Success Criteria - [ ] Simple requests (< 3 complexity) handled by `gemma2:2b` -- [ ] Complex requests (>= 3 complexity) handled by `mistral:7b-instruct` +- [ ] Complex requests (>= 3 complexity) handled by `qwen3.5:9b` - [ ] 50%+ reduction in compute resources for simple tasks - [ ] No degradation in response quality (validated by sampling) - [ ] Metrics visible in system dashboard