From 0182d497c7ecbe5e9cf5fadf38707661f26c9725 Mon Sep 17 00:00:00 2001 From: mrveiss Date: Thu, 26 Mar 2026 17:51:43 +0200 Subject: [PATCH] chore: replace stale mistral:7b-instruct refs with qwen3.5:9b (#2418) --- .../ai-ml/llm_model_investigation_summary.md | 2 +- .../ai-ml/llm_model_optimization_analysis.md | 2 +- autobot-shared/ssot_config.py | 10 +++--- docs/ROADMAP_2025.md | 10 +++--- docs/api/environment-variables.md | 6 ++-- .../EFFICIENT_INFERENCE_DESIGN.md | 10 +++--- .../SSOT_CONFIGURATION_ARCHITECTURE.md | 8 ++--- docs/developer/04-configuration.md | 2 +- docs/developer/DISTRIBUTED_TRACING.md | 2 +- docs/developer/ROLES.md | 2 +- docs/developer/SSOT_CONFIG_GUIDE.md | 2 +- .../developer/THINKING_TOOLS_CONFIGURATION.md | 6 ++-- docs/developer/TIERED_MODEL_ROUTING.md | 12 +++---- docs/guides/chat-ollama-configuration.md | 32 +++++++++---------- docs/guides/llm-middleware-telemetry.md | 2 +- .../2026-02-02-agent-llm-config-design.md | 2 +- ...6-02-02-agent-llm-config-implementation.md | 6 ++-- ...26-02-02-config-registry-implementation.md | 2 +- ...2026-02-02-phase3-client-library-design.md | 4 +-- .../plans/2026-02-02-phase3-implementation.md | 2 +- .../2026-02-02-service-discovery-design.md | 2 +- ...-02-03-tiered-model-distribution-design.md | 6 ++-- 22 files changed, 66 insertions(+), 66 deletions(-) diff --git a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md index f17ece414..70e4adc1f 100644 --- a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md +++ b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_investigation_summary.md @@ -53,7 +53,7 @@ SPECIALIZED MISSING: ❌ codellama:7b-instruct - Code analysis optimization ❌ phi3:3.8b - Fast inference model ❌ qwen2.5:7b - Enhanced reasoning -❌ mistral:7b-instruct - Alternative reasoning +❌ qwen3.5:9b - Default reasoning model ``` --- diff --git 
a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md index 199094c16..6ea2a0bda 100644 --- a/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md +++ b/autobot-infrastructure/shared/analysis/ai-ml/llm_model_optimization_analysis.md @@ -120,7 +120,7 @@ ollama pull codellama:7b-instruct # For code-specific tasks # Optional advanced models ollama pull qwen2.5:7b # For general reasoning -ollama pull mistral:7b-instruct # Alternative reasoning model +ollama pull qwen3.5:9b # Default reasoning model ``` ### 2. Update Configuration Files diff --git a/autobot-shared/ssot_config.py b/autobot-shared/ssot_config.py index bf7b31cc6..6201bba87 100644 --- a/autobot-shared/ssot_config.py +++ b/autobot-shared/ssot_config.py @@ -228,7 +228,7 @@ def get_ollama_endpoint_for_model(self, model_name: str) -> str: """Route Ollama requests to GPU or CPU endpoint by model (#1070). Args: - model_name: Ollama model name (e.g. 'mistral:7b-instruct') + model_name: Ollama model name (e.g. 
'qwen3.5:9b') Returns: Ollama base URL (no /api suffix) @@ -316,7 +316,7 @@ def get_model_for_agent(self, agent_id: str) -> str: agent_id: Agent identifier (e.g., 'orchestrator', 'research', 'code_analysis') Returns: - Model name (e.g., 'gpt-4', 'claude-3-opus', 'mistral:7b-instruct') + Model name (e.g., 'gpt-4', 'claude-3-opus', 'qwen3.5:9b') Example: # In .env: @@ -898,7 +898,7 @@ class AutoBotConfig(BaseSettings): config = get_config() backend = config.backend_url # http://172.16.168.20:8001 redis = config.redis_url # redis://172.16.168.23:6379 - model = config.llm.default_model # mistral:7b-instruct + model = config.llm.default_model # qwen3.5:9b """ model_config = SettingsConfigDict( @@ -1223,7 +1223,7 @@ def get_agent_llm_config_explicit(agent_id: str) -> dict: Each agent MUST have its own provider, endpoint, and model via environment variables: - AUTOBOT_{AGENT_ID}_PROVIDER (e.g., AUTOBOT_ORCHESTRATOR_PROVIDER=ollama) - AUTOBOT_{AGENT_ID}_ENDPOINT (e.g., AUTOBOT_ORCHESTRATOR_ENDPOINT=http://127.0.0.1:11434) - - AUTOBOT_{AGENT_ID}_MODEL (e.g., AUTOBOT_ORCHESTRATOR_MODEL=mistral:7b-instruct) + - AUTOBOT_{AGENT_ID}_MODEL (e.g., AUTOBOT_ORCHESTRATOR_MODEL=qwen3.5:9b) Raises AgentConfigurationError if any setting is missing. @@ -1306,7 +1306,7 @@ def get_agent_model_explicit(agent_id: str) -> str: raise AgentConfigurationError( f"Agent '{agent_id}' requires explicit LLM model configuration. " f"Set {env_key} in .env file. 
" - f"Example: {env_key}=mistral:7b-instruct" + f"Example: {env_key}=qwen3.5:9b" ) return model diff --git a/docs/ROADMAP_2025.md b/docs/ROADMAP_2025.md index d4527ca9e..5c6051636 100644 --- a/docs/ROADMAP_2025.md +++ b/docs/ROADMAP_2025.md @@ -90,7 +90,7 @@ This section documents key architectural decisions where the original plan was r **Current Implementation** (Temporary): -- **Mistral 7B Instruct** (`mistral:7b-instruct`) - Used for ALL task types: +- **Qwen 3.5 9B** (`qwen3.5:9b`) - Used for ALL task types: - Default LLM, Embedding, Classification, Reasoning - RAG, Coding, Orchestrator, Agent tasks - Research, Analysis, Planning @@ -112,10 +112,10 @@ This section documents key architectural decisions where the original plan was r **Current Configuration** (from `.env`): ```bash -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct -AUTOBOT_EMBEDDING_MODEL=mistral:7b-instruct -AUTOBOT_CLASSIFICATION_MODEL=mistral:7b-instruct # TODO: Use 1B model -AUTOBOT_REASONING_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b +AUTOBOT_EMBEDDING_MODEL=qwen3.5:9b +AUTOBOT_CLASSIFICATION_MODEL=qwen3.5:9b # TODO: Use 1B model +AUTOBOT_REASONING_MODEL=qwen3.5:9b # Future: tiered model distribution for specialized agents ``` diff --git a/docs/api/environment-variables.md b/docs/api/environment-variables.md index 0cbc8948a..52557bc39 100644 --- a/docs/api/environment-variables.md +++ b/docs/api/environment-variables.md @@ -17,7 +17,7 @@ AutoBot supports comprehensive configuration through environment variables with | Variable | Default | Description | |----------|---------|-------------| -| `AUTOBOT_DEFAULT_LLM_MODEL` | `mistral:7b-instruct` | **Primary** - Default LLM model for all tasks | +| `AUTOBOT_DEFAULT_LLM_MODEL` | `qwen3.5:9b` | **Primary** - Default LLM model for all tasks | | `AUTOBOT_OLLAMA_HOST` | `172.16.168.24` | Ollama server host (AI Stack VM) | | `AUTOBOT_OLLAMA_PORT` | `11434` | Ollama server port | | `AUTOBOT_OLLAMA_ENDPOINT` | 
`http://${HOST}:${PORT}/api/generate` | Ollama API endpoint | @@ -119,8 +119,8 @@ The frontend uses Vite environment variables with the `VITE_` prefix: ### Setting Default LLM Model ```bash -export AUTOBOT_DEFAULT_LLM_MODEL="mistral:7b-instruct" -export AUTOBOT_ORCHESTRATOR_LLM="mistral:7b-instruct" +export AUTOBOT_DEFAULT_LLM_MODEL="qwen3.5:9b" +export AUTOBOT_ORCHESTRATOR_LLM="qwen3.5:9b" ``` ### Using Different Backend Port diff --git a/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md b/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md index 743e3ec3d..55cfb9772 100644 --- a/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md +++ b/docs/architecture/EFFICIENT_INFERENCE_DESIGN.md @@ -43,7 +43,7 @@ This document describes a **latency-focused** inference optimization architectur AutoBot's LLM infrastructure: - **Ollama** (primary) - Local inference at `127.0.0.1:11434` - **vLLM** - High-performance inference with prefix caching -- **Default model:** `mistral:7b-instruct` +- **Default model:** `qwen3.5:9b` - **Current latency:** ~500ms first token ### Problem with AirLLM Approach @@ -547,17 +547,17 @@ QUANTIZED_MODEL_REGISTRY = { Ollama models already support quantization via GGUF format: ```bash -# Current: mistral:7b-instruct (FP16, ~14GB) +# Current: qwen3.5:9b (FP16, ~14GB) # Optimized options: -ollama pull mistral:7b-instruct-q4_K_M # 4-bit, ~4GB, slight quality loss -ollama pull mistral:7b-instruct-q8_0 # 8-bit, ~8GB, minimal quality loss +ollama pull qwen3.5:9b-q4_K_M # 4-bit, ~4GB, slight quality loss +ollama pull qwen3.5:9b-q8_0 # 8-bit, ~8GB, minimal quality loss ``` **Update `.env` for quantized Ollama models:** ```bash # Use quantized model for better performance -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct-q8_0 +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b-q8_0 ``` --- diff --git a/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md b/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md index 580383a5d..3d2605287 100644 --- 
a/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md +++ b/docs/architecture/SSOT_CONFIGURATION_ARCHITECTURE.md @@ -205,7 +205,7 @@ AUTOBOT_MAIN_MACHINE_IP=172.16.168.20 AUTOBOT_FRONTEND_VM_IP=172.16.168.21 AUTOBOT_REDIS_VM_IP=172.16.168.23 AUTOBOT_BACKEND_PORT=8001 -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b Layer 2: Frozen Code Defaults (Emergency Fallback) -------------------------------------------------- @@ -314,7 +314,7 @@ AUTOBOT_PORT_GRAFANA=3000 # ----------------------------------------------------------------------------- # LLM CONFIGURATION # ----------------------------------------------------------------------------- -AUTOBOT_LLM_DEFAULT_MODEL=mistral:7b-instruct +AUTOBOT_LLM_DEFAULT_MODEL=qwen3.5:9b AUTOBOT_LLM_EMBEDDING_MODEL=nomic-embed-text:latest AUTOBOT_LLM_CLASSIFICATION_MODEL=gemma2:2b AUTOBOT_LLM_PROVIDER=ollama @@ -411,7 +411,7 @@ class PortConfig(BaseSettings): class LLMConfig(BaseSettings): """LLM configuration""" - default_model: str = Field(alias="AUTOBOT_LLM_DEFAULT_MODEL", default="mistral:7b-instruct") + default_model: str = Field(alias="AUTOBOT_LLM_DEFAULT_MODEL", default="qwen3.5:9b") embedding_model: str = Field(alias="AUTOBOT_LLM_EMBEDDING_MODEL", default="nomic-embed-text:latest") provider: str = Field(alias="AUTOBOT_LLM_PROVIDER", default="ollama") timeout: int = Field(alias="AUTOBOT_LLM_TIMEOUT", default=120) @@ -589,7 +589,7 @@ export function getConfig(): AutoBotConfig { }; const llm: LLMConfig = { - defaultModel: getEnv('VITE_LLM_DEFAULT_MODEL', 'mistral:7b-instruct'), + defaultModel: getEnv('VITE_LLM_DEFAULT_MODEL', 'qwen3.5:9b'), embeddingModel: getEnv('VITE_LLM_EMBEDDING_MODEL', 'nomic-embed-text:latest'), provider: getEnv('VITE_LLM_PROVIDER', 'ollama'), timeout: getEnvNumber('VITE_LLM_TIMEOUT', 120), diff --git a/docs/developer/04-configuration.md b/docs/developer/04-configuration.md index fb78a0acf..883178b2d 100644 --- a/docs/developer/04-configuration.md +++ 
b/docs/developer/04-configuration.md @@ -268,7 +268,7 @@ AutoBot supports environment variable overrides using the `AUTOBOT_` prefix: |----------|-------------|---------|---------| | `AUTOBOT_BACKEND_PORT` | `backend.server_port` | Backend server port | `8002` | | `AUTOBOT_BACKEND_HOST` | `backend.server_host` | Backend bind address | `127.0.0.1` | -| `AUTOBOT_DEFAULT_LLM_MODEL` | `llm_config.ollama.model` | **Primary** - Default LLM model | `mistral:7b-instruct` | +| `AUTOBOT_DEFAULT_LLM_MODEL` | `llm_config.ollama.model` | **Primary** - Default LLM model | `qwen3.5:9b` | | `AUTOBOT_OLLAMA_HOST` | `llm_config.ollama.host` | Ollama server URL | `http://ollama:11434` | | `AUTOBOT_OLLAMA_PORT` | `llm_config.ollama.port` | Ollama server port | `11434` | | `AUTOBOT_ORCHESTRATOR_LLM` | `llm_config.orchestrator_llm` | Orchestrator LLM | `gpt-4` | diff --git a/docs/developer/DISTRIBUTED_TRACING.md b/docs/developer/DISTRIBUTED_TRACING.md index 2c179011c..b0b11ab12 100644 --- a/docs/developer/DISTRIBUTED_TRACING.md +++ b/docs/developer/DISTRIBUTED_TRACING.md @@ -114,7 +114,7 @@ Examples: #### LLM Spans ```python "llm.provider": "ollama", -"llm.model": "mistral:7b-instruct", +"llm.model": "qwen3.5:9b", "llm.streaming": True, "llm.temperature": 0.7, "llm.prompt_messages": 3, diff --git a/docs/developer/ROLES.md b/docs/developer/ROLES.md index 94f5be027..a1ade7de6 100644 --- a/docs/developer/ROLES.md +++ b/docs/developer/ROLES.md @@ -375,7 +375,7 @@ These conflicts drive the default fleet layout: | **External deps** | — | | **Ansible playbook** | `playbooks/deploy_role.yml` | | **Source path** | — (binary install from ollama.ai) | -| **GPU models** | mistral:7b-instruct, deepseek-r1:14b, codellama:13b | +| **GPU models** | qwen3.5:9b, deepseek-r1:14b, codellama:13b | | **Concurrency** | max_loaded=5, num_parallel=4, keep_alive=10m | | **Special hardware** | NVIDIA GPU required. Auto-detected via nvidia-smi. 
| | **Degraded without** | Large model inference — system falls back to CPU models or cloud providers | diff --git a/docs/developer/SSOT_CONFIG_GUIDE.md b/docs/developer/SSOT_CONFIG_GUIDE.md index 82dde9123..277433618 100644 --- a/docs/developer/SSOT_CONFIG_GUIDE.md +++ b/docs/developer/SSOT_CONFIG_GUIDE.md @@ -415,7 +415,7 @@ All infrastructure configuration (IPs, ports, hosts) is in `.env`: AUTOBOT_BACKEND_HOST=172.16.168.20 AUTOBOT_REDIS_HOST=172.16.168.23 AUTOBOT_OLLAMA_HOST=127.0.0.1 -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` ### What Goes Where? diff --git a/docs/developer/THINKING_TOOLS_CONFIGURATION.md b/docs/developer/THINKING_TOOLS_CONFIGURATION.md index b7750ff7e..665a2af7c 100644 --- a/docs/developer/THINKING_TOOLS_CONFIGURATION.md +++ b/docs/developer/THINKING_TOOLS_CONFIGURATION.md @@ -205,7 +205,7 @@ def query(self, ...): **Ensure Mistral is Default Model** (Required for Tool Calling): ```bash # In .env file: -AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` **Why Mistral?** @@ -217,7 +217,7 @@ AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct **Verify**: ```bash grep "AUTOBOT_DEFAULT_LLM_MODEL" .env -# Should output: AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +# Should output: AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` --- @@ -324,7 +324,7 @@ Here's the implementation plan... 2. **Verify model is Mistral**: ```bash grep "AUTOBOT_DEFAULT_LLM_MODEL" .env - # Should be: mistral:7b-instruct + # Should be: qwen3.5:9b ``` 3. 
**Check system prompt loaded**: diff --git a/docs/developer/TIERED_MODEL_ROUTING.md b/docs/developer/TIERED_MODEL_ROUTING.md index 7881e6391..4e9d612b2 100644 --- a/docs/developer/TIERED_MODEL_ROUTING.md +++ b/docs/developer/TIERED_MODEL_ROUTING.md @@ -24,7 +24,7 @@ Tiered Model Routing automatically selects the most appropriate LLM model based **Default Models:** - Simple Tier: `gemma2:2b` (fast, low resource) -- Complex Tier: `mistral:7b-instruct` (capable, comprehensive) +- Complex Tier: `qwen3.5:9b` (capable, comprehensive) ### Complexity Scoring @@ -105,7 +105,7 @@ AUTOBOT_COMPLEXITY_THRESHOLD=3.0 # Model assignments AUTOBOT_MODEL_TIER_SIMPLE=gemma2:2b -AUTOBOT_MODEL_TIER_COMPLEX=mistral:7b-instruct +AUTOBOT_MODEL_TIER_COMPLEX=qwen3.5:9b # Fallback behavior (default: true) # If simple tier fails, automatically retry with complex tier @@ -131,7 +131,7 @@ enabled = tier_config.get("enabled", True) # Get models simple_model = tier_config.get("models", {}).get("simple", "gemma2:2b") -complex_model = tier_config.get("models", {}).get("complex", "mistral:7b-instruct") +complex_model = tier_config.get("models", {}).get("complex", "qwen3.5:9b") # Get threshold threshold = tier_config.get("complexity_threshold", 3.0) @@ -208,7 +208,7 @@ Get current tiered routing configuration. 
"complexity_threshold": 3.0, "models": { "simple": "gemma2:2b", - "complex": "mistral:7b-instruct" + "complex": "qwen3.5:9b" }, "fallback_to_complex": true, "logging": { @@ -321,10 +321,10 @@ curl -X POST http://localhost:8001/api/llm/tiered-routing/config \ When `log_routing_decisions` is enabled, routing decisions are logged: ``` -INFO - Tiered routing: mistral:7b-instruct -> gemma2:2b +INFO - Tiered routing: qwen3.5:9b -> gemma2:2b (score=1.8, tier=simple, reason=Low complexity request with minimal indicators) -INFO - Tiered routing: selected mistral:7b-instruct +INFO - Tiered routing: selected qwen3.5:9b (score=5.4, tier=complex) WARNING - Tiered routing fallback triggered: simple -> complex tier diff --git a/docs/guides/chat-ollama-configuration.md b/docs/guides/chat-ollama-configuration.md index 99f04625b..818f5349c 100644 --- a/docs/guides/chat-ollama-configuration.md +++ b/docs/guides/chat-ollama-configuration.md @@ -178,7 +178,7 @@ backend: # When set, models in gpu_models are routed here instead of the default. 
# gpu_endpoint: http://172.16.168.20:11434 # gpu_models: - # - "mistral:7b-instruct" + # - "qwen3.5:9b" # - "deepseek-r1:14b" # - "codellama:13b" @@ -404,7 +404,7 @@ Model selection follows its own priority chain, defined in ``` [1] config.yaml: backend.llm.local.providers.ollama.selected_model [2] Environment: AUTOBOT_DEFAULT_LLM_MODEL -[3] ModelConstants.DEFAULT_OLLAMA_MODEL (from ConfigRegistry -> "mistral:7b-instruct") +[3] ModelConstants.DEFAULT_OLLAMA_MODEL (from ConfigRegistry -> "qwen3.5:9b") ``` ### Source Code Reference @@ -527,7 +527,7 @@ The system-wide model defaults are defined in `constants/model_constants.py`: ```python from constants.model_constants import ModelConstants -ModelConstants.DEFAULT_OLLAMA_MODEL # "mistral:7b-instruct" (from ConfigRegistry) +ModelConstants.DEFAULT_OLLAMA_MODEL # "qwen3.5:9b" (from ConfigRegistry) ModelConstants.DEFAULT_OPENAI_MODEL # "gpt-4" ModelConstants.DEFAULT_ANTHROPIC_MODEL # "claude-3-5-sonnet-20241022" ModelConstants.EMBEDDING_MODEL # "nomic-embed-text:latest" @@ -825,7 +825,7 @@ Ollama listens on `http://127.0.0.1:11434` by default. 
```bash # Pull the default model -ollama pull mistral:7b-instruct +ollama pull qwen3.5:9b # Or pull a different model ollama pull llama3.2 @@ -842,7 +842,7 @@ curl -s http://127.0.0.1:11434/api/tags | python3 -m json.tool # Test generation curl -s http://127.0.0.1:11434/api/generate \ - -d '{"model": "mistral:7b-instruct", "prompt": "Hello", "stream": false}' \ + -d '{"model": "qwen3.5:9b", "prompt": "Hello", "stream": false}' \ | python3 -c "import json,sys; print(json.load(sys.stdin).get('response','')[:200])" ``` @@ -852,7 +852,7 @@ Option A -- Environment variables (quick, non-persistent): ```bash export AUTOBOT_OLLAMA_HOST=127.0.0.1 -export AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +export AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b ``` Option B -- config.yaml (persistent, recommended): @@ -863,7 +863,7 @@ backend: llm: ollama: endpoint: http://127.0.0.1:11434 - selected_model: "mistral:7b-instruct" + selected_model: "qwen3.5:9b" infrastructure: hosts: @@ -1017,7 +1017,7 @@ backend: endpoint: http://127.0.0.1:11434 # CPU endpoint (default) gpu_endpoint: http://172.16.168.20:11434 # GPU-accelerated endpoint gpu_models: - - "mistral:7b-instruct" + - "qwen3.5:9b" - "deepseek-r1:14b" - "codellama:13b" ``` @@ -1059,7 +1059,7 @@ Expected output: ``` [ChatWorkflowManager] Making Ollama request to: http://172.16.168.20:11434/api/generate -[ChatWorkflowManager] Using model: mistral:7b-instruct +[ChatWorkflowManager] Using model: qwen3.5:9b ``` --- @@ -1125,7 +1125,7 @@ ULTIMATE_FALLBACK_CONFIG = { "OLLAMA_URL", os.getenv("OLLAMA_HOST", "http://localhost:11434"), ), - "llm_model": os.getenv("AUTOBOT_DEFAULT_LLM_MODEL", "mistral:7b-instruct"), + "llm_model": os.getenv("AUTOBOT_DEFAULT_LLM_MODEL", "qwen3.5:9b"), "llm_timeout": 30, "llm_temperature": 0.7, } @@ -1299,7 +1299,7 @@ async def main() -> None: # Step 1: Verify Ollama models = await check_ollama() if not models: - logger.error("No models found. 
Pull one with: ollama pull mistral:7b-instruct") + logger.error("No models found. Pull one with: ollama pull qwen3.5:9b") sys.exit(1) model_name = models[0] # Use the first available model @@ -1332,14 +1332,14 @@ curl -fsSL https://ollama.ai/install.sh | sh sudo systemctl enable --now ollama # 3. Pull a model -ollama pull mistral:7b-instruct +ollama pull qwen3.5:9b # 4. Verify curl -s http://127.0.0.1:11434/api/tags | python3 -m json.tool # 5. Set environment (or edit config.yaml) export AUTOBOT_OLLAMA_HOST=127.0.0.1 -export AUTOBOT_DEFAULT_LLM_MODEL=mistral:7b-instruct +export AUTOBOT_DEFAULT_LLM_MODEL=qwen3.5:9b # 6. Restart backend sudo systemctl restart autobot-backend @@ -1399,7 +1399,7 @@ journalctl -u autobot-backend --since "1 minute ago" | grep "Using model" **Fixes:** - Pull the model: `ollama pull ` -- Verify the model name matches exactly (including tag): `mistral:7b-instruct` vs `mistral:latest` +- Verify the model name matches exactly (including tag): `qwen3.5:9b` vs `qwen3.5:latest` - Update config to use an installed model ### Slow Responses @@ -1419,7 +1419,7 @@ free -h # Check available memory **Fixes:** -- Use a smaller model (e.g., `mistral:7b-instruct` instead of `llama3.2:70b`) +- Use a smaller model (e.g., `qwen3.5:9b` instead of `llama3.2:70b`) - Enable GPU acceleration if available (Ollama auto-detects CUDA GPUs) - Configure GPU routing in `config.yaml` (see [GPU Model Routing](#8-gpu-model-routing)) - Increase system RAM or GPU VRAM @@ -1512,7 +1512,7 @@ ls -la autobot-backend/data/conversation_transcripts/ | head -5 |---------|----------------|---------------------|---------| | Ollama endpoint | `backend.llm.ollama.endpoint` | `AUTOBOT_OLLAMA_ENDPOINT` | `http://127.0.0.1:11434` | | Ollama host | `infrastructure.hosts.ollama` | `AUTOBOT_OLLAMA_HOST` | `127.0.0.1` | -| Selected model | `backend.llm.ollama.selected_model` | `AUTOBOT_DEFAULT_LLM_MODEL` | `mistral:7b-instruct` | +| Selected model | `backend.llm.ollama.selected_model` | 
`AUTOBOT_DEFAULT_LLM_MODEL` | `qwen3.5:9b` | | GPU endpoint | `backend.llm.ollama.gpu_endpoint` | -- | (none) | | GPU models | `backend.llm.ollama.gpu_models` | -- | `[]` | | LLM provider | `backend.llm.active_provider` | `AUTOBOT_LLM_PROVIDER` | `ollama` | diff --git a/docs/guides/llm-middleware-telemetry.md b/docs/guides/llm-middleware-telemetry.md index a61b11470..9ad6186ed 100644 --- a/docs/guides/llm-middleware-telemetry.md +++ b/docs/guides/llm-middleware-telemetry.md @@ -931,7 +931,7 @@ backend: # GPU endpoint for model-to-endpoint routing (#1070) # gpu_endpoint: http://172.16.168.20:11434 # gpu_models: - # - "mistral:7b-instruct" + # - "qwen3.5:9b" # - "deepseek-r1:14b" # Fallback path for _get_ollama_endpoint_fallback() via get_host("ollama") diff --git a/docs/plans/2026-02-02-agent-llm-config-design.md b/docs/plans/2026-02-02-agent-llm-config-design.md index 6b1a9c00b..eaccba588 100644 --- a/docs/plans/2026-02-02-agent-llm-config-design.md +++ b/docs/plans/2026-02-02-agent-llm-config-design.md @@ -197,7 +197,7 @@ CREATE INDEX idx_agents_is_default ON agents(is_default); -- Seed default agent INSERT INTO agents (agent_id, name, llm_provider, llm_model, is_default) -VALUES ('default', 'Default Agent', 'ollama', 'mistral:7b-instruct', TRUE); +VALUES ('default', 'Default Agent', 'ollama', 'qwen3.5:9b', TRUE); ``` ## 9. 
Phase 2 Scope diff --git a/docs/plans/2026-02-02-agent-llm-config-implementation.md b/docs/plans/2026-02-02-agent-llm-config-implementation.md index 0006358c7..e4aae7af4 100644 --- a/docs/plans/2026-02-02-agent-llm-config-implementation.md +++ b/docs/plans/2026-02-02-agent-llm-config-implementation.md @@ -239,7 +239,7 @@ def run_migration(db_path: str = "slm.db") -> bool: cursor.execute(""" INSERT INTO agents (agent_id, name, description, llm_provider, llm_model, is_default) VALUES ('default', 'Default Agent', 'Fallback agent for unconfigured requests', - 'ollama', 'mistral:7b-instruct', TRUE) + 'ollama', 'qwen3.5:9b', TRUE) """) logger.info("Seeded default agent") @@ -678,7 +678,7 @@ Expected: Migration complete, default agent seeded **Step 2: Verify table exists** Run: `cd slm-server && sqlite3 slm.db "SELECT agent_id, name, llm_provider, llm_model, is_default FROM agents;"` -Expected: `default|Default Agent|ollama|mistral:7b-instruct|1` +Expected: `default|Default Agent|ollama|qwen3.5:9b|1` **Step 3: Test API endpoint (manual or curl)** @@ -937,7 +937,7 @@ class SLMClient: return { "llm_provider": "ollama", "llm_endpoint": "http://127.0.0.1:11434", - "llm_model": "mistral:7b-instruct", + "llm_model": "qwen3.5:9b", "llm_timeout": 30, "llm_temperature": 0.7, "llm_max_tokens": None, diff --git a/docs/plans/2026-02-02-config-registry-implementation.md b/docs/plans/2026-02-02-config-registry-implementation.md index c54574ce1..a392f7f2d 100644 --- a/docs/plans/2026-02-02-config-registry-implementation.md +++ b/docs/plans/2026-02-02-config-registry-implementation.md @@ -566,7 +566,7 @@ REGISTRY_DEFAULTS = { "browser.host": "172.16.168.25", "browser.port": "3000", # LLM defaults - "llm.default_model": "mistral:7b-instruct", + "llm.default_model": "qwen3.5:9b", "llm.embedding_model": "nomic-embed-text:latest", # Timeouts "timeout.http": "30", diff --git a/docs/plans/2026-02-02-phase3-client-library-design.md b/docs/plans/2026-02-02-phase3-client-library-design.md index 
83c5168dd..f734522a5 100644 --- a/docs/plans/2026-02-02-phase3-client-library-design.md +++ b/docs/plans/2026-02-02-phase3-client-library-design.md @@ -168,8 +168,8 @@ export async function discoverService( |------------|---------------|--------------| | TIER_1 | llama3.2:1b | ollama | | TIER_2 | llama3.2:3b | ollama | -| TIER_3 | mistral:7b-instruct | ollama | -| TIER_4 | mistral:7b-instruct | ollama | +| TIER_3 | qwen3.5:9b | ollama | +| TIER_4 | qwen3.5:9b | ollama | --- diff --git a/docs/plans/2026-02-02-phase3-implementation.md b/docs/plans/2026-02-02-phase3-implementation.md index 148415878..321bad938 100644 --- a/docs/plans/2026-02-02-phase3-implementation.md +++ b/docs/plans/2026-02-02-phase3-implementation.md @@ -423,7 +423,7 @@ async def seed_agents(): continue # Determine model from config - default_model = config.get("default_model", "mistral:7b-instruct") + default_model = config.get("default_model", "qwen3.5:9b") # Create agent agent = Agent( diff --git a/docs/plans/2026-02-02-service-discovery-design.md b/docs/plans/2026-02-02-service-discovery-design.md index 1a7dc2642..828ec5f47 100644 --- a/docs/plans/2026-02-02-service-discovery-design.md +++ b/docs/plans/2026-02-02-service-discovery-design.md @@ -167,7 +167,7 @@ async def get_config(node_id: str, key: str) -> Optional[str]: |-----|---------------|-------------| | `llm.provider` | `ollama`, `openai`, `anthropic` | Provider type | | `llm.endpoint` | `http://127.0.0.1:11434` | API base URL | -| `llm.model` | `mistral:7b-instruct`, `gpt-4` | Default model | +| `llm.model` | `qwen3.5:9b`, `gpt-4` | Default model | | `llm.api_key` | `sk-...` | API key (encrypted at rest) | | `llm.timeout` | `30` | Request timeout seconds | diff --git a/docs/plans/2026-02-03-tiered-model-distribution-design.md b/docs/plans/2026-02-03-tiered-model-distribution-design.md index 80eb8664d..c78b583f1 100644 --- a/docs/plans/2026-02-03-tiered-model-distribution-design.md +++ 
b/docs/plans/2026-02-03-tiered-model-distribution-design.md @@ -12,7 +12,7 @@ Implement tiered model distribution to achieve 50-75% reduction in resource usag ## Goals - Route simple requests (complexity < 3) to lightweight model (`gemma2:2b`) -- Route complex requests (complexity >= 3) to capable model (`mistral:7b-instruct`) +- Route complex requests (complexity >= 3) to capable model (`qwen3.5:9b`) - Achieve 50%+ reduction in compute resources for simple tasks - Maintain response quality through intelligent routing @@ -81,7 +81,7 @@ tiered_routing: complexity_threshold: 3 models: simple: "gemma2:2b" - complex: "mistral:7b-instruct" + complex: "qwen3.5:9b" fallback_to_complex: true logging: log_scores: true @@ -113,7 +113,7 @@ tiered_routing: ## Success Criteria - [ ] Simple requests (< 3 complexity) handled by `gemma2:2b` -- [ ] Complex requests (>= 3 complexity) handled by `mistral:7b-instruct` +- [ ] Complex requests (>= 3 complexity) handled by `qwen3.5:9b` - [ ] 50%+ reduction in compute resources for simple tasks - [ ] No degradation in response quality (validated by sampling) - [ ] Metrics visible in system dashboard