From e2d2ea53d641267cb249634a06d8c9d9107e9f62 Mon Sep 17 00:00:00 2001 From: Federico Liva Date: Thu, 12 Mar 2026 12:38:32 +0100 Subject: [PATCH 1/5] fix(images): fix Telegram image pipeline - dual bug in vision model routing and ClaudeCode driver Root cause 1: Vision model swap (kernel.rs) changed the model to qwen-vl-plus but left the provider as claude-code, routing images to the wrong driver. Root cause 2: ClaudeCodeDriver.build_prompt() called text_content() which silently dropped all ContentBlock::Image and ContentBlock::ImageUrl blocks. Fix: - kernel.rs: vision model swap now also updates the provider - claude_code.rs: full image support via temp files passed with --files flag (handles base64, data URIs, and HTTP URLs) - All other drivers: ensure ImageUrl content blocks are handled - compactor.rs: handle ImageUrl in conversation compaction - bridge.rs: improved image dispatch reliability Closes #528 Co-Authored-By: Claude Opus 4.6 --- Cargo.lock | 28 ++-- crates/openfang-api/src/routes.rs | 1 + crates/openfang-channels/src/bridge.rs | 43 +++-- crates/openfang-kernel/src/kernel.rs | 28 ++++ crates/openfang-memory/src/session.rs | 3 + crates/openfang-runtime/src/compactor.rs | 3 + .../openfang-runtime/src/drivers/anthropic.rs | 6 + .../src/drivers/claude_code.rs | 157 +++++++++++++++++- crates/openfang-runtime/src/drivers/gemini.rs | 8 + crates/openfang-runtime/src/drivers/openai.rs | 7 + crates/openfang-types/src/config.rs | 5 + crates/openfang-types/src/message.rs | 7 + 12 files changed, 257 insertions(+), 39 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73ba5baf3..6c7c32eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3792,7 +3792,7 @@ dependencies = [ [[package]] name = "openfang-api" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "axum", @@ -3829,7 +3829,7 @@ dependencies = [ [[package]] name = "openfang-channels" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "axum", @@ -3861,7 +3861,7 @@ 
dependencies = [ [[package]] name = "openfang-cli" -version = "0.3.46" +version = "0.3.47" dependencies = [ "clap", "clap_complete", @@ -3888,7 +3888,7 @@ dependencies = [ [[package]] name = "openfang-desktop" -version = "0.3.46" +version = "0.3.47" dependencies = [ "axum", "open", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] name = "openfang-extensions" -version = "0.3.46" +version = "0.3.47" dependencies = [ "aes-gcm", "argon2", @@ -3942,7 +3942,7 @@ dependencies = [ [[package]] name = "openfang-hands" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "dashmap", @@ -3959,7 +3959,7 @@ dependencies = [ [[package]] name = "openfang-kernel" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -3996,7 +3996,7 @@ dependencies = [ [[package]] name = "openfang-memory" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "openfang-migrate" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "dirs 6.0.0", @@ -4034,7 +4034,7 @@ dependencies = [ [[package]] name = "openfang-runtime" -version = "0.3.46" +version = "0.3.47" dependencies = [ "anyhow", "async-trait", @@ -4068,7 +4068,7 @@ dependencies = [ [[package]] name = "openfang-skills" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "hex", @@ -4091,7 +4091,7 @@ dependencies = [ [[package]] name = "openfang-types" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -4110,7 +4110,7 @@ dependencies = [ [[package]] name = "openfang-wire" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -8773,7 +8773,7 @@ checksum = "b9cc00251562a284751c9973bace760d86c0276c471b4be569fe6b068ee97a56" [[package]] name = "xtask" -version = "0.3.46" +version = "0.3.47" [[package]] name = "yoke" diff --git a/crates/openfang-api/src/routes.rs b/crates/openfang-api/src/routes.rs index d659d7169..223df0940 100644 
--- a/crates/openfang-api/src/routes.rs +++ b/crates/openfang-api/src/routes.rs @@ -6852,6 +6852,7 @@ pub async fn set_provider_key( model: model_id, api_key_env: env_var.clone(), base_url: None, + vision_model: None, }; let mut guard = state .kernel diff --git a/crates/openfang-channels/src/bridge.rs b/crates/openfang-channels/src/bridge.rs index 3a8c3d68a..24e5bee23 100644 --- a/crates/openfang-channels/src/bridge.rs +++ b/crates/openfang-channels/src/bridge.rs @@ -528,25 +528,34 @@ async fn dispatch_message( return; } - // For images: download, base64 encode, and send as multimodal content blocks + // For images: build content blocks with the image URL for vision models. + // We pass the original URL rather than downloading + base64-encoding because + // many providers (DashScope/Qwen, OpenAI) prefer or require direct URLs. if let ChannelContent::Image { ref url, ref caption } = message.content { - let blocks = download_image_to_blocks(url, caption.as_deref()).await; - if blocks.iter().any(|b| matches!(b, ContentBlock::Image { .. 
})) { - // We have actual image data — send as structured blocks for vision - dispatch_with_blocks( - blocks, - message, - handle, - router, - adapter, - ct_str, - thread_id, - output_format, - ) - .await; - return; + let mut blocks = Vec::new(); + if let Some(cap) = caption { + if !cap.is_empty() { + blocks.push(ContentBlock::Text { + text: cap.clone(), + provider_metadata: None, + }); + } } - // Image download failed — fall through to text description below + blocks.push(ContentBlock::ImageUrl { + url: url.clone(), + }); + dispatch_with_blocks( + blocks, + message, + handle, + router, + adapter, + ct_str, + thread_id, + output_format, + ) + .await; + return; } let text = match &message.content { diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index 0b1b1cbe2..0f56139fd 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -2262,6 +2262,34 @@ impl OpenFangKernel { } } + // If message contains images and a vision model is configured, swap to it. + // Many text models (e.g. qwen-plus) don't support image input — the vision + // model (e.g. qwen-vl-plus) handles multimodal content correctly. + if let Some(ref blocks) = content_blocks { + let has_images = blocks.iter().any(|b| { + matches!( + b, + openfang_types::message::ContentBlock::Image { .. } + | openfang_types::message::ContentBlock::ImageUrl { .. } + ) + }); + if has_images { + if let Some(ref vision_model) = self.config.default_model.vision_model { + info!( + agent = %manifest.name, + default_model = %manifest.model.model, + vision_model = %vision_model, + "Swapping to vision model for image content" + ); + manifest.model.model = vision_model.clone(); + // The vision model lives on the same provider as the default + // model. Without this swap, an agent using e.g. claude-code + // would try to send the image to the wrong driver. 
+ manifest.model.provider = self.config.default_model.provider.clone(); + } + } + } + let driver = self.resolve_driver(&manifest)?; // Look up model's actual context window from the catalog diff --git a/crates/openfang-memory/src/session.rs b/crates/openfang-memory/src/session.rs index 74862c372..a7d1a83aa 100644 --- a/crates/openfang-memory/src/session.rs +++ b/crates/openfang-memory/src/session.rs @@ -584,6 +584,9 @@ impl SessionStore { ContentBlock::Image { media_type, .. } => { text_parts.push(format!("[image: {media_type}]")); } + ContentBlock::ImageUrl { ref url } => { + text_parts.push(format!("[image: {url}]")); + } ContentBlock::Thinking { thinking } => { text_parts.push(format!( "[thinking: {}]", diff --git a/crates/openfang-runtime/src/compactor.rs b/crates/openfang-runtime/src/compactor.rs index 855705469..e9e246a86 100644 --- a/crates/openfang-runtime/src/compactor.rs +++ b/crates/openfang-runtime/src/compactor.rs @@ -399,6 +399,9 @@ fn build_conversation_text(messages: &[Message], config: &CompactionConfig) -> S ContentBlock::Image { media_type, .. } => { conversation_text.push_str(&format!("[Image: {media_type}]\n\n")); } + ContentBlock::ImageUrl { url } => { + conversation_text.push_str(&format!("[Image: {url}]\n\n")); + } ContentBlock::Thinking { .. } => {} ContentBlock::Unknown => {} } diff --git a/crates/openfang-runtime/src/drivers/anthropic.rs b/crates/openfang-runtime/src/drivers/anthropic.rs index 857774e26..4d79c2a81 100644 --- a/crates/openfang-runtime/src/drivers/anthropic.rs +++ b/crates/openfang-runtime/src/drivers/anthropic.rs @@ -573,6 +573,12 @@ fn convert_message(msg: &Message) -> ApiMessage { data: data.clone(), }, }), + ContentBlock::ImageUrl { url } => { + // Anthropic requires base64; pass as text description for now. + Some(ApiContentBlock::Text { + text: format!("[Image: {url}]"), + }) + } ContentBlock::ToolUse { id, name, input, .. 
} => Some(ApiContentBlock::ToolUse { id: id.clone(), name: name.clone(), diff --git a/crates/openfang-runtime/src/drivers/claude_code.rs b/crates/openfang-runtime/src/drivers/claude_code.rs index 1cdfe3b44..a677512c1 100644 --- a/crates/openfang-runtime/src/drivers/claude_code.rs +++ b/crates/openfang-runtime/src/drivers/claude_code.rs @@ -7,8 +7,9 @@ use crate::llm_driver::{CompletionRequest, CompletionResponse, LlmDriver, LlmError, StreamEvent}; use async_trait::async_trait; -use openfang_types::message::{ContentBlock, Role, StopReason, TokenUsage}; +use openfang_types::message::{ContentBlock, MessageContent, Role, StopReason, TokenUsage}; use serde::Deserialize; +use std::path::PathBuf; use tokio::io::AsyncBufReadExt; use tracing::{debug, warn}; @@ -90,8 +91,12 @@ impl ClaudeCodeDriver { } /// Build a text prompt from the completion request messages. - fn build_prompt(request: &CompletionRequest) -> String { + /// + /// Image content blocks are represented as `[Attached image: ]` + /// placeholders — the actual image files are passed via `--files`. + fn build_prompt(request: &CompletionRequest, image_files: &[PathBuf]) -> String { let mut parts = Vec::new(); + let mut img_idx = 0; if let Some(ref sys) = request.system { parts.push(format!("[System]\n{sys}")); @@ -103,15 +108,135 @@ impl ClaudeCodeDriver { Role::Assistant => "Assistant", Role::System => "System", }; - let text = msg.content.text_content(); - if !text.is_empty() { - parts.push(format!("[{role_label}]\n{text}")); + + let mut msg_parts = Vec::new(); + + match &msg.content { + MessageContent::Text(s) => { + if !s.is_empty() { + msg_parts.push(s.clone()); + } + } + MessageContent::Blocks(blocks) => { + for block in blocks { + match block { + ContentBlock::Text { text, .. } => { + if !text.is_empty() { + msg_parts.push(text.clone()); + } + } + ContentBlock::Image { .. } | ContentBlock::ImageUrl { .. 
} => { + if img_idx < image_files.len() { + let fname = image_files[img_idx] + .file_name() + .map(|n| n.to_string_lossy().to_string()) + .unwrap_or_else(|| format!("image_{img_idx}")); + msg_parts.push(format!("[Attached image: {fname}]")); + img_idx += 1; + } + } + _ => {} + } + } + } + } + + if !msg_parts.is_empty() { + let combined = msg_parts.join("\n"); + parts.push(format!("[{role_label}]\n{combined}")); } } parts.join("\n\n") } + /// Extract image content blocks from messages and write them to temp files. + /// + /// Returns the list of temp file paths. The caller is responsible for + /// cleaning them up after the CLI finishes. + async fn extract_images_to_temp(request: &CompletionRequest) -> Vec<PathBuf> { + use base64::Engine; + + let mut paths = Vec::new(); + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0); + + for msg in &request.messages { + if let MessageContent::Blocks(blocks) = &msg.content { + for (i, block) in blocks.iter().enumerate() { + match block { + ContentBlock::Image { media_type, data } => { + let ext = media_type + .strip_prefix("image/") + .unwrap_or("png"); + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.{ext}" + )); + if let Ok(bytes) = + base64::engine::general_purpose::STANDARD.decode(data) + { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + ContentBlock::ImageUrl { url } => { + // If it's a data URI, decode it; otherwise download + if let Some(rest) = url.strip_prefix("data:") { + // data:image/png;base64, + if let Some((meta, b64)) = rest.split_once(",") { + let ext = meta + .split(';') + .next() + .and_then(|m| m.strip_prefix("image/")) + .unwrap_or("png"); + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.{ext}" + )); + if let Ok(bytes) = + base64::engine::general_purpose::STANDARD.decode(b64) + { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + } else { + // HTTP(S) URL 
— try to download + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.jpg" + )); + match reqwest::get(url).await { + Ok(resp) => { + if let Ok(bytes) = resp.bytes().await { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + Err(e) => { + warn!(url = %url, error = %e, "Failed to download image for Claude CLI"); + } + } + } + } + _ => {} + } + } + } + } + paths + } + + /// Clean up temporary image files. + fn cleanup_temp_images(paths: &[PathBuf]) { + for p in paths { + let _ = std::fs::remove_file(p); + } + } + /// Map a model ID like "claude-code/opus" to CLI --model flag value. fn model_flag(model: &str) -> Option { let stripped = model @@ -196,7 +321,8 @@ impl LlmDriver for ClaudeCodeDriver { &self, request: CompletionRequest, ) -> Result { - let prompt = Self::build_prompt(&request); + let image_files = Self::extract_images_to_temp(&request).await; + let prompt = Self::build_prompt(&request, &image_files); let model_flag = Self::model_flag(&request.model); let mut cmd = tokio::process::Command::new(&self.cli_path); @@ -213,6 +339,11 @@ impl LlmDriver for ClaudeCodeDriver { cmd.arg("--model").arg(model); } + // Attach image files so the CLI can see them + for img_path in &image_files { + cmd.arg("--files").arg(img_path); + } + Self::apply_env_filter(&mut cmd); cmd.stdout(std::process::Stdio::piped()); @@ -230,6 +361,7 @@ impl LlmDriver for ClaudeCodeDriver { )))?; if !output.status.success() { + Self::cleanup_temp_images(&image_files); let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); let detail = if !stderr.is_empty() { &stderr } else { &stdout }; @@ -261,6 +393,7 @@ impl LlmDriver for ClaudeCodeDriver { }); } + Self::cleanup_temp_images(&image_files); let stdout = String::from_utf8_lossy(&output.stdout); // Try JSON parse first @@ -299,7 +432,8 @@ impl LlmDriver for ClaudeCodeDriver { request: CompletionRequest, tx: 
tokio::sync::mpsc::Sender, ) -> Result { - let prompt = Self::build_prompt(&request); + let image_files = Self::extract_images_to_temp(&request).await; + let prompt = Self::build_prompt(&request, &image_files); let model_flag = Self::model_flag(&request.model); let mut cmd = tokio::process::Command::new(&self.cli_path); @@ -317,6 +451,11 @@ impl LlmDriver for ClaudeCodeDriver { cmd.arg("--model").arg(model); } + // Attach image files so the CLI can see them + for img_path in &image_files { + cmd.arg("--files").arg(img_path); + } + Self::apply_env_filter(&mut cmd); cmd.stdout(std::process::Stdio::piped()); @@ -412,6 +551,8 @@ impl LlmDriver for ClaudeCodeDriver { .await .map_err(|e| LlmError::Http(format!("Claude CLI wait failed: {e}")))?; + Self::cleanup_temp_images(&image_files); + if !status.success() { warn!(code = ?status.code(), "Claude CLI exited with error"); } @@ -486,7 +627,7 @@ mod tests { thinking: None, }; - let prompt = ClaudeCodeDriver::build_prompt(&request); + let prompt = ClaudeCodeDriver::build_prompt(&request, &[]); assert!(prompt.contains("[System]")); assert!(prompt.contains("You are helpful.")); assert!(prompt.contains("[User]")); diff --git a/crates/openfang-runtime/src/drivers/gemini.rs b/crates/openfang-runtime/src/drivers/gemini.rs index f70a9efe0..651b6088e 100644 --- a/crates/openfang-runtime/src/drivers/gemini.rs +++ b/crates/openfang-runtime/src/drivers/gemini.rs @@ -291,6 +291,14 @@ fn convert_messages( }, }); } + ContentBlock::ImageUrl { url } => { + // Gemini supports fileData for URL-based images; + // fall back to a text description if not supported. + parts.push(GeminiPart::Text { + text: format!("[Image: {url}]"), + thought_signature: None, + }); + } ContentBlock::ToolResult { content, tool_name, .. 
} => { diff --git a/crates/openfang-runtime/src/drivers/openai.rs b/crates/openfang-runtime/src/drivers/openai.rs index 15a5a6657..c52e772cf 100644 --- a/crates/openfang-runtime/src/drivers/openai.rs +++ b/crates/openfang-runtime/src/drivers/openai.rs @@ -278,6 +278,13 @@ impl LlmDriver for OpenAIDriver { }, }); } + ContentBlock::ImageUrl { url } => { + parts.push(OaiContentPart::ImageUrl { + image_url: OaiImageUrl { + url: url.clone(), + }, + }); + } ContentBlock::Thinking { .. } => {} _ => {} } diff --git a/crates/openfang-types/src/config.rs b/crates/openfang-types/src/config.rs index 76c3a5107..811784008 100644 --- a/crates/openfang-types/src/config.rs +++ b/crates/openfang-types/src/config.rs @@ -1393,6 +1393,10 @@ pub struct DefaultModelConfig { pub api_key_env: String, /// Optional base URL override. pub base_url: Option<String>, + /// Optional vision-capable model for image messages. + /// When set, agents receiving images will automatically use this model + /// instead of the default (which may not support vision). + pub vision_model: Option<String>, } impl Default for DefaultModelConfig { @@ -1402,6 +1406,7 @@ impl Default for DefaultModelConfig { model: "claude-sonnet-4-20250514".to_string(), api_key_env: "ANTHROPIC_API_KEY".to_string(), base_url: None, + vision_model: None, } } } diff --git a/crates/openfang-types/src/message.rs b/crates/openfang-types/src/message.rs index 99be59571..72a608e88 100644 --- a/crates/openfang-types/src/message.rs +++ b/crates/openfang-types/src/message.rs @@ -56,6 +56,12 @@ pub enum ContentBlock { /// Base64-encoded image data. data: String, }, + /// A URL-referenced image (for providers like DashScope that prefer URLs over base64). + #[serde(rename = "image_url")] + ImageUrl { + /// The URL of the image. + url: String, + }, /// A tool use request from the assistant. #[serde(rename = "tool_use")] ToolUse { @@ -144,6 +150,7 @@ impl MessageContent { ContentBlock::Thinking { thinking } => thinking.len(), ContentBlock::ToolUse { .. 
} | ContentBlock::Image { .. } + | ContentBlock::ImageUrl { .. } | ContentBlock::Unknown => 0, }) .sum(), From 1dcf579062171830350f7e51e5b5afebe43b33cc Mon Sep 17 00:00:00 2001 From: Federico Liva Date: Thu, 12 Mar 2026 13:35:58 +0100 Subject: [PATCH 2/5] fix: skip vision model swap when current model already supports vision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check model catalog's supports_vision flag before swapping. Models like claude-opus-4-6 handle images natively — no need to swap to a separate vision model (which may use a different, unconfigured provider). Also warn when images arrive but no vision fallback is available. Co-Authored-By: Claude Opus 4.6 --- crates/openfang-kernel/src/kernel.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index 0f56139fd..d25a2d401 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -2262,9 +2262,9 @@ impl OpenFangKernel { } } - // If message contains images and a vision model is configured, swap to it. - // Many text models (e.g. qwen-plus) don't support image input — the vision - // model (e.g. qwen-vl-plus) handles multimodal content correctly. + // If message contains images and the current model doesn't support vision, + // swap to the configured vision model. Models that already support vision + // (e.g. claude-opus-4-6) keep running — no swap needed. 
if let Some(ref blocks) = content_blocks { let has_images = blocks.iter().any(|b| { matches!( @@ -2274,7 +2274,20 @@ impl OpenFangKernel { ) }); if has_images { - if let Some(ref vision_model) = self.config.default_model.vision_model { + let current_supports_vision = self + .model_catalog + .read() + .ok() + .and_then(|cat| cat.find_model(&manifest.model.model).map(|m| m.supports_vision)) + .unwrap_or(false); + + if current_supports_vision { + info!( + agent = %manifest.name, + model = %manifest.model.model, + "Current model supports vision — skipping swap" + ); + } else if let Some(ref vision_model) = self.config.default_model.vision_model { info!( agent = %manifest.name, default_model = %manifest.model.model, @@ -2286,6 +2299,12 @@ impl OpenFangKernel { // model. Without this swap, an agent using e.g. claude-code // would try to send the image to the wrong driver. manifest.model.provider = self.config.default_model.provider.clone(); + } else { + warn!( + agent = %manifest.name, + model = %manifest.model.model, + "Image received but model lacks vision and no vision_model configured" + ); } } } From be0b72a8ada5887ba3f81087e6f624ab08911713 Mon Sep 17 00:00:00 2001 From: Federico Liva Date: Thu, 12 Mar 2026 13:39:39 +0100 Subject: [PATCH 3/5] fix: vision model swap respects config priority over current model When vision_model is explicitly set in config, always use it (forced override). Only fall back to the current agent model when no vision_model is configured and the model supports vision natively. 
Co-Authored-By: Claude Opus 4.6 --- crates/openfang-kernel/src/kernel.rs | 56 ++++++++++++++-------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index d25a2d401..2d5656f15 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -2262,9 +2262,10 @@ impl OpenFangKernel { } } - // If message contains images and the current model doesn't support vision, - // swap to the configured vision model. Models that already support vision - // (e.g. claude-opus-4-6) keep running — no swap needed. + // Vision model selection for image content. + // Priority: 1) explicit vision_model from config (forced override) + // 2) current agent model if it supports vision (no swap needed) + // 3) error — no vision capability available if let Some(ref blocks) = content_blocks { let has_images = blocks.iter().any(|b| { matches!( @@ -2274,37 +2275,38 @@ impl OpenFangKernel { ) }); if has_images { - let current_supports_vision = self - .model_catalog - .read() - .ok() - .and_then(|cat| cat.find_model(&manifest.model.model).map(|m| m.supports_vision)) - .unwrap_or(false); - - if current_supports_vision { + if let Some(ref vision_model) = self.config.default_model.vision_model { + // Explicit vision_model configured — always use it info!( agent = %manifest.name, - model = %manifest.model.model, - "Current model supports vision — skipping swap" - ); - } else if let Some(ref vision_model) = self.config.default_model.vision_model { - info!( - agent = %manifest.name, - default_model = %manifest.model.model, + current_model = %manifest.model.model, vision_model = %vision_model, - "Swapping to vision model for image content" + "Swapping to configured vision model for image content" ); manifest.model.model = vision_model.clone(); - // The vision model lives on the same provider as the default - // model. Without this swap, an agent using e.g. 
claude-code - // would try to send the image to the wrong driver. manifest.model.provider = self.config.default_model.provider.clone(); } else { - warn!( - agent = %manifest.name, - model = %manifest.model.model, - "Image received but model lacks vision and no vision_model configured" - ); + // No vision_model forced — check if current model handles vision + let current_supports_vision = self + .model_catalog + .read() + .ok() + .and_then(|cat| cat.find_model(&manifest.model.model).map(|m| m.supports_vision)) + .unwrap_or(false); + + if current_supports_vision { + info!( + agent = %manifest.name, + model = %manifest.model.model, + "Current model supports vision — no swap needed" + ); + } else { + warn!( + agent = %manifest.name, + model = %manifest.model.model, + "Image received but no vision_model configured and current model lacks vision support" + ); + } } } } From 4522be3ac9ec8f6949bcdbb02ed26cdb5a6f8b86 Mon Sep 17 00:00:00 2001 From: Federico Liva Date: Thu, 12 Mar 2026 14:05:44 +0100 Subject: [PATCH 4/5] fix: use --file instead of --files for Claude Code CLI The CLI option is --file (singular), not --files. Co-Authored-By: Claude Opus 4.6 --- crates/openfang-runtime/src/drivers/claude_code.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/openfang-runtime/src/drivers/claude_code.rs b/crates/openfang-runtime/src/drivers/claude_code.rs index a677512c1..53e2526f3 100644 --- a/crates/openfang-runtime/src/drivers/claude_code.rs +++ b/crates/openfang-runtime/src/drivers/claude_code.rs @@ -93,7 +93,7 @@ impl ClaudeCodeDriver { /// Build a text prompt from the completion request messages. /// /// Image content blocks are represented as `[Attached image: ]` - /// placeholders — the actual image files are passed via `--files`. + /// placeholders — the actual image files are passed via `--file`. 
fn build_prompt(request: &CompletionRequest, image_files: &[PathBuf]) -> String { let mut parts = Vec::new(); let mut img_idx = 0; @@ -341,7 +341,7 @@ impl LlmDriver for ClaudeCodeDriver { // Attach image files so the CLI can see them for img_path in &image_files { - cmd.arg("--files").arg(img_path); + cmd.arg("--file").arg(img_path); } Self::apply_env_filter(&mut cmd); @@ -453,7 +453,7 @@ impl LlmDriver for ClaudeCodeDriver { // Attach image files so the CLI can see them for img_path in &image_files { - cmd.arg("--files").arg(img_path); + cmd.arg("--file").arg(img_path); } Self::apply_env_filter(&mut cmd); From 0cc01b068267132ebbf3fe071d5778be0e0f2cdd Mon Sep 17 00:00:00 2001 From: Federico Liva Date: Thu, 12 Mar 2026 14:43:57 +0100 Subject: [PATCH 5/5] fix: use @path syntax for local images instead of --file flag The --file flag requires a session token for file downloads (Files API). Instead, embed @/tmp/image.jpg directly in the prompt text, which tells Claude Code CLI to read the local file natively. Co-Authored-By: Claude Opus 4.6 --- .../src/drivers/claude_code.rs | 21 ++++--------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/crates/openfang-runtime/src/drivers/claude_code.rs b/crates/openfang-runtime/src/drivers/claude_code.rs index 53e2526f3..d1a220443 100644 --- a/crates/openfang-runtime/src/drivers/claude_code.rs +++ b/crates/openfang-runtime/src/drivers/claude_code.rs @@ -92,8 +92,8 @@ impl ClaudeCodeDriver { /// Build a text prompt from the completion request messages. /// - /// Image content blocks are represented as `[Attached image: ]` - /// placeholders — the actual image files are passed via `--file`. + /// Image content blocks are referenced using Claude Code's `@path` syntax, + /// which tells the CLI to read the local file directly. 
fn build_prompt(request: &CompletionRequest, image_files: &[PathBuf]) -> String { let mut parts = Vec::new(); let mut img_idx = 0; @@ -127,11 +127,8 @@ impl ClaudeCodeDriver { } ContentBlock::Image { .. } | ContentBlock::ImageUrl { .. } => { if img_idx < image_files.len() { - let fname = image_files[img_idx] - .file_name() - .map(|n| n.to_string_lossy().to_string()) - .unwrap_or_else(|| format!("image_{img_idx}")); - msg_parts.push(format!("[Attached image: {fname}]")); + let path = &image_files[img_idx]; + msg_parts.push(format!("@{}", path.display())); img_idx += 1; } } @@ -339,11 +336,6 @@ impl LlmDriver for ClaudeCodeDriver { cmd.arg("--model").arg(model); } - // Attach image files so the CLI can see them - for img_path in &image_files { - cmd.arg("--file").arg(img_path); - } - Self::apply_env_filter(&mut cmd); cmd.stdout(std::process::Stdio::piped()); @@ -451,11 +443,6 @@ impl LlmDriver for ClaudeCodeDriver { cmd.arg("--model").arg(model); } - // Attach image files so the CLI can see them - for img_path in &image_files { - cmd.arg("--file").arg(img_path); - } - Self::apply_env_filter(&mut cmd); cmd.stdout(std::process::Stdio::piped());