diff --git a/Cargo.lock b/Cargo.lock index 73ba5baf3..6c7c32eb5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3792,7 +3792,7 @@ dependencies = [ [[package]] name = "openfang-api" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "axum", @@ -3829,7 +3829,7 @@ dependencies = [ [[package]] name = "openfang-channels" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "axum", @@ -3861,7 +3861,7 @@ dependencies = [ [[package]] name = "openfang-cli" -version = "0.3.46" +version = "0.3.47" dependencies = [ "clap", "clap_complete", @@ -3888,7 +3888,7 @@ dependencies = [ [[package]] name = "openfang-desktop" -version = "0.3.46" +version = "0.3.47" dependencies = [ "axum", "open", @@ -3914,7 +3914,7 @@ dependencies = [ [[package]] name = "openfang-extensions" -version = "0.3.46" +version = "0.3.47" dependencies = [ "aes-gcm", "argon2", @@ -3942,7 +3942,7 @@ dependencies = [ [[package]] name = "openfang-hands" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "dashmap", @@ -3959,7 +3959,7 @@ dependencies = [ [[package]] name = "openfang-kernel" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -3996,7 +3996,7 @@ dependencies = [ [[package]] name = "openfang-memory" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -4015,7 +4015,7 @@ dependencies = [ [[package]] name = "openfang-migrate" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "dirs 6.0.0", @@ -4034,7 +4034,7 @@ dependencies = [ [[package]] name = "openfang-runtime" -version = "0.3.46" +version = "0.3.47" dependencies = [ "anyhow", "async-trait", @@ -4068,7 +4068,7 @@ dependencies = [ [[package]] name = "openfang-skills" -version = "0.3.46" +version = "0.3.47" dependencies = [ "chrono", "hex", @@ -4091,7 +4091,7 @@ dependencies = [ [[package]] name = "openfang-types" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -4110,7 +4110,7 @@ dependencies = [ [[package]] name = "openfang-wire" -version = "0.3.46" +version = "0.3.47" dependencies = [ "async-trait", "chrono", @@ -8773,7 +8773,7 @@ checksum = "b9cc00251562a284751c9973bace760d86c0276c471b4be569fe6b068ee97a56" [[package]] name = "xtask" -version = "0.3.46" +version = "0.3.47" [[package]] name = "yoke" diff --git a/crates/openfang-api/src/routes.rs b/crates/openfang-api/src/routes.rs index d659d7169..223df0940 100644 --- a/crates/openfang-api/src/routes.rs +++ b/crates/openfang-api/src/routes.rs @@ -6852,6 +6852,7 @@ pub async fn set_provider_key( model: model_id, api_key_env: env_var.clone(), base_url: None, + vision_model: None, }; let mut guard = state .kernel diff --git a/crates/openfang-channels/src/bridge.rs b/crates/openfang-channels/src/bridge.rs index 3a8c3d68a..24e5bee23 100644 --- a/crates/openfang-channels/src/bridge.rs +++ b/crates/openfang-channels/src/bridge.rs @@ -528,25 +528,34 @@ async fn dispatch_message( return; } - // For images: download, base64 encode, and send as multimodal content blocks + // For images: build content blocks with the image URL for vision models. + // We pass the original URL rather than downloading + base64-encoding because + // many providers (DashScope/Qwen, OpenAI) prefer or require direct URLs. if let ChannelContent::Image { ref url, ref caption } = message.content { - let blocks = download_image_to_blocks(url, caption.as_deref()).await; - if blocks.iter().any(|b| matches!(b, ContentBlock::Image { .. })) { - // We have actual image data — send as structured blocks for vision - dispatch_with_blocks( - blocks, - message, - handle, - router, - adapter, - ct_str, - thread_id, - output_format, - ) - .await; - return; + let mut blocks = Vec::new(); + if let Some(cap) = caption { + if !cap.is_empty() { + blocks.push(ContentBlock::Text { + text: cap.clone(), + provider_metadata: None, + }); + } } - // Image download failed — fall through to text description below + blocks.push(ContentBlock::ImageUrl { + url: url.clone(), + }); + dispatch_with_blocks( + blocks, + message, + handle, + router, + adapter, + ct_str, + thread_id, + output_format, + ) + .await; + return; } let text = match &message.content { diff --git a/crates/openfang-kernel/src/kernel.rs b/crates/openfang-kernel/src/kernel.rs index 0b1b1cbe2..2d5656f15 100644 --- a/crates/openfang-kernel/src/kernel.rs +++ b/crates/openfang-kernel/src/kernel.rs @@ -2262,6 +2262,55 @@ impl OpenFangKernel { } } + // Vision model selection for image content. + // Priority: 1) explicit vision_model from config (forced override) + // 2) current agent model if it supports vision (no swap needed) + // 3) error — no vision capability available + if let Some(ref blocks) = content_blocks { + let has_images = blocks.iter().any(|b| { + matches!( + b, + openfang_types::message::ContentBlock::Image { .. } + | openfang_types::message::ContentBlock::ImageUrl { .. } + ) + }); + if has_images { + if let Some(ref vision_model) = self.config.default_model.vision_model { + // Explicit vision_model configured — always use it + info!( + agent = %manifest.name, + current_model = %manifest.model.model, + vision_model = %vision_model, + "Swapping to configured vision model for image content" + ); + manifest.model.model = vision_model.clone(); + manifest.model.provider = self.config.default_model.provider.clone(); + } else { + // No vision_model forced — check if current model handles vision + let current_supports_vision = self + .model_catalog + .read() + .ok() + .and_then(|cat| cat.find_model(&manifest.model.model).map(|m| m.supports_vision)) + .unwrap_or(false); + + if current_supports_vision { + info!( + agent = %manifest.name, + model = %manifest.model.model, + "Current model supports vision — no swap needed" + ); + } else { + warn!( + agent = %manifest.name, + model = %manifest.model.model, + "Image received but no vision_model configured and current model lacks vision support" + ); + } + } + } + } + let driver = self.resolve_driver(&manifest)?; // Look up model's actual context window from the catalog diff --git a/crates/openfang-memory/src/session.rs b/crates/openfang-memory/src/session.rs index 74862c372..a7d1a83aa 100644 --- a/crates/openfang-memory/src/session.rs +++ b/crates/openfang-memory/src/session.rs @@ -584,6 +584,9 @@ impl SessionStore { ContentBlock::Image { media_type, .. } => { text_parts.push(format!("[image: {media_type}]")); } + ContentBlock::ImageUrl { ref url } => { + text_parts.push(format!("[image: {url}]")); + } ContentBlock::Thinking { thinking } => { text_parts.push(format!( "[thinking: {}]", diff --git a/crates/openfang-runtime/src/compactor.rs b/crates/openfang-runtime/src/compactor.rs index 855705469..e9e246a86 100644 --- a/crates/openfang-runtime/src/compactor.rs +++ b/crates/openfang-runtime/src/compactor.rs @@ -399,6 +399,9 @@ fn build_conversation_text(messages: &[Message], config: &CompactionConfig) -> S ContentBlock::Image { media_type, .. } => { conversation_text.push_str(&format!("[Image: {media_type}]\n\n")); } + ContentBlock::ImageUrl { url } => { + conversation_text.push_str(&format!("[Image: {url}]\n\n")); + } ContentBlock::Thinking { .. } => {} ContentBlock::Unknown => {} } diff --git a/crates/openfang-runtime/src/drivers/anthropic.rs b/crates/openfang-runtime/src/drivers/anthropic.rs index 857774e26..4d79c2a81 100644 --- a/crates/openfang-runtime/src/drivers/anthropic.rs +++ b/crates/openfang-runtime/src/drivers/anthropic.rs @@ -573,6 +573,12 @@ fn convert_message(msg: &Message) -> ApiMessage { data: data.clone(), }, }), + ContentBlock::ImageUrl { url } => { + // Anthropic requires base64; pass as text description for now. + Some(ApiContentBlock::Text { + text: format!("[Image: {url}]"), + }) + } ContentBlock::ToolUse { id, name, input, .. } => Some(ApiContentBlock::ToolUse { id: id.clone(), name: name.clone(), diff --git a/crates/openfang-runtime/src/drivers/claude_code.rs b/crates/openfang-runtime/src/drivers/claude_code.rs index 1cdfe3b44..d1a220443 100644 --- a/crates/openfang-runtime/src/drivers/claude_code.rs +++ b/crates/openfang-runtime/src/drivers/claude_code.rs @@ -7,8 +7,9 @@ use crate::llm_driver::{CompletionRequest, CompletionResponse, LlmDriver, LlmError, StreamEvent}; use async_trait::async_trait; -use openfang_types::message::{ContentBlock, Role, StopReason, TokenUsage}; +use openfang_types::message::{ContentBlock, MessageContent, Role, StopReason, TokenUsage}; use serde::Deserialize; +use std::path::PathBuf; use tokio::io::AsyncBufReadExt; use tracing::{debug, warn}; @@ -90,8 +91,12 @@ impl ClaudeCodeDriver { } /// Build a text prompt from the completion request messages. - fn build_prompt(request: &CompletionRequest) -> String { + /// + /// Image content blocks are referenced using Claude Code's `@path` syntax, + /// which tells the CLI to read the local file directly. + fn build_prompt(request: &CompletionRequest, image_files: &[PathBuf]) -> String { let mut parts = Vec::new(); + let mut img_idx = 0; if let Some(ref sys) = request.system { parts.push(format!("[System]\n{sys}")); @@ -103,15 +108,132 @@ impl ClaudeCodeDriver { Role::Assistant => "Assistant", Role::System => "System", }; - let text = msg.content.text_content(); - if !text.is_empty() { - parts.push(format!("[{role_label}]\n{text}")); + + let mut msg_parts = Vec::new(); + + match &msg.content { + MessageContent::Text(s) => { + if !s.is_empty() { + msg_parts.push(s.clone()); + } + } + MessageContent::Blocks(blocks) => { + for block in blocks { + match block { + ContentBlock::Text { text, .. } => { + if !text.is_empty() { + msg_parts.push(text.clone()); + } + } + ContentBlock::Image { .. } | ContentBlock::ImageUrl { .. } => { + if img_idx < image_files.len() { + let path = &image_files[img_idx]; + msg_parts.push(format!("@{}", path.display())); + img_idx += 1; + } + } + _ => {} + } + } + } + } + + if !msg_parts.is_empty() { + let combined = msg_parts.join("\n"); + parts.push(format!("[{role_label}]\n{combined}")); } } parts.join("\n\n") } + /// Extract image content blocks from messages and write them to temp files. + /// + /// Returns the list of temp file paths. The caller is responsible for + /// cleaning them up after the CLI finishes. + async fn extract_images_to_temp(request: &CompletionRequest) -> Vec { + use base64::Engine; + + let mut paths = Vec::new(); + let ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or(0); + + for msg in &request.messages { + if let MessageContent::Blocks(blocks) = &msg.content { + for (i, block) in blocks.iter().enumerate() { + match block { + ContentBlock::Image { media_type, data } => { + let ext = media_type + .strip_prefix("image/") + .unwrap_or("png"); + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.{ext}" + )); + if let Ok(bytes) = + base64::engine::general_purpose::STANDARD.decode(data) + { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + ContentBlock::ImageUrl { url } => { + // If it's a data URI, decode it; otherwise download + if let Some(rest) = url.strip_prefix("data:") { + // data:image/png;base64, + if let Some((meta, b64)) = rest.split_once(",") { + let ext = meta + .split(';') + .next() + .and_then(|m| m.strip_prefix("image/")) + .unwrap_or("png"); + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.{ext}" + )); + if let Ok(bytes) = + base64::engine::general_purpose::STANDARD.decode(b64) + { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + } else { + // HTTP(S) URL — try to download + let path = PathBuf::from(format!( + "/tmp/openfang-img-{ts}-{i}.jpg" + )); + match reqwest::get(url).await { + Ok(resp) => { + if let Ok(bytes) = resp.bytes().await { + if std::fs::write(&path, &bytes).is_ok() { + paths.push(path); + } + } + } + Err(e) => { + warn!(url = %url, error = %e, "Failed to download image for Claude CLI"); + } + } + } + } + _ => {} + } + } + } + } + paths + } + + /// Clean up temporary image files. + fn cleanup_temp_images(paths: &[PathBuf]) { + for p in paths { + let _ = std::fs::remove_file(p); + } + } + /// Map a model ID like "claude-code/opus" to CLI --model flag value. fn model_flag(model: &str) -> Option { let stripped = model @@ -196,7 +318,8 @@ impl LlmDriver for ClaudeCodeDriver { &self, request: CompletionRequest, ) -> Result { - let prompt = Self::build_prompt(&request); + let image_files = Self::extract_images_to_temp(&request).await; + let prompt = Self::build_prompt(&request, &image_files); let model_flag = Self::model_flag(&request.model); let mut cmd = tokio::process::Command::new(&self.cli_path); @@ -230,6 +353,7 @@ impl LlmDriver for ClaudeCodeDriver { )))?; if !output.status.success() { + Self::cleanup_temp_images(&image_files); let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); let detail = if !stderr.is_empty() { &stderr } else { &stdout }; @@ -261,6 +385,7 @@ impl LlmDriver for ClaudeCodeDriver { }); } + Self::cleanup_temp_images(&image_files); let stdout = String::from_utf8_lossy(&output.stdout); // Try JSON parse first @@ -299,7 +424,8 @@ impl LlmDriver for ClaudeCodeDriver { request: CompletionRequest, tx: tokio::sync::mpsc::Sender, ) -> Result { - let prompt = Self::build_prompt(&request); + let image_files = Self::extract_images_to_temp(&request).await; + let prompt = Self::build_prompt(&request, &image_files); let model_flag = Self::model_flag(&request.model); let mut cmd = tokio::process::Command::new(&self.cli_path); @@ -412,6 +538,8 @@ impl LlmDriver for ClaudeCodeDriver { .await .map_err(|e| LlmError::Http(format!("Claude CLI wait failed: {e}")))?; + Self::cleanup_temp_images(&image_files); + if !status.success() { warn!(code = ?status.code(), "Claude CLI exited with error"); } @@ -486,7 +614,7 @@ mod tests { thinking: None, }; - let prompt = ClaudeCodeDriver::build_prompt(&request); + let prompt = ClaudeCodeDriver::build_prompt(&request, &[]); assert!(prompt.contains("[System]")); assert!(prompt.contains("You are helpful.")); assert!(prompt.contains("[User]")); diff --git a/crates/openfang-runtime/src/drivers/gemini.rs b/crates/openfang-runtime/src/drivers/gemini.rs index f70a9efe0..651b6088e 100644 --- a/crates/openfang-runtime/src/drivers/gemini.rs +++ b/crates/openfang-runtime/src/drivers/gemini.rs @@ -291,6 +291,14 @@ fn convert_messages( }, }); } + ContentBlock::ImageUrl { url } => { + // Gemini supports fileData for URL-based images; + // fall back to a text description if not supported. + parts.push(GeminiPart::Text { + text: format!("[Image: {url}]"), + thought_signature: None, + }); + } ContentBlock::ToolResult { content, tool_name, .. } => { diff --git a/crates/openfang-runtime/src/drivers/openai.rs b/crates/openfang-runtime/src/drivers/openai.rs index 15a5a6657..c52e772cf 100644 --- a/crates/openfang-runtime/src/drivers/openai.rs +++ b/crates/openfang-runtime/src/drivers/openai.rs @@ -278,6 +278,13 @@ impl LlmDriver for OpenAIDriver { }, }); } + ContentBlock::ImageUrl { url } => { + parts.push(OaiContentPart::ImageUrl { + image_url: OaiImageUrl { + url: url.clone(), + }, + }); + } ContentBlock::Thinking { .. } => {} _ => {} } diff --git a/crates/openfang-types/src/config.rs b/crates/openfang-types/src/config.rs index 76c3a5107..811784008 100644 --- a/crates/openfang-types/src/config.rs +++ b/crates/openfang-types/src/config.rs @@ -1393,6 +1393,10 @@ pub struct DefaultModelConfig { pub api_key_env: String, /// Optional base URL override. pub base_url: Option, + /// Optional vision-capable model for image messages. + /// When set, agents receiving images will automatically use this model + /// instead of the default (which may not support vision). + pub vision_model: Option, } impl Default for DefaultModelConfig { @@ -1402,6 +1406,7 @@ impl Default for DefaultModelConfig { model: "claude-sonnet-4-20250514".to_string(), api_key_env: "ANTHROPIC_API_KEY".to_string(), base_url: None, + vision_model: None, } } } diff --git a/crates/openfang-types/src/message.rs b/crates/openfang-types/src/message.rs index 99be59571..72a608e88 100644 --- a/crates/openfang-types/src/message.rs +++ b/crates/openfang-types/src/message.rs @@ -56,6 +56,12 @@ pub enum ContentBlock { /// Base64-encoded image data. data: String, }, + /// A URL-referenced image (for providers like DashScope that prefer URLs over base64). + #[serde(rename = "image_url")] + ImageUrl { + /// The URL of the image. + url: String, + }, /// A tool use request from the assistant. #[serde(rename = "tool_use")] ToolUse { @@ -144,6 +150,7 @@ impl MessageContent { ContentBlock::Thinking { thinking } => thinking.len(), ContentBlock::ToolUse { .. } | ContentBlock::Image { .. } + | ContentBlock::ImageUrl { .. } | ContentBlock::Unknown => 0, }) .sum(),