Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions pkg/chat/chat.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,44 @@ const (
ImageURLDetailAuto ImageURLDetail = "auto"
)

// FileSourceType indicates how the file should be referenced in API calls.
// It is a string-backed enumeration; the recognized values are the
// FileSourceType* constants declared alongside it.
type FileSourceType string

// Recognized FileSourceType values.
const (
// FileSourceTypeNone is the empty string and therefore the zero value of
// FileSourceType. It means there is no file reference; a URL or base64
// data is used instead.
FileSourceTypeNone FileSourceType = ""
// FileSourceTypeFileID means the file was uploaded and should be
// referenced by its ID.
FileSourceTypeFileID FileSourceType = "file_id"
// FileSourceTypeFileURI means the file was uploaded and should be
// referenced by its URI (Gemini).
FileSourceTypeFileURI FileSourceType = "file_uri"
// FileSourceTypeLocalPath means the file is a local path that still needs
// to be uploaded or converted.
FileSourceTypeLocalPath FileSourceType = "local_path"
)

// FileReference contains information about a file attachment.
//
// SourceType selects which of FileID, FileURI or LocalPath applies. Every
// field is tagged omitempty, so empty fields are left out of the JSON encoding.
type FileReference struct {
// SourceType indicates how this file should be referenced.
SourceType FileSourceType `json:"source_type,omitempty"`
// FileID is the provider-specific file identifier (for FileSourceTypeFileID).
FileID string `json:"file_id,omitempty"`
// FileURI is the file URI (for FileSourceTypeFileURI, used by Gemini).
FileURI string `json:"file_uri,omitempty"`
// LocalPath is the path to a local file (for FileSourceTypeLocalPath).
LocalPath string `json:"local_path,omitempty"`
// MimeType is the MIME type of the file.
MimeType string `json:"mime_type,omitempty"`
// Provider identifies which provider this reference is for (when uploaded).
Provider string `json:"provider,omitempty"`
}

// MessageImageURL describes an image carried by a message part. The image is
// given either directly through URL or indirectly through FileRef. All fields
// are tagged omitempty and are left out of the JSON encoding when empty.
type MessageImageURL struct {
// URL contains a data URL (base64) or a public HTTP(S) URL.
URL string `json:"url,omitempty"`
// Detail is the image detail setting, e.g. ImageURLDetailAuto ("auto").
// NOTE(review): presumably forwarded to the model provider as-is — confirm.
Detail ImageURLDetail `json:"detail,omitempty"`

// FileRef contains file reference info when the image was uploaded via Files API
// or references a local file path that needs to be processed.
FileRef *FileReference `json:"file_ref,omitempty"`
}

type Message struct {
Expand Down
85 changes: 41 additions & 44 deletions pkg/cli/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package cli
import (
"cmp"
"context"
"encoding/base64"
"encoding/json"
"fmt"
"io"
Expand Down Expand Up @@ -307,55 +306,29 @@ func ParseAttachCommand(userInput string) (messageText, attachPath string) {
return messageText, attachPath
}

// CreateUserMessageWithAttachment creates a user message with optional image attachment
// CreateUserMessageWithAttachment creates a user message with optional image attachment.
// Instead of converting to base64, this stores the file path for later processing
// by the provider (which may use Files API or base64 as appropriate).
func CreateUserMessageWithAttachment(userContent, attachmentPath string) *session.Message {
if attachmentPath == "" {
return session.UserMessage(userContent)
}

// Convert file to data URL
dataURL, err := fileToDataURL(attachmentPath)
// Resolve to absolute path
absPath, err := filepath.Abs(attachmentPath)
if err != nil {
slog.Warn("Failed to attach file", "path", attachmentPath, "error", err)
slog.Warn("Failed to resolve attachment path", "path", attachmentPath, "error", err)
return session.UserMessage(userContent)
}

// Ensure we have some text content when attaching a file
textContent := cmp.Or(strings.TrimSpace(userContent), "Please analyze this attached file.")

// Create message with multi-content including text and image
multiContent := []chat.MessagePart{
{
Type: chat.MessagePartTypeText,
Text: textContent,
},
{
Type: chat.MessagePartTypeImageURL,
ImageURL: &chat.MessageImageURL{
URL: dataURL,
Detail: chat.ImageURLDetailAuto,
},
},
}

return session.UserMessage("", multiContent...)
}

// fileToDataURL converts a file to a data URL
func fileToDataURL(filePath string) (string, error) {
// Check if file exists
if _, err := os.Stat(filePath); os.IsNotExist(err) {
return "", fmt.Errorf("file does not exist: %s", filePath)
}

// Read file content
fileBytes, err := os.ReadFile(filePath)
if err != nil {
return "", fmt.Errorf("failed to read file: %w", err)
if _, err := os.Stat(absPath); os.IsNotExist(err) {
slog.Warn("Attachment file does not exist", "path", absPath)
return session.UserMessage(userContent)
}

// Determine MIME type based on file extension
ext := strings.ToLower(filepath.Ext(filePath))
// Determine MIME type from extension
ext := strings.ToLower(filepath.Ext(absPath))
var mimeType string
switch ext {
case ".jpg", ".jpeg":
Expand All @@ -370,15 +343,39 @@ func fileToDataURL(filePath string) (string, error) {
mimeType = "image/bmp"
case ".svg":
mimeType = "image/svg+xml"
case ".pdf":
mimeType = "application/pdf"
default:
return "", fmt.Errorf("unsupported image format: %s", ext)
slog.Warn("Unsupported file format for attachment", "path", absPath, "ext", ext)
return session.UserMessage(userContent)
}

// Encode to base64
encoded := base64.StdEncoding.EncodeToString(fileBytes)
slog.Debug("Creating message with file attachment",
"path", absPath,
"mime_type", mimeType)

// Ensure we have some text content when attaching a file
textContent := cmp.Or(strings.TrimSpace(userContent), "Please analyze this attached file.")

// Create data URL
dataURL := fmt.Sprintf("data:%s;base64,%s", mimeType, encoded)
// Create message with file reference (not base64)
// The provider will handle uploading via Files API or converting to base64
multiContent := []chat.MessagePart{
{
Type: chat.MessagePartTypeText,
Text: textContent,
},
{
Type: chat.MessagePartTypeImageURL,
ImageURL: &chat.MessageImageURL{
Detail: chat.ImageURLDetailAuto,
FileRef: &chat.FileReference{
SourceType: chat.FileSourceTypeLocalPath,
LocalPath: absPath,
MimeType: mimeType,
},
},
},
}

return dataURL, nil
return session.UserMessage("", multiContent...)
}
214 changes: 214 additions & 0 deletions pkg/cli/runner_attachment_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
package cli

import (
"os"
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/docker/cagent/pkg/chat"
)

// TestCreateUserMessageWithAttachment checks that CreateUserMessageWithAttachment
// returns a plain text user message when there is no usable attachment (empty
// path, missing file, unsupported extension) and a two-part message (text plus
// a local-path file reference) when the attachment exists and has a supported
// extension.
func TestCreateUserMessageWithAttachment(t *testing.T) {
	t.Parallel()

	// Create one placeholder file per extension under test. The contents are
	// arbitrary bytes; the test cases only vary the file extension.
	tmpDir := t.TempDir()
	jpegPath := filepath.Join(tmpDir, "test.jpg")
	pngPath := filepath.Join(tmpDir, "test.png")
	gifPath := filepath.Join(tmpDir, "test.gif")
	webpPath := filepath.Join(tmpDir, "test.webp")
	pdfPath := filepath.Join(tmpDir, "test.pdf")
	unsupportedPath := filepath.Join(tmpDir, "test.xyz")

	// Deliberately never created: living inside the per-test temp dir
	// guarantees it does not exist, independent of the host filesystem.
	missingPath := filepath.Join(tmpDir, "missing.jpg")

	for _, path := range []string{jpegPath, pngPath, gifPath, webpPath, pdfPath, unsupportedPath} {
		err := os.WriteFile(path, []byte("test data"), 0o644)
		require.NoError(t, err)
	}

	tests := []struct {
		name              string
		userContent       string
		attachmentPath    string
		wantMultiContent  bool
		wantFileRef       bool
		wantMimeType      string
		wantDefaultPrompt bool
	}{
		{
			name:             "no attachment",
			userContent:      "Hello world",
			attachmentPath:   "",
			wantMultiContent: false,
		},
		{
			name:             "jpeg attachment",
			userContent:      "Check this image",
			attachmentPath:   jpegPath,
			wantMultiContent: true,
			wantFileRef:      true,
			wantMimeType:     "image/jpeg",
		},
		{
			name:             "png attachment",
			userContent:      "Analyze this",
			attachmentPath:   pngPath,
			wantMultiContent: true,
			wantFileRef:      true,
			wantMimeType:     "image/png",
		},
		{
			name:             "gif attachment",
			userContent:      "What's in this gif?",
			attachmentPath:   gifPath,
			wantMultiContent: true,
			wantFileRef:      true,
			wantMimeType:     "image/gif",
		},
		{
			name:             "webp attachment",
			userContent:      "Describe this",
			attachmentPath:   webpPath,
			wantMultiContent: true,
			wantFileRef:      true,
			wantMimeType:     "image/webp",
		},
		{
			name:             "pdf attachment",
			userContent:      "Summarize this PDF",
			attachmentPath:   pdfPath,
			wantMultiContent: true,
			wantFileRef:      true,
			wantMimeType:     "application/pdf",
		},
		{
			name:              "attachment with empty content gets default prompt",
			userContent:       "",
			attachmentPath:    jpegPath,
			wantMultiContent:  true,
			wantFileRef:       true,
			wantMimeType:      "image/jpeg",
			wantDefaultPrompt: true,
		},
		{
			name:              "attachment with whitespace content gets default prompt",
			userContent:       "   ",
			attachmentPath:    jpegPath,
			wantMultiContent:  true,
			wantFileRef:       true,
			wantMimeType:      "image/jpeg",
			wantDefaultPrompt: true,
		},
		{
			name:             "non-existent file falls back to text only",
			userContent:      "Hello",
			attachmentPath:   missingPath,
			wantMultiContent: false,
		},
		{
			name:             "unsupported format falls back to text only",
			userContent:      "Hello",
			attachmentPath:   unsupportedPath,
			wantMultiContent: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			msg := CreateUserMessageWithAttachment(tt.userContent, tt.attachmentPath)

			require.NotNil(t, msg)
			assert.Equal(t, chat.MessageRoleUser, msg.Message.Role)

			if !tt.wantMultiContent {
				assert.Empty(t, msg.Message.MultiContent)
				assert.Equal(t, tt.userContent, msg.Message.Content)
				return
			}

			// require (not assert): the parts are indexed right below, so a
			// wrong length must stop the subtest instead of panicking.
			require.Len(t, msg.Message.MultiContent, 2) // text + image

			// Check text part
			textPart := msg.Message.MultiContent[0]
			assert.Equal(t, chat.MessagePartTypeText, textPart.Type)
			if tt.wantDefaultPrompt {
				assert.Equal(t, "Please analyze this attached file.", textPart.Text)
			} else {
				assert.Equal(t, tt.userContent, textPart.Text)
			}

			// Check image part
			imagePart := msg.Message.MultiContent[1]
			assert.Equal(t, chat.MessagePartTypeImageURL, imagePart.Type)
			// require: ImageURL is dereferenced below.
			require.NotNil(t, imagePart.ImageURL)

			if tt.wantFileRef {
				// require: FileRef is dereferenced below.
				require.NotNil(t, imagePart.ImageURL.FileRef)
				fileRef := imagePart.ImageURL.FileRef
				assert.Equal(t, chat.FileSourceTypeLocalPath, fileRef.SourceType)
				assert.NotEmpty(t, fileRef.LocalPath)
				// The attachment path is expected to be stored in absolute form.
				assert.True(t, filepath.IsAbs(fileRef.LocalPath), "LocalPath should be absolute: %q", fileRef.LocalPath)
				assert.Equal(t, tt.wantMimeType, fileRef.MimeType)
			}
		})
	}
}

// TestParseAttachCommand checks that ParseAttachCommand splits user input into
// the message text and the path given to an "/attach <path>" directive,
// wherever the directive appears in the input.
func TestParseAttachCommand(t *testing.T) {
	t.Parallel()

	tests := []struct {
		name           string
		input          string
		wantText       string
		wantAttachPath string
	}{
		{
			name:           "no attach command",
			input:          "Hello world",
			wantText:       "Hello world",
			wantAttachPath: "",
		},
		{
			name:           "attach at start",
			input:          "/attach image.png describe this",
			wantText:       "describe this",
			wantAttachPath: "image.png",
		},
		{
			name:           "attach in middle",
			input:          "please /attach photo.jpg analyze it",
			wantText:       "please analyze it",
			wantAttachPath: "photo.jpg",
		},
		{
			name:           "attach only",
			input:          "/attach test.gif",
			wantText:       "",
			wantAttachPath: "test.gif",
		},
		{
			// The path here contains an underscore, not a space, so the
			// case is named for what it actually exercises.
			name:           "attach with underscore in filename",
			input:          "/attach my_image.png what is this?",
			wantText:       "what is this?",
			wantAttachPath: "my_image.png",
		},
		{
			name:           "multiline with attach",
			input:          "First line\n/attach image.jpg second part\nThird line",
			wantText:       "First line\nsecond part\nThird line",
			wantAttachPath: "image.jpg",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			t.Parallel()
			text, path := ParseAttachCommand(tt.input)
			assert.Equal(t, tt.wantText, text)
			assert.Equal(t, tt.wantAttachPath, path)
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/model/provider/anthropic/beta_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func (c *Client) createBetaStream(
return nil, err
}

converted := convertBetaMessages(messages)
converted := convertBetaMessagesWithClient(ctx, &client, messages)
if err := validateAnthropicSequencingBeta(converted); err != nil {
slog.Warn("Invalid message sequencing for Anthropic Beta API detected, attempting self-repair", "error", err)
converted = repairAnthropicSequencingBeta(converted)
Expand Down
Loading