From bad1eb32d414ceb9b9b790bc535f565c28c484ca Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 13:46:55 +0530 Subject: [PATCH 1/6] =?UTF-8?q?feat:=20Call=20Mode=20architecture=20?= =?UTF-8?q?=E2=80=94=20CallRoom=20participant=20model=20+=20MCP=20join/lea?= =?UTF-8?q?ve=20detection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CallRoom.swift: Participant model (llama/claudeCode/connection kinds), state machine (idle/thinking/streaming/paused), gesture-based addressing (finger count → slot), pause/remove lifecycle, claudeCodeIsActive gate for MCP transcript - MCPServer.swift: initialize handshake fires onJoined callback; 60s inactivity timer fires onLeft; autoclawd_get_audio_transcript returns standby message when paused - AppState: callRoom singleton added - AppDelegate: wires isPausedProvider, onJoined, onLeft into MCPServer.start() - MainPanelView: CallModeZoomView (3-panel zoom-call layout), hides PixelWorld in callMode - CameraPreviewView: screen preview constrained to parent frame (fixes chevron disappearing) - PillMode, PipelineModels, ChunkManager, WidgetCanvasViews, SkillStore: call mode wiring Co-Authored-By: Claude Sonnet 4.6 --- Sources/AppDelegate.swift | 20 ++ Sources/AppState.swift | 4 + Sources/CallModeSession.swift | 314 ++++++++++++++++++++++ Sources/CallRoom.swift | 169 ++++++++++++ Sources/CameraPreviewView.swift | 1 + Sources/ChunkManager.swift | 9 + Sources/MCPServer.swift | 461 ++++++++++++++++++++++++++++++++ Sources/MainPanelView.swift | 213 ++++++++++++++- Sources/PillMode.swift | 5 + Sources/PipelineModels.swift | 4 + Sources/ScreenGrabService.swift | 251 +++++++++++++++++ Sources/SkillStore.swift | 59 ++++ Sources/WidgetCanvasViews.swift | 115 ++++++++ 13 files changed, 1623 insertions(+), 2 deletions(-) create mode 100644 Sources/CallModeSession.swift create mode 100644 Sources/CallRoom.swift create mode 100644 Sources/MCPServer.swift create mode 100644 
Sources/ScreenGrabService.swift diff --git a/Sources/AppDelegate.swift b/Sources/AppDelegate.swift index 8ba6fcc..0624df2 100644 --- a/Sources/AppDelegate.swift +++ b/Sources/AppDelegate.swift @@ -33,6 +33,23 @@ final class AppDelegate: NSObject, NSApplicationDelegate { // recording attempt — preventing the first chunk from silently failing. requestPermissionsUpfront() + // Start embedded MCP server so any Claude Code session can call screen-grab tools. + // Configure Claude Code with: { "mcpServers": { "autoclawd": { "type": "http", "url": "http://localhost:7892/mcp" } } } + let screenGrab = ScreenGrabService() + MCPServer.shared.start( + screenGrab: screenGrab, + transcriptProvider: { [weak self] in self?.appState.liveTranscriptText ?? "" }, + isPausedProvider: { [weak self] in !(self?.appState.callRoom.claudeCodeIsActive ?? true) }, + canvasWriter: { [weak self] text in self?.appState.callModeSession.appendExternalMessage(text) }, + onJoined: { [weak self] in self?.appState.callRoom.claudeCodeJoined() }, + onLeft: { [weak self] in self?.appState.callRoom.claudeCodeLeft() } + ) + + // Configure call mode session with the same transcript provider. + appState.callModeSession.configure( + transcriptProvider: { [weak self] in self?.appState.liveTranscriptText ?? "" } + ) + // Toast window disabled — logs are now shown inline inside the widget. // AutoClawdLogger.toastPublisher // .receive(on: DispatchQueue.main) @@ -721,6 +738,9 @@ struct PillContentView: View { typedText: typed ) return AnyView(v) + case .callMode: + let v = CallModeCanvasView(session: appState.callModeSession) + return AnyView(v) } } } diff --git a/Sources/AppState.swift b/Sources/AppState.swift index ddf491e..2fe6dc9 100644 --- a/Sources/AppState.swift +++ b/Sources/AppState.swift @@ -209,6 +209,10 @@ final class AppState: ObservableObject { @Published var sessionLifecycle: SessionLifecycleState = .undefined @Published var sessionConfig: SessionConfig? 
+ // Call mode state + let callModeSession = CallModeSession() + let callRoom = CallRoom() + // Code widget state @Published var codeWidgetStep: CodeWidgetStep = .projectSelect @Published var codeSelectedProject: Project? = nil diff --git a/Sources/CallModeSession.swift b/Sources/CallModeSession.swift new file mode 100644 index 0000000..f2c5936 --- /dev/null +++ b/Sources/CallModeSession.swift @@ -0,0 +1,314 @@ +import Foundation +import SwiftUI + +// MARK: - CallModeSession + +/// Direct Anthropic API conversation session for Call Mode. +/// +/// Bypasses Llama entirely — voice transcript → Claude directly. +/// Claude proactively calls screen/cursor/selection tools to see what the user is looking at. +/// +/// Pipeline for call mode: +/// Mic → SFSpeech/Groq transcript → send() → Anthropic messages API +/// ↑ Claude calls tools ↓ +/// ScreenGrabService.captureScreen / captureCursorContext / captureSelection +/// ↓ Claude responds +/// @Published messages → CallModeView +@MainActor +final class CallModeSession: ObservableObject { + + @Published var messages: [CallMessage] = [] + @Published var isProcessing: Bool = false + + private var history: [[String: Any]] = [] + private let screenGrab = ScreenGrabService() + private var transcriptProvider: (() -> String)? + + // MARK: - Configuration + + func configure(transcriptProvider: @escaping () -> String) { + self.transcriptProvider = transcriptProvider + } + + // MARK: - Send + + /// Send a user message (typically from voice transcript) to Claude. + /// Claude may call screen tools before responding. 
+ func send(text: String) async { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + + let apiKey = SettingsManager.shared.anthropicAPIKey + guard !apiKey.isEmpty else { + messages.append(CallMessage(role: .error, + text: "Anthropic API key not configured.")) + return + } + + messages.append(CallMessage(role: .user, text: trimmed)) + history.append(["role": "user", "content": trimmed]) + + isProcessing = true + defer { isProcessing = false } + + do { + let reply = try await runAgentLoop(apiKey: apiKey) + if !reply.isEmpty { + messages.append(CallMessage(role: .assistant, text: reply)) + history.append(["role": "assistant", "content": reply]) + } + } catch { + messages.append(CallMessage(role: .error, text: error.localizedDescription)) + } + } + + func clearHistory() { + messages.removeAll() + history.removeAll() + } + + /// Append a message from an external source (e.g. Claude Code via MCP autoclawd_set_canvas). + func appendExternalMessage(_ text: String) { + messages.append(CallMessage(role: .external, text: text)) + } + + // MARK: - Agent Loop + + /// Tool-use loop: request → if tool_use → execute → continue → until end_turn. + private func runAgentLoop(apiKey: String) async throws -> String { + while true { + let body = makeRequestBody() + let response = try await callAnthropic(body: body, apiKey: apiKey) + + guard let stopReason = response["stop_reason"] as? String else { + throw CallModeError.invalidResponse + } + + let content = response["content"] as? [[String: Any]] ?? [] + + if stopReason == "end_turn" { + return content + .filter { $0["type"] as? String == "text" } + .compactMap { $0["text"] as? 
String } + .joined(separator: "\n") + } + + if stopReason == "tool_use" { + // Append Claude's tool-use turn to history + history.append(["role": "assistant", "content": content]) + + // Execute all tool calls in parallel, then collect results + var results: [[String: Any]] = [] + for block in content where block["type"] as? String == "tool_use" { + guard let toolID = block["id"] as? String, + let toolName = block["name"] as? String + else { continue } + + let args = block["input"] as? [String: Any] ?? [:] + let output = await executeTool(name: toolName, args: args) + + // Show tool use in messages for transparency + messages.append(CallMessage( + role: .tool, + text: "[\(toolName)]" + )) + + results.append([ + "type": "tool_result", + "tool_use_id": toolID, + "content": output + ]) + } + + history.append(["role": "user", "content": results]) + continue + } + + // Unexpected stop reason — return whatever text we have + return content + .filter { $0["type"] as? String == "text" } + .compactMap { $0["text"] as? String } + .joined(separator: "\n") + } + } + + // MARK: - Tool Execution + + private func executeTool(name: String, args: [String: Any]) async -> [[String: Any]] { + switch name { + + case "get_screen": + var region: CGRect? + if let r = args["region"] as? [String: Any], + let x = r["x"] as? CGFloat, let y = r["y"] as? CGFloat, + let w = r["width"] as? CGFloat, let h = r["height"] as? 
CGFloat { + region = CGRect(x: x, y: y, width: w, height: h) + } + let grab = await screenGrab.captureScreen(region: region) + return imageBlocks(from: grab) + + case "get_cursor_context": + let grab = await screenGrab.captureCursorContext() + return imageBlocks(from: grab) + + case "get_selection": + let sel = await screenGrab.captureSelection() + if sel.selectedText.isEmpty && sel.contextImageJPEGData == nil { + return [["type": "text", "text": "No text currently selected."]] + } + var blocks: [[String: Any]] = [] + if !sel.selectedText.isEmpty { + blocks.append(["type": "text", + "text": "Selected text:\n\(sel.selectedText)"]) + } + if let jpeg = sel.contextImageJPEGData { + blocks.append(imageBlock(jpeg)) + } + return blocks + + case "get_audio_transcript": + let maxChars = args["max_chars"] as? Int ?? 2_000 + let transcript = transcriptProvider?() ?? "" + let trimmed = transcript.count > maxChars + ? String(transcript.suffix(maxChars)) + : transcript + return [["type": "text", + "text": trimmed.isEmpty ? "No transcript available." : trimmed]] + + default: + return [["type": "text", "text": "Unknown tool: \(name)"]] + } + } + + // MARK: - Content Block Helpers + + private func imageBlocks(from grab: ScreenGrab) -> [[String: Any]] { + var blocks: [[String: Any]] = [] + let textParts = [ + grab.metadata.isEmpty ? nil : grab.metadata, + grab.ocrText.isEmpty ? 
nil : "Screen text:\n\(grab.ocrText)" + ].compactMap { $0 } + if !textParts.isEmpty { + blocks.append(["type": "text", "text": textParts.joined(separator: "\n\n")]) + } + if let jpeg = grab.imageJPEGData { + blocks.append(imageBlock(jpeg)) + } + return blocks + } + + private func imageBlock(_ jpeg: Data) -> [String: Any] { + [ + "type": "image", + "source": [ + "type": "base64", + "media_type": "image/jpeg", + "data": jpeg.base64EncodedString() + ] + ] + } + + // MARK: - Anthropic API + + private func makeRequestBody() -> [String: Any] { + [ + "model": "claude-opus-4-6", + "max_tokens": 4096, + "system": """ + You are an AI assistant running inside AutoClawd with real-time access \ + to the user's screen and microphone. You can see their screen, read OCR text, \ + and grab screenshots. Always call get_screen at the start of a new topic to \ + orient yourself. Use get_cursor_context when the user says "this" or "here" \ + without specifying. Use get_selection whenever the user has highlighted text. \ + Be concise, direct, and action-oriented. + """, + "tools": toolDefinitions(), + "messages": history + ] + } + + private func toolDefinitions() -> [[String: Any]] { + [ + [ + "name": "get_screen", + "description": "Capture the screen with OCR text and a JPEG screenshot. Optionally crop to a region.", + "input_schema": [ + "type": "object", + "properties": [ + "region": [ + "type": "object", + "properties": [ + "x": ["type": "number"], + "y": ["type": "number"], + "width": ["type": "number"], + "height": ["type": "number"] + ] + ] + ] + ] as [String: Any] + ], + [ + "name": "get_cursor_context", + "description": "Capture 600×400 region around the cursor with OCR. Use when user points at something.", + "input_schema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "get_selection", + "description": "Get selected text and screenshot of selection. 
Use when user highlights something.", + "input_schema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "get_audio_transcript", + "description": "Get recent spoken audio transcript from the user's microphone.", + "input_schema": [ + "type": "object", + "properties": [ + "max_chars": ["type": "number"] + ] + ] as [String: Any] + ] + ] + } + + private func callAnthropic(body: [String: Any], apiKey: String) async throws -> [String: Any] { + let url = URL(string: "https://api.anthropic.com/v1/messages")! + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue(apiKey, forHTTPHeaderField: "x-api-key") + req.setValue("2023-06-01", forHTTPHeaderField: "anthropic-version") + req.httpBody = try JSONSerialization.data(withJSONObject: body) + + let (data, response) = try await URLSession.shared.data(for: req) + guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { + let msg = String(data: data, encoding: .utf8) ?? "Unknown API error" + throw CallModeError.apiError(msg) + } + guard let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { + throw CallModeError.invalidResponse + } + return json + } +} + +// MARK: - Supporting Types + +struct CallMessage: Identifiable { + let id = UUID() + let role: Role + let text: String + + enum Role { case user, assistant, tool, error, external } +} + +enum CallModeError: Error, LocalizedError { + case invalidResponse + case apiError(String) + + var errorDescription: String? { + switch self { + case .invalidResponse: return "Invalid API response from Anthropic." 
+ case .apiError(let m): return m + } + } +} diff --git a/Sources/CallRoom.swift b/Sources/CallRoom.swift new file mode 100644 index 0000000..90e7b1d --- /dev/null +++ b/Sources/CallRoom.swift @@ -0,0 +1,169 @@ +import Foundation +import SwiftUI + +// MARK: - ParticipantKind + +enum ParticipantKind: Equatable { + case llama // AutoClawd PM — always present + case claudeCode // joins via MCP session + case connection(id: String, name: String) // future plugin integrations +} + +// MARK: - ParticipantState + +enum ParticipantState { + case idle // present, quiet + case thinking // processing (spinner) + case streaming // outputting text + case paused // muted — not receiving transcript context +} + +// MARK: - CallParticipant + +struct CallParticipant: Identifiable { + let id: String + let kind: ParticipantKind + var state: ParticipantState = .idle + var isPaused: Bool = false + var lastActivity: Date? + + var displayName: String { + switch kind { + case .llama: return "AutoClawd" + case .claudeCode: return "Claw'd" + case .connection(_, let name): return name + } + } + + var mascotSystemImage: String { + switch kind { + case .llama: return "brain" + case .claudeCode: return "terminal" + case .connection: return "plug" + } + } + + /// Gesture finger slot (1-based) based on current participant order. + /// Updated externally by CallRoom when participants array changes. + var gestureSlot: Int = 1 +} + +// MARK: - CallRoom + +/// Manages the set of participants in the active call and which one the user is addressing. +/// Llama is always participant[0] and cannot be removed. 
+@MainActor +final class CallRoom: ObservableObject { + + // MARK: Published State + + @Published private(set) var participants: [CallParticipant] = [] + @Published var activeParticipantID: String = "llama" + + // MARK: Init + + init() { + var llama = CallParticipant(id: "llama", kind: .llama) + llama.gestureSlot = 1 + participants = [llama] + } + + // MARK: - Active Participant + + var activeParticipant: CallParticipant? { + participants.first { $0.id == activeParticipantID } + } + + /// Select participant by left-hand finger count (1-based index into participants array). + func selectByGesture(fingerCount: Int) { + let index = fingerCount - 1 + guard index >= 0, index < participants.count else { return } + activeParticipantID = participants[index].id + } + + // MARK: - Join / Leave + + func claudeCodeJoined() { + guard !participants.contains(where: { $0.kind == .claudeCode }) else { + // Already present — refresh lastActivity + updateLastActivity(id: "claude-code") + return + } + var p = CallParticipant(id: "claude-code", kind: .claudeCode, lastActivity: Date()) + rebuildSlots() + p.gestureSlot = participants.count + 1 + participants.append(p) + rebuildSlots() + } + + func claudeCodeLeft() { + participants.removeAll { $0.kind == .claudeCode } + if activeParticipantID == "claude-code" { activeParticipantID = "llama" } + rebuildSlots() + } + + func connectionJoined(id: String, name: String) { + guard !participants.contains(where: { $0.id == id }) else { return } + var p = CallParticipant(id: id, kind: .connection(id: id, name: name)) + p.gestureSlot = participants.count + 1 + participants.append(p) + rebuildSlots() + } + + func connectionLeft(id: String) { + participants.removeAll { $0.id == id } + if activeParticipantID == id { activeParticipantID = "llama" } + rebuildSlots() + } + + // MARK: - Pause / Resume / Remove + + func togglePause(id: String) { + guard let idx = participants.firstIndex(where: { $0.id == id }) else { return } + 
participants[idx].isPaused.toggle() + if participants[idx].isPaused { participants[idx].state = .paused } + else if participants[idx].state == .paused { participants[idx].state = .idle } + } + + /// Remove a participant (Llama cannot be removed). + func remove(id: String) { + guard id != "llama" else { return } + participants.removeAll { $0.id == id } + if activeParticipantID == id { activeParticipantID = "llama" } + rebuildSlots() + } + + // MARK: - State Updates + + func setState(_ state: ParticipantState, for id: String) { + guard let idx = participants.firstIndex(where: { $0.id == id }) else { return } + guard !participants[idx].isPaused else { return } + participants[idx].state = state + participants[idx].lastActivity = Date() + } + + func updateLastActivity(id: String) { + guard let idx = participants.firstIndex(where: { $0.id == id }) else { return } + participants[idx].lastActivity = Date() + } + + // MARK: - MCP Pause Gating + + /// True when Claude Code is in the room and NOT paused — MCP transcript is live. 
+ var claudeCodeIsActive: Bool { + guard let p = participants.first(where: { $0.kind == .claudeCode }) else { return false } + return !p.isPaused + } + + var claudeCodeIsPresent: Bool { + participants.contains { $0.kind == .claudeCode } + } + + // MARK: - Helpers + + private func rebuildSlots() { + for i in participants.indices { + participants[i].gestureSlot = i + 1 + } + } +} diff --git a/Sources/CameraPreviewView.swift b/Sources/CameraPreviewView.swift index 480c171..fb463c0 100644 --- a/Sources/CameraPreviewView.swift +++ b/Sources/CameraPreviewView.swift @@ -187,6 +187,7 @@ struct CameraFeedWidget: View { Image(decorative: image, scale: 1.0) .resizable() .aspectRatio(contentMode: .fill) + .frame(maxWidth: .infinity, maxHeight: .infinity) .clipShape(RoundedRectangle(cornerRadius: 16, style: .continuous)) } else { VStack(spacing: 6) { diff --git a/Sources/ChunkManager.swift b/Sources/ChunkManager.swift index fb41b54..9f7a17e 100644 --- a/Sources/ChunkManager.swift +++ b/Sources/ChunkManager.swift @@ -465,6 +465,15 @@ final class ChunkManager: ObservableObject { } Log.info(.pipeline, "Chunk \(index) [sess:\(label)]: raw text accumulated (pipeline deferred to session end)") + case .callMode: + // Bypass all Llama stages — send directly to Claude via CallModeSession. + let chunk = transcript + await MainActor.run { + guard let appState = self.appState else { return } + Task { await appState.callModeSession.send(text: chunk) } + } + Log.info(.pipeline, "Chunk \(index) [call]: forwarded to CallModeSession") + case .aiSearch: guard let qaService, let qaStore else { break } do { diff --git a/Sources/MCPServer.swift b/Sources/MCPServer.swift new file mode 100644 index 0000000..2b536a6 --- /dev/null +++ b/Sources/MCPServer.swift @@ -0,0 +1,461 @@ +import Foundation +import Network + +// MARK: - MCPServer + +/// Embeds a lightweight HTTP MCP server into AutoClawd so any Claude Code session +/// can call screen-grab tools directly — without a separate process. 
+/// +/// Configure Claude Code once (in ~/.claude/mcp.json or .mcp.json in project root): +/// ```json +/// { +/// "mcpServers": { +/// "autoclawd": { +/// "type": "http", +/// "url": "http://localhost:7892/mcp" +/// } +/// } +/// } +/// ``` +/// +/// Available tools: +/// autoclawd_get_screen — full screen or region: OCR text + JPEG screenshot +/// autoclawd_get_cursor_context — 600×400 crop around cursor: OCR + AX element +/// autoclawd_get_selection — currently selected text + selection screenshot +/// autoclawd_get_audio_transcript— rolling mic transcript buffer +/// +/// Transport: HTTP/1.1, JSON-RPC 2.0, single-response (no SSE needed for these tools). +final class MCPServer: @unchecked Sendable { + + static let shared = MCPServer() + + let port: UInt16 = 7892 + + private var listener: NWListener? + private var screenGrab: ScreenGrabService? + /// Called on @MainActor — returns the current session transcript text. + private var transcriptProvider: (@MainActor () -> String)? + /// Called on @MainActor — true if Claude Code participant is paused; gates transcript. + private var isPausedProvider: (@MainActor () -> Bool)? + /// Called on @MainActor — pushes text to the call mode canvas. + private var canvasWriter: (@MainActor (String) -> Void)? + /// Fired on @MainActor when a Claude Code session calls `initialize`. + private var onJoined: (@MainActor () -> Void)? + /// Fired on @MainActor when no MCP activity for `leaveTimeoutSeconds`. + private var onLeft: (@MainActor () -> Void)? + + /// Last time any MCP request was received from a Claude Code session. + private var lastActivityDate: Date? + private var leaveTimer: Timer? + private let leaveTimeoutSeconds: TimeInterval = 60 + + // MARK: - Lifecycle + + /// Start the server. Idempotent — safe to call multiple times. + func start(screenGrab: ScreenGrabService, + transcriptProvider: @escaping @MainActor () -> String, + isPausedProvider: (@MainActor () -> Bool)? 
= nil, + canvasWriter: (@MainActor (String) -> Void)? = nil, + onJoined: (@MainActor () -> Void)? = nil, + onLeft: (@MainActor () -> Void)? = nil) { + guard listener == nil else { return } + self.screenGrab = screenGrab + self.transcriptProvider = transcriptProvider + self.isPausedProvider = isPausedProvider + self.canvasWriter = canvasWriter + self.onJoined = onJoined + self.onLeft = onLeft + + do { + let params = NWParameters.tcp + params.allowLocalEndpointReuse = true + listener = try NWListener(using: params, on: NWEndpoint.Port(rawValue: port)!) + listener?.newConnectionHandler = { [weak self] conn in self?.accept(conn) } + listener?.stateUpdateHandler = { [weak self] state in + guard let self else { return } + switch state { + case .ready: + Log.info(.system, "MCPServer: ready — http://localhost:\(self.port)/mcp") + case .failed(let err): + Log.warn(.system, "MCPServer: listener failed — \(err)") + default: + break + } + } + listener?.start(queue: .global(qos: .utility)) + } catch { + Log.warn(.system, "MCPServer: could not bind port \(port) — \(error)") + } + } + + func stop() { + listener?.cancel() + listener = nil + leaveTimer?.invalidate() + leaveTimer = nil + } + + // MARK: - Activity Tracking + + /// Called on every inbound MCP request so we can detect session disconnection via timeout. + private func recordActivity() { + lastActivityDate = Date() + // Reset the leave timer each time activity is seen. 
+ leaveTimer?.invalidate() + leaveTimer = Timer.scheduledTimer(withTimeInterval: leaveTimeoutSeconds, + repeats: false) { [weak self] _ in + self?.fireLeft() + } + RunLoop.main.add(leaveTimer!, forMode: .common) + } + + private func fireLeft() { + lastActivityDate = nil + leaveTimer = nil + guard let cb = onLeft else { return } + Task { @MainActor in cb() } + } + + // MARK: - Connection Handling + + private func accept(_ connection: NWConnection) { + connection.start(queue: .global(qos: .utility)) + receiveHTTP(connection: connection, buffer: Data()) + } + + /// Accumulate received bytes until we have a complete HTTP request, then process it. + private func receiveHTTP(connection: NWConnection, buffer: Data) { + connection.receive(minimumIncompleteLength: 1, maximumLength: 65_536) { [weak self] chunk, _, isComplete, error in + guard let self else { return } + if let err = error { + Log.warn(.system, "MCPServer: receive error \(err)") + return + } + var buf = buffer + if let chunk { buf.append(chunk) } + + if let (method, path, body) = self.parseHTTPRequest(buf) { + self.handleRequest(method: method, path: path, body: body, connection: connection) + } else if !isComplete { + self.receiveHTTP(connection: connection, buffer: buf) + } + } + } + + // MARK: - HTTP Parsing + + /// Returns (method, path, body) once a complete HTTP/1.1 request is buffered. + private func parseHTTPRequest(_ data: Data) -> (String, String, Data)? 
{ + // Headers end at \r\n\r\n + let sep = Data([0x0d, 0x0a, 0x0d, 0x0a]) + guard let sepRange = data.range(of: sep) else { return nil } + + let headerData = data[data.startIndex..= 2 else { return nil } + let method = String(parts[0]) + let path = String(parts[1]) + + // Content-Length + var contentLength = 0 + for line in lines.dropFirst() { + let kv = line.split(separator: ":", maxSplits: 1) + if kv.count == 2, + kv[0].lowercased().trimmingCharacters(in: .whitespaces) == "content-length" { + contentLength = Int(kv[1].trimmingCharacters(in: .whitespaces)) ?? 0 + } + } + + let bodyStart = sepRange.upperBound + let available = data.distance(from: bodyStart, to: data.endIndex) + guard available >= contentLength else { return nil } // wait for more data + + let bodyEnd = data.index(bodyStart, offsetBy: contentLength) + return (method, path, Data(data[bodyStart.. Any { + switch method { + + case "initialize": + // Fire join callback — a Claude Code session just connected. + if let cb = onJoined { + Task { @MainActor in cb() } + } + return [ + "protocolVersion": "2024-11-05", + "capabilities": ["tools": [:] as [String: Any]], + "serverInfo": ["name": "autoclawd", "version": "1.0"] + ] as [String: Any] + + case "notifications/initialized": + return [:] as [String: Any] + + case "tools/list": + return ["tools": toolDefinitions()] + + case "tools/call": + let name = params["name"] as? String ?? "" + let args = params["arguments"] as? [String: Any] ?? [:] + return await callTool(name: name, args: args) + + default: + return ["error": "Unknown method: \(method)"] as [String: Any] + } + } + + // MARK: - Tool Definitions + + private func toolDefinitions() -> [[String: Any]] { + [ + [ + "name": "autoclawd_get_screen", + "description": """ + Capture the current screen with Vision OCR text and a JPEG screenshot. \ + Optionally crop to a pixel region (screen-space, top-left origin). \ + Returns structured text for reading plus the raw image for visual inspection. 
\ + Call this first to get overall screen context. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "region": [ + "type": "object", + "description": "Pixel region to crop. Omit for full screen.", + "properties": [ + "x": ["type": "number"], + "y": ["type": "number"], + "width": ["type": "number"], + "height": ["type": "number"] + ] + ] + ] + ] as [String: Any] + ], + [ + "name": "autoclawd_get_cursor_context", + "description": """ + Capture a 600×400 screenshot centred on the user's current cursor position, \ + with OCR text and the UI element under the cursor. \ + Use this when the user points at something on screen without naming it. + """, + "inputSchema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "autoclawd_get_selection", + "description": """ + Get the user's currently highlighted/selected text and a screenshot \ + of that selection region. Use this when the user selects code, \ + an error message, a file path, or any text before speaking. + """, + "inputSchema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "autoclawd_get_audio_transcript", + "description": """ + Get the recent spoken audio transcript from the user's microphone session. \ + Useful to review what was said in the last few minutes without the user \ + having to repeat themselves. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "max_chars": [ + "type": "number", + "description": "Maximum characters to return (default 2000, most-recent)." + ] + ] + ] as [String: Any] + ], + [ + "name": "autoclawd_set_canvas", + "description": """ + Push a text message to the AutoClawd widget canvas so the user can see it \ + on the floating pill. Use this to announce your presence ("Claude Code joined \ + the call"), stream responses, or display status updates directly on the widget. 
+ """, + "inputSchema": [ + "type": "object", + "properties": [ + "text": [ + "type": "string", + "description": "The message to display on the call mode canvas." + ] + ], + "required": ["text"] + ] as [String: Any] + ] + ] + } + + // MARK: - Tool Execution + + private func callTool(name: String, args: [String: Any]) async -> [String: Any] { + var content: [[String: Any]] = [] + + switch name { + + case "autoclawd_get_screen": + var region: CGRect? + if let r = args["region"] as? [String: Any], + let x = r["x"] as? CGFloat, let y = r["y"] as? CGFloat, + let w = r["width"] as? CGFloat, let h = r["height"] as? CGFloat { + region = CGRect(x: x, y: y, width: w, height: h) + } + let grab = await screenGrab?.captureScreen(region: region) + ?? ScreenGrab(ocrText: "", metadata: "Screen service unavailable", + imageJPEGData: nil, capturedAt: Date()) + content = screenGrabBlocks(grab) + + case "autoclawd_get_cursor_context": + let grab = await screenGrab?.captureCursorContext() + ?? ScreenGrab(ocrText: "", metadata: "Screen service unavailable", + imageJPEGData: nil, capturedAt: Date()) + content = screenGrabBlocks(grab) + + case "autoclawd_get_selection": + let sel = await screenGrab?.captureSelection() + ?? SelectionGrab(selectedText: "", contextImageJPEGData: nil, capturedAt: Date()) + if sel.selectedText.isEmpty && sel.contextImageJPEGData == nil { + content = [["type": "text", "text": "No text currently selected."]] + } else { + if !sel.selectedText.isEmpty { + content.append(["type": "text", + "text": "Selected text:\n\(sel.selectedText)"]) + } + if let jpeg = sel.contextImageJPEGData { + content.append(["type": "image", + "data": jpeg.base64EncodedString(), + "mimeType": "image/jpeg"]) + } + } + + case "autoclawd_get_audio_transcript": + let maxChars = args["max_chars"] as? Int ?? 2_000 + let provider = transcriptProvider + let paused = isPausedProvider + let (transcript, isPaused) = await MainActor.run { + (provider?() ?? "", paused?() ?? 
false) + } + if isPaused { + content = [["type": "text", + "text": "Transcript paused — user and AutoClawd are planning. Stand by."]] + } else { + let trimmed = transcript.count > maxChars + ? String(transcript.suffix(maxChars)) + : transcript + content = [["type": "text", + "text": trimmed.isEmpty ? "No transcript available." : trimmed]] + } + + case "autoclawd_set_canvas": + let text = args["text"] as? String ?? "" + if !text.isEmpty, let writer = canvasWriter { + let w = writer + Task { @MainActor in w(text) } + content = [["type": "text", "text": "Canvas updated."]] + } else { + content = [["type": "text", "text": "No text provided or canvas unavailable."]] + } + + default: + content = [["type": "text", "text": "Unknown tool: \(name)"]] + } + + return ["content": content] + } + + /// Build MCP content blocks from a ScreenGrab (text + optional image). + private func screenGrabBlocks(_ grab: ScreenGrab) -> [[String: Any]] { + var blocks: [[String: Any]] = [] + let textParts = [ + grab.metadata.isEmpty ? nil : grab.metadata, + grab.ocrText.isEmpty ? nil : "Screen text:\n\(grab.ocrText)" + ].compactMap { $0 } + if !textParts.isEmpty { + blocks.append(["type": "text", "text": textParts.joined(separator: "\n\n")]) + } + if let jpeg = grab.imageJPEGData { + blocks.append(["type": "image", + "data": jpeg.base64EncodedString(), + "mimeType": "image/jpeg"]) + } + return blocks + } + + // MARK: - JSON-RPC Helpers + + private func rpcSuccess(id: Int?, result: Any) -> Data { + var obj: [String: Any] = ["jsonrpc": "2.0", "result": result] + if let id { obj["id"] = id } + return (try? JSONSerialization.data(withJSONObject: obj)) ?? Data() + } + + private func rpcError(code: Int, message: String) -> Data { + let obj: [String: Any] = [ + "jsonrpc": "2.0", + "error": ["code": code, "message": message] + ] + return (try? JSONSerialization.data(withJSONObject: obj)) ?? 
Data() + } +} diff --git a/Sources/MainPanelView.swift b/Sources/MainPanelView.swift index 2f1e7da..cf1e0bc 100644 --- a/Sources/MainPanelView.swift +++ b/Sources/MainPanelView.swift @@ -61,10 +61,18 @@ struct MainPanelView: View { @ViewBuilder private var content: some View { ZStack { + // PixelWorldView is always alive to preserve WKWebView state, + // but hidden during Call Mode (replaced by zoom-call view). PixelWorldView(appState: appState) .frame(maxWidth: .infinity, maxHeight: .infinity) - .opacity(selectedTab == .world ? 1 : 0) - .allowsHitTesting(selectedTab == .world) + .opacity(selectedTab == .world && appState.pillMode != .callMode ? 1 : 0) + .allowsHitTesting(selectedTab == .world && appState.pillMode != .callMode) + + // Zoom-call layout: replaces HQ view when Call Mode is active. + CallModeZoomView(appState: appState) + .frame(maxWidth: .infinity, maxHeight: .infinity) + .opacity(selectedTab == .world && appState.pillMode == .callMode ? 1 : 0) + .allowsHitTesting(selectedTab == .world && appState.pillMode == .callMode) ProjectsListView(appState: appState) .frame(maxWidth: .infinity, maxHeight: .infinity) @@ -86,6 +94,207 @@ struct MainPanelView: View { } +// MARK: - CallModeZoomView + +/// 3-panel "zoom call" layout shown in the panel's World tab when Call Mode is active. +/// Top: scrollable Claude message thread. Bottom: camera (left) + screen preview (right). 
+struct CallModeZoomView: View { + @ObservedObject var appState: AppState + + var body: some View { + VStack(spacing: 1) { + // Top: Claude messages thread + messagesPanel + .frame(maxWidth: .infinity, maxHeight: .infinity) + + // Bottom: camera + screen side by side + HStack(spacing: 1) { + cameraPanel + screenPanel + } + .frame(maxWidth: .infinity, minHeight: 200, maxHeight: 220) + } + .background(Color.black) + } + + // MARK: - Messages Panel + + private var messagesPanel: some View { + ZStack(alignment: .topLeading) { + Color(nsColor: .windowBackgroundColor).opacity(0.05) + + ScrollViewReader { proxy in + ScrollView { + LazyVStack(alignment: .leading, spacing: 10) { + ForEach(appState.callModeSession.messages) { msg in + CallZoomMessageRow(message: msg) + .id(msg.id) + } + } + .padding(16) + } + .onChange(of: appState.callModeSession.messages.count) { _ in + if let last = appState.callModeSession.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + } + + // Processing indicator + if appState.callModeSession.isProcessing { + VStack { + Spacer() + HStack(spacing: 6) { + ProgressView().controlSize(.mini).tint(.cyan) + Text("Claude thinking…") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.7)) + } + .padding(.horizontal, 16) + .padding(.vertical, 8) + } + } + + // Empty state + if appState.callModeSession.messages.isEmpty { + VStack(spacing: 8) { + Image(systemName: "phone.bubble") + .font(.system(size: 32)) + .foregroundColor(.cyan.opacity(0.25)) + Text("CALL MODE ACTIVE") + .font(.system(size: 11, weight: .semibold, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.3)) + Text("Speak to start the call") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } + } + } + + // MARK: - Camera Panel + + private var cameraPanel: some View { + ZStack { + Color.black + if appState.cameraEnabled && 
appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + VStack(spacing: 4) { + Image(systemName: "camera.fill") + .font(.system(size: 18)) + .foregroundColor(.white.opacity(0.15)) + Text("Camera Off") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + VStack { + Spacer() + HStack { + feedBadge(label: "LIVE", color: .red) + Spacer() + } + .padding(8) + } + } + .clipShape(RoundedRectangle(cornerRadius: 0)) + } + + // MARK: - Screen Panel + + private var screenPanel: some View { + ZStack { + Color.black + if let img = appState.screenPreviewImage { + Image(decorative: img, scale: 1.0) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + VStack(spacing: 4) { + Image(systemName: "rectangle.on.rectangle") + .font(.system(size: 18)) + .foregroundColor(.white.opacity(0.15)) + Text("No Screen") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + VStack { + Spacer() + HStack { + feedBadge(label: "SCREEN", color: .cyan) + Spacer() + } + .padding(8) + } + } + } + + // MARK: - Badge + + private func feedBadge(label: String, color: Color) -> some View { + HStack(spacing: 3) { + Circle().fill(color).frame(width: 5, height: 5) + Text(label) + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.8)) + } + .padding(.horizontal, 5) + .padding(.vertical, 2) + .background(Capsule().fill(Color.black.opacity(0.6))) + } +} + +// MARK: - CallZoomMessageRow + +private struct CallZoomMessageRow: View { + let message: CallMessage + + var body: some View { + HStack(alignment: .top, spacing: 8) { + Circle() + .fill(roleColor) + .frame(width: 7, height: 7) + .padding(.top, 4) + VStack(alignment: .leading, spacing: 3) { + Text(roleLabel) + .font(.system(size: 9, weight: .semibold, design: 
.monospaced)) + .foregroundColor(roleColor.opacity(0.6)) + Text(message.text) + .font(.system(size: 12, + design: message.role == .tool ? .monospaced : .default)) + .foregroundColor(.primary.opacity(message.role == .user ? 1.0 : 0.85)) + .textSelection(.enabled) + .fixedSize(horizontal: false, vertical: true) + } + } + } + + private var roleColor: Color { + switch message.role { + case .user: return .white + case .assistant: return .cyan + case .tool: return .yellow + case .error: return .red + case .external: return .green + } + } + + private var roleLabel: String { + switch message.role { + case .user: return "YOU" + case .assistant: return "CLAUDE" + case .tool: return "TOOL" + case .error: return "ERR" + case .external: return "CC" + } + } +} + // MARK: - ExecutionOutputView struct ExecutionOutputView: View { diff --git a/Sources/PillMode.swift b/Sources/PillMode.swift index 2181484..8bfe461 100644 --- a/Sources/PillMode.swift +++ b/Sources/PillMode.swift @@ -6,6 +6,7 @@ enum PillMode: String, CaseIterable { case transcription = "transcription" case aiSearch = "aiSearch" case meeting = "meeting" // Meeting notes: accumulate → analyse at end + case callMode = "callMode" // Direct Claude via Anthropic API; Llama bypassed var displayName: String { switch self { @@ -13,6 +14,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "Transcribe" case .aiSearch: return "AI Search" case .meeting: return "Meeting" + case .callMode: return "Call" } } @@ -22,6 +24,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "text.cursor" case .aiSearch: return "magnifyingglass" case .meeting: return "person.2.wave.2" + case .callMode: return "phone.bubble" } } @@ -31,6 +34,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "[TRS]" case .aiSearch: return "[SRC]" case .meeting: return "[MTG]" + case .callMode: return "[CLL]" } } @@ -40,6 +44,7 @@ enum PillMode: String, CaseIterable { case .transcription: return .accentColor case 
.aiSearch: return .accentColor case .meeting: return .purple + case .callMode: return .cyan } } diff --git a/Sources/PipelineModels.swift b/Sources/PipelineModels.swift index 84b8882..40010ce 100644 --- a/Sources/PipelineModels.swift +++ b/Sources/PipelineModels.swift @@ -16,6 +16,10 @@ enum PipelineSource: String, Codable { case code /// WhatsApp self-chat → full pipeline + QA reply case whatsapp + /// Call mode — direct to Claude via Anthropic API, Llama bypassed entirely. + /// Audio transcript is handed straight to CallModeSession; MCP tools serve + /// visual context (screen, cursor, selection) on demand. + case callMode } // MARK: - Cleaned Transcript diff --git a/Sources/ScreenGrabService.swift b/Sources/ScreenGrabService.swift new file mode 100644 index 0000000..dd40645 --- /dev/null +++ b/Sources/ScreenGrabService.swift @@ -0,0 +1,251 @@ +import AppKit +import ApplicationServices +import CoreGraphics +import Foundation +import Vision + +// MARK: - Result Types + +/// Full screen or region capture result. +struct ScreenGrab { + let ocrText: String + let metadata: String // app name, window title, etc. + let imageJPEGData: Data? // nil if capture failed + let capturedAt: Date +} + +/// Selected text + context screenshot. +struct SelectionGrab { + let selectedText: String + let contextImageJPEGData: Data? + let capturedAt: Date +} + +// MARK: - ScreenGrabService + +/// On-demand screen / cursor / selection capture for Call Mode MCP tools. +/// +/// - `captureScreen(region:)` — full screen or cropped region, Vision OCR + JPEG +/// - `captureCursorContext()` — 600×400 crop around current cursor, OCR + AX element info +/// - `captureSelection()` — AX selected text + screenshot of selection bounds +/// +/// Wraps `ScreenVisionAnalyzer` for full-screen grabs; adds cursor and AX selection on top. +/// Thread-safe: all async methods dispatch to the right actor internally. 
+final class ScreenGrabService: @unchecked Sendable { + + private let visionAnalyzer = ScreenVisionAnalyzer() + + // MARK: - Full Screen / Region + + /// Capture the full screen (or a specific pixel region) with Vision OCR and a JPEG screenshot. + func captureScreen(region: CGRect? = nil) async -> ScreenGrab { + guard let snapshot = await visionAnalyzer.captureNow() else { + return ScreenGrab(ocrText: "", metadata: "Screen capture unavailable", + imageJPEGData: nil, capturedAt: Date()) + } + + let finalSnapshot: ScreenSnapshot + if let region { + let screen = await MainActor.run { NSScreen.main?.frame ?? .zero } + guard screen.width > 0, screen.height > 0 else { + return makeGrab(from: snapshot) + } + let normalized = CGRect( + x: region.minX / screen.width, + y: region.minY / screen.height, + width: region.width / screen.width, + height: region.height / screen.height + ) + finalSnapshot = await visionAnalyzer.applySelection(normalizedRect: normalized, to: snapshot) + } else { + finalSnapshot = snapshot + } + + return makeGrab(from: finalSnapshot) + } + + private func makeGrab(from snapshot: ScreenSnapshot) -> ScreenGrab { + let imageData = snapshot.savedImagePath.flatMap { path -> Data? in + guard let png = try? Data(contentsOf: URL(fileURLWithPath: path)) else { return nil } + return jpegFromPNG(png) + } + let parts = [ + snapshot.appName.map { "App: \($0)" }, + snapshot.windowTitle.map { "Window: \($0)" }, + snapshot.hasDialog ? "Modal/dialog visible" : nil + ].compactMap { $0 } + + return ScreenGrab( + ocrText: snapshot.croppedText ?? snapshot.extractedText, + metadata: parts.joined(separator: " | "), + imageJPEGData: imageData, + capturedAt: snapshot.capturedAt + ) + } + + // MARK: - Cursor Context + + /// Capture a 600×400 screenshot centred on the current cursor with OCR and AX element info. + /// Use this when the user points at something without saying explicitly what it is. 
+ func captureCursorContext() async -> ScreenGrab { + let (mouseLocation, screenHeight) = await MainActor.run { + (NSEvent.mouseLocation, NSScreen.main?.frame.height ?? 900.0) + } + + // AppKit uses bottom-left origin; CGWindowListCreateImage uses top-left. + let cgX = mouseLocation.x + let cgY = screenHeight - mouseLocation.y + + let grabW: CGFloat = 600 + let grabH: CGFloat = 400 + let region = CGRect( + x: max(0, cgX - grabW / 2), + y: max(0, cgY - grabH / 2), + width: grabW, + height: grabH + ) + + guard let cgImage = CGWindowListCreateImage( + region, .optionOnScreenOnly, kCGNullWindowID, .bestResolution + ) else { + return ScreenGrab(ocrText: "", metadata: "Cursor capture failed", + imageJPEGData: nil, capturedAt: Date()) + } + + let ocrText = runOCR(on: cgImage) + let imageData = jpegFromCGImage(cgImage) + let elemInfo = axElementInfo(at: mouseLocation, screenHeight: screenHeight) + + let metadata = "Cursor at (\(Int(mouseLocation.x)), \(Int(mouseLocation.y)))" + + (elemInfo.isEmpty ? "" : " | \(elemInfo)") + + return ScreenGrab(ocrText: ocrText, metadata: metadata, + imageJPEGData: imageData, capturedAt: Date()) + } + + // MARK: - Selection + + /// Get the user's currently highlighted text and a screenshot of the selection bounds. + /// Use this when the user selects code, an error, or any text. + func captureSelection() async -> SelectionGrab { + let (text, bounds) = await MainActor.run { axSelectedTextAndBounds() } + let screenHeight = await MainActor.run { NSScreen.main?.frame.height ?? 900.0 } + + var imageData: Data? + if let b = bounds, b.width > 0, b.height > 0 { + // AX bounds use top-left origin on macOS (screen-space, not AppKit-space). + // Add padding around the selection. + let padded = CGRect( + x: b.minX - 24, + y: b.minY - 12, + width: b.width + 48, + height: b.height + 24 + ) + // CGWindowListCreateImage also uses top-left origin, so no flip needed here. 
+ if let img = CGWindowListCreateImage( + padded, .optionOnScreenOnly, kCGNullWindowID, .bestResolution + ) { + imageData = jpegFromCGImage(img) + } + } + + return SelectionGrab( + selectedText: text ?? "", + contextImageJPEGData: imageData, + capturedAt: Date() + ) + } + + // MARK: - Accessibility Helpers + + /// Returns (selectedText, selectionBounds) for the focused UI element. + /// AX bounds are in screen-space top-left coordinates (same as CGWindow). + private func axSelectedTextAndBounds() -> (String?, CGRect?) { + let systemWide = AXUIElementCreateSystemWide() + + // Focused element + var focusedRef: CFTypeRef? + guard AXUIElementCopyAttributeValue( + systemWide, kAXFocusedUIElementAttribute as CFString, &focusedRef + ) == .success, let focused = focusedRef, + CFGetTypeID(focused) == AXUIElementGetTypeID() + else { return (nil, nil) } + + let element = focused as! AXUIElement // safe: type ID verified above + + // Selected text + var textRef: CFTypeRef? + _ = AXUIElementCopyAttributeValue(element, kAXSelectedTextAttribute as CFString, &textRef) + let text = textRef as? String + + // Selected text range + var rangeRef: CFTypeRef? + guard AXUIElementCopyAttributeValue( + element, kAXSelectedTextRangeAttribute as CFString, &rangeRef + ) == .success, let rangeVal = rangeRef else { return (text, nil) } + + // Bounding rect for that range + var boundsRef: CFTypeRef? + guard AXUIElementCopyParameterizedAttributeValue( + element, + kAXBoundsForRangeParameterizedAttribute as CFString, + rangeVal, + &boundsRef + ) == .success, let bVal = boundsRef, + CFGetTypeID(bVal) == AXValueGetTypeID() + else { return (text, nil) } + + var rect = CGRect.zero + AXValueGetValue(bVal as! AXValue, .cgRect, &rect) // safe: type ID verified + return (text, rect.width > 0 ? rect : nil) + } + + /// Description of the UI element under the cursor (role + title). 
+ private func axElementInfo(at point: NSPoint, screenHeight: CGFloat) -> String { + let systemWide = AXUIElementCreateSystemWide() + var elementRef: AXUIElement? + // AX position uses top-left origin → flip AppKit Y + let axY = Float(screenHeight - point.y) + guard AXUIElementCopyElementAtPosition( + systemWide, Float(point.x), axY, &elementRef + ) == .success, let element = elementRef else { return "" } + + var roleRef: CFTypeRef? + AXUIElementCopyAttributeValue(element, kAXRoleAttribute as CFString, &roleRef) + let role = (roleRef as? String) ?? "" + + var titleRef: CFTypeRef? + AXUIElementCopyAttributeValue(element, kAXTitleAttribute as CFString, &titleRef) + let title = (titleRef as? String) ?? "" + + return [role, title].filter { !$0.isEmpty }.joined(separator: ": ") + } + + // MARK: - Vision OCR + + private func runOCR(on image: CGImage) -> String { + let req = VNRecognizeTextRequest() + req.recognitionLevel = .accurate + req.usesLanguageCorrection = true + let handler = VNImageRequestHandler(cgImage: image, options: [:]) + try? handler.perform([req]) + return (req.results ?? []) + .compactMap { $0.topCandidates(1).first?.string } + .joined(separator: "\n") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + // MARK: - Image Conversion + + private func jpegFromPNG(_ pngData: Data) -> Data? { + guard let rep = NSBitmapImageRep(data: pngData) else { return nil } + return rep.representation(using: .jpeg, properties: [.compressionFactor: 0.8]) + } + + private func jpegFromCGImage(_ cgImage: CGImage) -> Data? 
{ + let nsImage = NSImage(cgImage: cgImage, size: .zero) + guard let tiff = nsImage.tiffRepresentation, + let rep = NSBitmapImageRep(data: tiff) else { return nil } + return rep.representation(using: .jpeg, properties: [.compressionFactor: 0.8]) + } +} diff --git a/Sources/SkillStore.swift b/Sources/SkillStore.swift index 9129538..2db1633 100644 --- a/Sources/SkillStore.swift +++ b/Sources/SkillStore.swift @@ -280,5 +280,64 @@ final class SkillStore: @unchecked Sendable { category: .marketing, isBuiltin: true ), + // Skill for Claude Code / CLI to know how to start and use Call Mode + Skill( + id: "call-mode-init", + name: "Start Call Mode", + description: "Tells Claude Code or Claude CLI how to switch AutoClawd into Call Mode and use the AutoClawd MCP tools for real-time screen and audio access during a call.", + promptTemplate: """ + To start Call Mode in AutoClawd: + + 1. Make sure AutoClawd is running (look for the floating pill widget on your screen). + 2. Click the mode icon on the pill widget and cycle to "Call" (cyan phone icon), OR say "call mode" aloud — the widget will switch automatically. + 3. Once in Call Mode, your voice is forwarded directly to Claude without Llama analysis. + + As Claude Code / Claude CLI, you have access to these AutoClawd MCP tools via the autoclawd MCP server: + - autoclawd_get_screen — full screen or region: Vision OCR text + JPEG screenshot + - autoclawd_get_cursor_context — 600×400 crop around cursor; use when user points at something + - autoclawd_get_selection — currently highlighted text + selection screenshot + - autoclawd_get_audio_transcript — recent spoken audio transcript (last 2000 chars) + + USAGE PATTERN: + - Call autoclawd_get_screen immediately to see what the user is looking at. + - Call autoclawd_get_cursor_context when user says "this", "here", or "that thing". + - Call autoclawd_get_selection when user highlights text before speaking. + - Keep responses short and conversational — this is a live call. 
+ + To end Call Mode: cycle the widget back to any other mode. + + {{prompt}} + """, + workflowID: nil, + category: .development, + isBuiltin: true + ), + // Call Mode skill for the AutoClawd widget itself + Skill( + id: "call-mode", + name: "Call Mode", + description: "Activates when joining a video or voice call. Claude gets real-time access to your screen, cursor position, and selected text via AutoClawd MCP tools — acting like a meeting co-pilot that can see exactly what you see.", + promptTemplate: """ + You are now in Call Mode via AutoClawd. You have real-time sensory access to the user's environment through these MCP tools: + + AVAILABLE TOOLS: + - autoclawd_get_screen: Capture the full screen (or a pixel region) with Vision OCR text + JPEG screenshot. Call this first to orient yourself whenever the topic changes. + - autoclawd_get_cursor_context: Capture a 600×400 screenshot centred on the cursor with OCR. Use when the user says "this", "here", "that thing", or points at something without naming it. + - autoclawd_get_selection: Get the user's currently highlighted/selected text and a screenshot of that region. Use whenever the user selects code, an error, a file path, or any text before speaking. + - autoclawd_get_audio_transcript: Get the recent spoken transcript from the user's microphone. Use to review what was just said without the user repeating themselves. + + CALL MODE BEHAVIOUR: + - Call get_screen at the start of each new topic to understand context. + - Prefer get_cursor_context when spatial references are used ("this panel", "that button"). + - Prefer get_selection when the user has highlighted something — always check before answering questions about specific text. + - Keep responses concise and spoken-word friendly — this is a real-time call. + - Proactively mention what you can see to confirm your understanding. 
+ + {{prompt}} + """, + workflowID: nil, + category: .development, + isBuiltin: true + ), ] } diff --git a/Sources/WidgetCanvasViews.swift b/Sources/WidgetCanvasViews.swift index 0aa3e7a..5d49c4d 100644 --- a/Sources/WidgetCanvasViews.swift +++ b/Sources/WidgetCanvasViews.swift @@ -428,6 +428,121 @@ struct MeetingCanvasView: View { } } +// MARK: - Call Mode Canvas + +/// Canvas for Call Mode — shows the real-time Claude conversation thread. +/// Voice chunks are sent directly to Claude; Claude calls screen/cursor/selection tools inline. +struct CallModeCanvasView: View { + @ObservedObject var session: CallModeSession + + var body: some View { + VStack(spacing: 0) { + // Header + HStack(spacing: 6) { + Image(systemName: "phone.bubble") + .font(.system(size: 9, weight: .semibold)) + .foregroundColor(.cyan) + Text("CALL MODE") + .font(.system(size: 8, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.45)) + Spacer() + if session.isProcessing { + HStack(spacing: 3) { + ProgressView() + .scaleEffect(0.5) + .frame(width: 10, height: 10) + Text("thinking") + .font(.system(size: 7)) + .foregroundColor(.cyan.opacity(0.60)) + } + } else { + Text("direct · claude") + .font(.system(size: 7)) + .foregroundColor(.white.opacity(0.20)) + } + } + .padding(.horizontal, 12) + .padding(.top, 10) + .padding(.bottom, 6) + + Divider().opacity(0.12) + + // Message thread + if session.messages.isEmpty { + VStack(spacing: 6) { + Image(systemName: "waveform") + .font(.system(size: 20, weight: .ultraLight)) + .foregroundColor(.cyan.opacity(0.18)) + Text("Speak to start the call") + .font(.system(size: 10)) + .foregroundColor(.white.opacity(0.22)) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + ScrollViewReader { proxy in + ScrollView(showsIndicators: false) { + LazyVStack(alignment: .leading, spacing: 6) { + ForEach(session.messages) { msg in + CallMessageBubble(message: msg) + .id(msg.id) + } + } + .padding(.horizontal, 10) + .padding(.vertical, 8) 
+ } + .onChange(of: session.messages.count) { _ in + if let last = session.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + } + } + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } +} + +/// Single message bubble in the call mode thread. +private struct CallMessageBubble: View { + let message: CallMessage + + var body: some View { + HStack(alignment: .top, spacing: 5) { + // Role indicator dot + Circle() + .fill(roleColor) + .frame(width: 5, height: 5) + .padding(.top, 4) + + Text(message.text) + .font(.system(size: 10, design: message.role == .tool ? .monospaced : .default)) + .foregroundColor(textColor) + .fixedSize(horizontal: false, vertical: true) + .frame(maxWidth: .infinity, alignment: .leading) + } + } + + private var roleColor: Color { + switch message.role { + case .user: return .white.opacity(0.45) + case .assistant: return .cyan + case .tool: return .yellow.opacity(0.60) + case .error: return .red + case .external: return .green + } + } + + private var textColor: Color { + switch message.role { + case .user: return .white.opacity(0.72) + case .assistant: return .white.opacity(0.88) + case .tool: return .white.opacity(0.38) + case .error: return .red.opacity(0.80) + case .external: return .green.opacity(0.85) + } + } +} + // MARK: - Project Picker Canvas (shared by Code + Tasks modes) /// Reusable tappable project list — used as the first canvas state in Code and Tasks modes. 
From b227e78190312a64f298a1e2b5229d7dff8bfe55 Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 13:54:37 +0530 Subject: [PATCH 2/6] =?UTF-8?q?feat:=20Call=20Mode=20room=20UI=20=E2=80=94?= =?UTF-8?q?=20participant=20tiles,=20mascots,=20gesture=20addressing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CallModeRoomView: replaces CallModeZoomView as the panel's call mode view - Horizontal scrollable participant tiles (tap or left-hand finger count to address) - ParticipantTileView: slot badge, pause/remove buttons, state label, mascot - ParticipantMascotView: SF Symbol icon with state-driven animations (idle: breathing pulse, thinking: spinner, streaming: shimmer ring, paused: static) - ThinkingDotsView: animated three-dot thinking indicator - Shared call feed: voice transcript + callModeSession messages unified - Bottom bar: camera thumb, screen thumb, play/pause/stop, active participant indicator - AppState: leftFingerCount gesture → callRoom.selectByGesture() when in callMode - MainPanelView: swapped CallModeZoomView → CallModeRoomView Co-Authored-By: Claude Sonnet 4.6 --- Sources/AppState.swift | 4 + Sources/CallModeRoomView.swift | 563 +++++++++++++++++++++++++++++++++ Sources/MainPanelView.swift | 4 +- 3 files changed, 569 insertions(+), 2 deletions(-) create mode 100644 Sources/CallModeRoomView.swift diff --git a/Sources/AppState.swift b/Sources/AppState.swift index 2fe6dc9..1512838 100644 --- a/Sources/AppState.swift +++ b/Sources/AppState.swift @@ -849,6 +849,10 @@ final class AppState: ObservableObject { // Select project during review: 0 = None, 1..N = project by position selectReviewProjectByIndex(count) Log.info(.camera, "Gesture: review project index \(count) selected") + } else if pillMode == .callMode { + // In call mode, finger count addresses a participant + callRoom.selectByGesture(fingerCount: count) + Log.info(.camera, "Gesture: call mode participant \(count) selected") } else if 
showOptionSelector { selectOption(index: count) Log.info(.camera, "Gesture: option \(count) selected (left fingers)") diff --git a/Sources/CallModeRoomView.swift b/Sources/CallModeRoomView.swift new file mode 100644 index 0000000..365ad40 --- /dev/null +++ b/Sources/CallModeRoomView.swift @@ -0,0 +1,563 @@ +import SwiftUI +import Combine + +// MARK: - CallModeRoomView + +/// Full-panel Call Mode UI — participant tiles, shared feed, session controls. +/// Replaces PixelWorldView in the World tab when pillMode == .callMode. +struct CallModeRoomView: View { + @ObservedObject var appState: AppState + + var body: some View { + VStack(spacing: 0) { + participantsRow + Divider().background(Color.white.opacity(0.07)) + callFeed + Divider().background(Color.white.opacity(0.07)) + bottomBar + } + .background(Color.black) + // Route left-hand finger count → participant selection + .onReceive(appState.$lastConfirmedGesture.compactMap { $0 }) { gesture in + if case .leftFingerCount(let count) = gesture { + appState.callRoom.selectByGesture(fingerCount: count) + } + } + } + + // MARK: - Participants Row + + private var participantsRow: some View { + ScrollView(.horizontal, showsIndicators: false) { + HStack(alignment: .top, spacing: 10) { + ForEach(appState.callRoom.participants) { participant in + ParticipantTileView( + participant: participant, + isActive: participant.id == appState.callRoom.activeParticipantID, + onTap: { appState.callRoom.activeParticipantID = participant.id }, + onPause: { appState.callRoom.togglePause(id: participant.id) }, + onRemove: participant.kind == .llama ? 
nil + : { appState.callRoom.remove(id: participant.id) } + ) + } + inviteButton + } + .padding(.horizontal, 16) + .padding(.vertical, 12) + } + .frame(height: 148) + } + + private var inviteButton: some View { + VStack(spacing: 6) { + Spacer() + ZStack { + Circle() + .fill(Color.white.opacity(0.05)) + .frame(width: 52, height: 52) + .overlay(Circle().stroke(Color.white.opacity(0.15), lineWidth: 0.8)) + Image(systemName: "plus") + .font(.system(size: 16, weight: .medium)) + .foregroundColor(.white.opacity(0.3)) + } + Text("Invite") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + Spacer() + } + .frame(width: 80, height: 124) + } + + // MARK: - Call Feed + + private var callFeed: some View { + ZStack(alignment: .bottom) { + ScrollViewReader { proxy in + ScrollView { + LazyVStack(alignment: .leading, spacing: 8) { + // Transcript from user's voice — live session text + if !appState.liveTranscriptText.isEmpty { + CallFeedMessageRow( + participantName: "You", + color: .white, + icon: "mic.fill", + text: appState.liveTranscriptText + ) + .id("transcript") + } + // Messages from participants (Claude Code / external) + ForEach(appState.callModeSession.messages) { msg in + CallFeedMessageRow( + participantName: feedLabel(for: msg.role), + color: feedColor(for: msg.role), + icon: feedIcon(for: msg.role), + text: msg.text + ) + .id(msg.id) + } + } + .padding(14) + } + .onChange(of: appState.callModeSession.messages.count) { _ in + if let last = appState.callModeSession.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + .onChange(of: appState.liveTranscriptText) { _ in + withAnimation { proxy.scrollTo("transcript", anchor: .bottom) } + } + } + + // Processing indicator + if appState.callModeSession.isProcessing { + HStack(spacing: 6) { + ProgressView().controlSize(.mini).tint(.cyan) + Text("Thinking…") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.7)) + } + 
.padding(.horizontal, 12) + .padding(.vertical, 6) + .background(Capsule().fill(Color.black.opacity(0.8))) + .padding(.bottom, 8) + } + + // Empty state + if appState.callModeSession.messages.isEmpty && appState.liveTranscriptText.isEmpty { + VStack(spacing: 8) { + Image(systemName: "waveform") + .font(.system(size: 28)) + .foregroundColor(.white.opacity(0.1)) + Text("CALL ACTIVE") + .font(.system(size: 10, weight: .semibold, design: .monospaced)) + .foregroundColor(.white.opacity(0.15)) + Text("Speak to start — use left fingers to address participants") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.1)) + .multilineTextAlignment(.center) + } + .padding(.horizontal, 32) + .frame(maxWidth: .infinity) + .padding(.vertical, 32) + } + } + } + + // MARK: - Bottom Bar + + private var bottomBar: some View { + HStack(spacing: 12) { + // Camera preview (small) + cameraThumb + // Screen preview (small) + screenThumb + Spacer() + // Session controls + sessionControls + Spacer() + // Addressing indicator + addressingIndicator + } + .padding(.horizontal, 16) + .padding(.vertical, 10) + .background(Color.white.opacity(0.025)) + } + + private var cameraThumb: some View { + ZStack { + RoundedRectangle(cornerRadius: 8) + .fill(Color.black) + .frame(width: 64, height: 44) + if appState.cameraEnabled && appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(width: 64, height: 44) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } else { + Image(systemName: "camera.slash") + .font(.system(size: 12)) + .foregroundColor(.white.opacity(0.2)) + } + } + .overlay(RoundedRectangle(cornerRadius: 8).stroke(Color.white.opacity(0.1), lineWidth: 0.5)) + } + + private var screenThumb: some View { + ZStack { + RoundedRectangle(cornerRadius: 8) + .fill(Color.black) + .frame(width: 64, height: 44) + if let img = appState.screenPreviewImage { + Image(decorative: img, scale: 1.0) + .resizable() + 
.aspectRatio(contentMode: .fill) + .frame(width: 64, height: 44) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } else { + Image(systemName: "rectangle.on.rectangle") + .font(.system(size: 12)) + .foregroundColor(.white.opacity(0.2)) + } + } + .overlay(RoundedRectangle(cornerRadius: 8).stroke(Color.white.opacity(0.1), lineWidth: 0.5)) + } + + private var sessionControls: some View { + HStack(spacing: 14) { + // Play / Pause toggle + Button { + if appState.isListening { appState.stopListening() } + else { appState.startListening() } + } label: { + Image(systemName: appState.isListening ? "pause.fill" : "play.fill") + .font(.system(size: 14, weight: .semibold)) + .foregroundColor(appState.isListening ? .white : .green) + .frame(width: 32, height: 32) + .background(Circle().fill(Color.white.opacity(0.08))) + } + .buttonStyle(.plain) + + // Stop — end call + Button { + appState.stopListening() + appState.pillMode = .ambientIntelligence + } label: { + Image(systemName: "stop.fill") + .font(.system(size: 12, weight: .semibold)) + .foregroundColor(.red.opacity(0.8)) + .frame(width: 32, height: 32) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + } + + private var addressingIndicator: some View { + HStack(spacing: 5) { + if let active = appState.callRoom.activeParticipant { + Text("①".replacing("①", with: "⑤".isEmpty ? 
"" : slotEmoji(active.gestureSlot))) + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.3)) + Image(systemName: active.mascotSystemImage) + .font(.system(size: 10)) + .foregroundColor(participantColor(active.kind)) + Text(active.displayName) + .font(.system(size: 9, weight: .medium, design: .monospaced)) + .foregroundColor(.white.opacity(0.5)) + } + } + } + + // MARK: - Helpers + + private func slotEmoji(_ slot: Int) -> String { + let circled = ["①","②","③","④","⑤"] + guard slot >= 1, slot <= circled.count else { return "\(slot)" } + return circled[slot - 1] + } + + private func feedLabel(for role: CallMessage.Role) -> String { + switch role { + case .user: return "You" + case .assistant: return "AutoClawd" + case .tool: return "Tool" + case .error: return "Error" + case .external: return "Claw'd" + } + } + + private func feedColor(for role: CallMessage.Role) -> Color { + switch role { + case .user: return .white + case .assistant: return .teal + case .tool: return .yellow + case .error: return .red + case .external: return .orange + } + } + + private func feedIcon(for role: CallMessage.Role) -> String { + switch role { + case .user: return "mic.fill" + case .assistant: return "brain" + case .tool: return "wrench.adjustable" + case .error: return "exclamationmark.triangle" + case .external: return "terminal" + } + } + + private func participantColor(_ kind: ParticipantKind) -> Color { + switch kind { + case .llama: return .teal + case .claudeCode: return .orange + case .connection: return .purple + } + } +} + +// MARK: - ParticipantTileView + +struct ParticipantTileView: View { + let participant: CallParticipant + let isActive: Bool + let onTap: () -> Void + let onPause: () -> Void + let onRemove: (() -> Void)? 
+ + private var tileColor: Color { + switch participant.kind { + case .llama: return .teal + case .claudeCode: return .orange + case .connection: return .purple + } + } + + var body: some View { + VStack(spacing: 5) { + // Top row: gesture slot + controls + HStack(spacing: 4) { + slotBadge + Spacer() + pauseButton + if let rm = onRemove { removeButton(action: rm) } + } + + // Mascot + ParticipantMascotView( + kind: participant.kind, + state: participant.state, + isPaused: participant.isPaused + ) + .frame(width: 54, height: 54) + + // Name + Text(participant.displayName) + .font(.system(size: 9, weight: .semibold, design: .monospaced)) + .foregroundColor(isActive ? .white : .white.opacity(0.4)) + .lineLimit(1) + + // State label + stateLabel + } + .padding(.horizontal, 9) + .padding(.vertical, 8) + .frame(width: 104) + .background( + RoundedRectangle(cornerRadius: 14, style: .continuous) + .fill(Color.white.opacity(isActive ? 0.07 : 0.02)) + .overlay( + RoundedRectangle(cornerRadius: 14, style: .continuous) + .stroke( + isActive ? tileColor.opacity(0.55) : Color.white.opacity(0.08), + lineWidth: isActive ? 1.5 : 0.5 + ) + ) + ) + .contentShape(Rectangle()) + .onTapGesture { onTap() } + .animation(.easeInOut(duration: 0.15), value: isActive) + .animation(.easeInOut(duration: 0.15), value: participant.state) + } + + private var slotBadge: some View { + Text(circledDigit(participant.gestureSlot)) + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(isActive ? tileColor : .white.opacity(0.25)) + } + + private var pauseButton: some View { + Button(action: onPause) { + Image(systemName: participant.isPaused ? 
"play.fill" : "pause.fill") + .font(.system(size: 7, weight: .bold)) + .foregroundColor(.white.opacity(0.45)) + .frame(width: 16, height: 16) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + + private func removeButton(action: @escaping () -> Void) -> some View { + Button(action: action) { + Image(systemName: "xmark") + .font(.system(size: 7, weight: .bold)) + .foregroundColor(.white.opacity(0.35)) + .frame(width: 16, height: 16) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + + @ViewBuilder + private var stateLabel: some View { + if participant.isPaused { + Text("paused") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } else { + switch participant.state { + case .idle: + Text("idle") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + case .thinking: + ThinkingDotsView() + case .streaming: + Text("streaming") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(tileColor.opacity(0.7)) + case .paused: + Text("paused") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + } + + private func circledDigit(_ n: Int) -> String { + let circled = ["①","②","③","④","⑤"] + guard n >= 1, n <= circled.count else { return "\(n)" } + return circled[n - 1] + } +} + +// MARK: - ParticipantMascotView + +struct ParticipantMascotView: View { + let kind: ParticipantKind + let state: ParticipantState + let isPaused: Bool + + @State private var breathe = false + @State private var shimmer = false + + private var mascotIcon: String { + switch kind { + case .llama: return "brain" + case .claudeCode: return "hammer.fill" + case .connection(_, _): return "cable.connector" + } + } + + private var mascotColor: Color { + switch kind { + case .llama: return .teal + case .claudeCode: return .orange + case .connection: return .purple + } + } + + var body: some View { + ZStack { + // 
Outer glow ring — pulses when streaming + Circle() + .stroke( + mascotColor.opacity(isPaused ? 0 : (state == .streaming ? 0.4 : 0.12)), + lineWidth: state == .streaming ? 2 : 1 + ) + .scaleEffect(shimmer ? 1.15 : 1.0) + .opacity(shimmer ? 0 : 1) + + // Background fill + Circle() + .fill(mascotColor.opacity(isPaused ? 0.04 : 0.12)) + + // Icon + Image(systemName: mascotIcon) + .font(.system(size: 22, weight: .medium)) + .foregroundColor(isPaused ? .gray.opacity(0.3) : mascotColor) + .scaleEffect(breathe ? 1.06 : 1.0) + + // Thinking spinner overlay + if state == .thinking && !isPaused { + Circle() + .trim(from: 0, to: 0.65) + .stroke(mascotColor.opacity(0.5), style: StrokeStyle(lineWidth: 1.5, lineCap: .round)) + .rotationEffect(.degrees(breathe ? 360 : 0)) + .animation(.linear(duration: 0.9).repeatForever(autoreverses: false), value: breathe) + } + + // Paused badge + if isPaused { + Image(systemName: "pause.fill") + .font(.system(size: 9)) + .foregroundColor(.white.opacity(0.3)) + .offset(x: 14, y: 14) + } + } + .onAppear { animate() } + .onChange(of: state) { _ in animate() } + .onChange(of: isPaused) { _ in animate() } + } + + private func animate() { + switch state { + case .idle: + withAnimation(.easeInOut(duration: 2.4).repeatForever(autoreverses: true)) { + breathe = true + } + shimmer = false + case .thinking: + breathe = true // spinner uses this + case .streaming: + withAnimation(.easeInOut(duration: 0.6).repeatForever(autoreverses: false)) { + shimmer = true + } + withAnimation(.easeInOut(duration: 1.2).repeatForever(autoreverses: true)) { + breathe = true + } + case .paused: + breathe = false + shimmer = false + } + } +} + +// MARK: - ThinkingDotsView + +private struct ThinkingDotsView: View { + @State private var phase = 0 + + var body: some View { + HStack(spacing: 3) { + ForEach(0..<3, id: \.self) { i in + Circle() + .fill(Color.white.opacity(phase == i ? 
0.7 : 0.2)) + .frame(width: 4, height: 4) + } + } + .onAppear { + Timer.scheduledTimer(withTimeInterval: 0.35, repeats: true) { _ in + phase = (phase + 1) % 3 + } + } + } +} + +// MARK: - CallFeedMessageRow + +private struct CallFeedMessageRow: View { + let participantName: String + let color: Color + let icon: String + let text: String + + var body: some View { + HStack(alignment: .top, spacing: 8) { + Image(systemName: icon) + .font(.system(size: 9)) + .foregroundColor(color.opacity(0.6)) + .frame(width: 14) + .padding(.top, 2) + + VStack(alignment: .leading, spacing: 2) { + Text(participantName.uppercased()) + .font(.system(size: 8, weight: .bold, design: .monospaced)) + .foregroundColor(color.opacity(0.5)) + Text(text) + .font(.system(size: 12)) + .foregroundColor(.white.opacity(0.85)) + .textSelection(.enabled) + .fixedSize(horizontal: false, vertical: true) + } + } + } +} diff --git a/Sources/MainPanelView.swift b/Sources/MainPanelView.swift index cf1e0bc..0c448dc 100644 --- a/Sources/MainPanelView.swift +++ b/Sources/MainPanelView.swift @@ -68,8 +68,8 @@ struct MainPanelView: View { .opacity(selectedTab == .world && appState.pillMode != .callMode ? 1 : 0) .allowsHitTesting(selectedTab == .world && appState.pillMode != .callMode) - // Zoom-call layout: replaces HQ view when Call Mode is active. - CallModeZoomView(appState: appState) + // Call Mode room: replaces HQ view when Call Mode is active. + CallModeRoomView(appState: appState) .frame(maxWidth: .infinity, maxHeight: .infinity) .opacity(selectedTab == .world && appState.pillMode == .callMode ? 
1 : 0) .allowsHitTesting(selectedTab == .world && appState.pillMode == .callMode) From fe72a69bf45932cf8937e25e9e8381e768b3fc8e Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 14:09:24 +0530 Subject: [PATCH 3/6] =?UTF-8?q?feat:=20plugin=20participants=20=E2=80=94?= =?UTF-8?q?=20Claude=20Code=20invites=20tools=20to=20the=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 4 new MCP tools let Claude Code orchestrate a live cast of participants: - autoclawd_invite_participant(id, name, system_image) → tile joins call - autoclawd_set_participant_state(id, state) → animate tile - autoclawd_send_participant_message(id, name, text) → attributed feed bubble - autoclawd_remove_participant(id) → tile leaves CallRoom: connection kind now carries systemImage + consistent hash-derived tile color so each plugin always gets the same hue. CallMessage gains participantID/participantName so feed rows show the correct plugin name, icon, and color rather than a generic "Plugin" label. Co-Authored-By: Claude Sonnet 4.6 --- Sources/AppDelegate.swift | 22 ++++- Sources/CallModeRoomView.swift | 99 +++++++++++----------- Sources/CallModeSession.swift | 19 +++-- Sources/CallRoom.swift | 38 ++++++--- Sources/MCPServer.swift | 140 ++++++++++++++++++++++++++++++-- Sources/MainPanelView.swift | 22 ++--- Sources/WidgetCanvasViews.swift | 22 ++--- 7 files changed, 269 insertions(+), 93 deletions(-) diff --git a/Sources/AppDelegate.swift b/Sources/AppDelegate.swift index 0624df2..5837206 100644 --- a/Sources/AppDelegate.swift +++ b/Sources/AppDelegate.swift @@ -42,7 +42,27 @@ final class AppDelegate: NSObject, NSApplicationDelegate { isPausedProvider: { [weak self] in !(self?.appState.callRoom.claudeCodeIsActive ?? 
true) }, canvasWriter: { [weak self] text in self?.appState.callModeSession.appendExternalMessage(text) }, onJoined: { [weak self] in self?.appState.callRoom.claudeCodeJoined() }, - onLeft: { [weak self] in self?.appState.callRoom.claudeCodeLeft() } + onLeft: { [weak self] in self?.appState.callRoom.claudeCodeLeft() }, + onInviteParticipant: { [weak self] id, name, icon in + self?.appState.callRoom.connectionJoined(id: id, name: name, systemImage: icon) + }, + onSetParticipantState: { [weak self] id, stateStr in + guard let room = self?.appState.callRoom else { return } + let state: ParticipantState + switch stateStr { + case "thinking": state = .thinking + case "streaming": state = .streaming + case "paused": state = .paused + default: state = .idle + } + room.setState(state, for: id) + }, + onParticipantMessage: { [weak self] id, name, text in + self?.appState.callModeSession.appendParticipantMessage(id: id, name: name, text: text) + }, + onRemoveParticipant: { [weak self] id in + self?.appState.callRoom.remove(id: id) + } ) // Configure call mode session with the same transcript provider. diff --git a/Sources/CallModeRoomView.swift b/Sources/CallModeRoomView.swift index 365ad40..cd85354 100644 --- a/Sources/CallModeRoomView.swift +++ b/Sources/CallModeRoomView.swift @@ -85,12 +85,12 @@ struct CallModeRoomView: View { ) .id("transcript") } - // Messages from participants (Claude Code / external) + // Messages from participants (Claude Code / external / plugin) ForEach(appState.callModeSession.messages) { msg in CallFeedMessageRow( - participantName: feedLabel(for: msg.role), - color: feedColor(for: msg.role), - icon: feedIcon(for: msg.role), + participantName: msg.participantName ?? 
feedLabel(for: msg.role), + color: participantFeedColor(for: msg), + icon: participantFeedIcon(for: msg), text: msg.text ) .id(msg.id) @@ -239,7 +239,7 @@ struct CallModeRoomView: View { .foregroundColor(.white.opacity(0.3)) Image(systemName: active.mascotSystemImage) .font(.system(size: 10)) - .foregroundColor(participantColor(active.kind)) + .foregroundColor(active.tileColor) Text(active.displayName) .font(.system(size: 9, weight: .medium, design: .monospaced)) .foregroundColor(.white.opacity(0.5)) @@ -257,40 +257,53 @@ struct CallModeRoomView: View { private func feedLabel(for role: CallMessage.Role) -> String { switch role { - case .user: return "You" - case .assistant: return "AutoClawd" - case .tool: return "Tool" - case .error: return "Error" - case .external: return "Claw'd" + case .user: return "You" + case .assistant: return "AutoClawd" + case .tool: return "Tool" + case .error: return "Error" + case .external: return "Claw'd" + case .participant: return "Plugin" } } private func feedColor(for role: CallMessage.Role) -> Color { switch role { - case .user: return .white - case .assistant: return .teal - case .tool: return .yellow - case .error: return .red - case .external: return .orange + case .user: return .white + case .assistant: return .teal + case .tool: return .yellow + case .error: return .red + case .external: return .orange + case .participant: return .purple } } private func feedIcon(for role: CallMessage.Role) -> String { switch role { - case .user: return "mic.fill" - case .assistant: return "brain" - case .tool: return "wrench.adjustable" - case .error: return "exclamationmark.triangle" - case .external: return "terminal" + case .user: return "mic.fill" + case .assistant: return "brain" + case .tool: return "wrench.adjustable" + case .error: return "exclamationmark.triangle" + case .external: return "terminal" + case .participant: return "cable.connector" } } - private func participantColor(_ kind: ParticipantKind) -> Color { - switch kind { 
- case .llama: return .teal - case .claudeCode: return .orange - case .connection: return .purple + /// Feed color for a message — uses the participant's tile color when available. + private func participantFeedColor(for msg: CallMessage) -> Color { + if msg.role == .participant, let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.tileColor } + return feedColor(for: msg.role) + } + + /// Feed icon for a message — uses the participant's mascot icon when available. + private func participantFeedIcon(for msg: CallMessage) -> String { + if msg.role == .participant, let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.mascotSystemImage + } + return feedIcon(for: msg.role) } } @@ -303,13 +316,7 @@ struct ParticipantTileView: View { let onPause: () -> Void let onRemove: (() -> Void)? - private var tileColor: Color { - switch participant.kind { - case .llama: return .teal - case .claudeCode: return .orange - case .connection: return .purple - } - } + private var tileColor: Color { participant.tileColor } var body: some View { VStack(spacing: 5) { @@ -323,9 +330,10 @@ struct ParticipantTileView: View { // Mascot ParticipantMascotView( - kind: participant.kind, - state: participant.state, - isPaused: participant.isPaused + kind: participant.kind, + state: participant.state, + isPaused: participant.isPaused, + tileColor: participant.tileColor ) .frame(width: 54, height: 54) @@ -422,28 +430,23 @@ struct ParticipantTileView: View { // MARK: - ParticipantMascotView struct ParticipantMascotView: View { - let kind: ParticipantKind - let state: ParticipantState - let isPaused: Bool + let kind: ParticipantKind + let state: ParticipantState + let isPaused: Bool + let tileColor: Color @State private var breathe = false @State private var shimmer = false private var mascotIcon: String { switch kind { - case .llama: return "brain" - case .claudeCode: return 
"hammer.fill" - case .connection(_, _): return "cable.connector" + case .llama: return "brain" + case .claudeCode: return "hammer.fill" + case .connection(_, _, let icon): return icon } } - private var mascotColor: Color { - switch kind { - case .llama: return .teal - case .claudeCode: return .orange - case .connection: return .purple - } - } + private var mascotColor: Color { tileColor } var body: some View { ZStack { diff --git a/Sources/CallModeSession.swift b/Sources/CallModeSession.swift index f2c5936..61d74b7 100644 --- a/Sources/CallModeSession.swift +++ b/Sources/CallModeSession.swift @@ -72,6 +72,12 @@ final class CallModeSession: ObservableObject { messages.append(CallMessage(role: .external, text: text)) } + /// Append a message attributed to a named plugin participant (via autoclawd_send_participant_message). + func appendParticipantMessage(id: String, name: String, text: String) { + messages.append(CallMessage(role: .participant, text: text, + participantID: id, participantName: name)) + } + // MARK: - Agent Loop /// Tool-use loop: request → if tool_use → execute → continue → until end_turn. @@ -294,11 +300,14 @@ final class CallModeSession: ObservableObject { // MARK: - Supporting Types struct CallMessage: Identifiable { - let id = UUID() - let role: Role - let text: String - - enum Role { case user, assistant, tool, error, external } + let id: UUID = UUID() + let role: Role + let text: String + /// Set when the message comes from a plugin participant (autoclawd_send_participant_message). + var participantID: String? = nil + var participantName: String? 
= nil + + enum Role { case user, assistant, tool, error, external, participant } } enum CallModeError: Error, LocalizedError { diff --git a/Sources/CallRoom.swift b/Sources/CallRoom.swift index 90e7b1d..e2c3d3c 100644 --- a/Sources/CallRoom.swift +++ b/Sources/CallRoom.swift @@ -4,9 +4,9 @@ import SwiftUI // MARK: - ParticipantKind enum ParticipantKind: Equatable { - case llama // AutoClawd PM — always present - case claudeCode // joins via MCP session - case connection(id: String, name: String) // future plugin integrations + case llama // AutoClawd PM — always present + case claudeCode // joins via MCP session + case connection(id: String, name: String, systemImage: String) // plugin/tool participants } // MARK: - ParticipantState @@ -29,17 +29,28 @@ struct CallParticipant: Identifiable { var displayName: String { switch kind { - case .llama: return "AutoClawd" - case .claudeCode: return "Claw'd" - case .connection(_, let name): return name + case .llama: return "AutoClawd" + case .claudeCode: return "Claw'd" + case .connection(_, let name, _): return name } } var mascotSystemImage: String { switch kind { - case .llama: return "brain" - case .claudeCode: return "terminal" - case .connection: return "plug" + case .llama: return "brain" + case .claudeCode: return "hammer.fill" + case .connection(_, _, let icon): return icon + } + } + + /// Consistent color per participant — connections derive hue from their ID. 
+ var tileColor: Color { + switch kind { + case .llama: return .teal + case .claudeCode: return .orange + case .connection(let id, _, _): + let hash = id.unicodeScalars.reduce(0) { ($0 &+ Int($1.value)) % 360 } + return Color(hue: Double(hash) / 360.0, saturation: 0.65, brightness: 0.95) } } @@ -102,9 +113,12 @@ final class CallRoom: ObservableObject { rebuildSlots() } - func connectionJoined(id: String, name: String) { - guard !participants.contains(where: { $0.id == id }) else { return } - var p = CallParticipant(id: id, kind: .connection(id: id, name: name)) + func connectionJoined(id: String, name: String, systemImage: String = "cable.connector") { + guard !participants.contains(where: { $0.id == id }) else { + updateLastActivity(id: id) + return + } + var p = CallParticipant(id: id, kind: .connection(id: id, name: name, systemImage: systemImage)) p.gestureSlot = participants.count + 1 participants.append(p) rebuildSlots() diff --git a/Sources/MCPServer.swift b/Sources/MCPServer.swift index 2b536a6..53aee55 100644 --- a/Sources/MCPServer.swift +++ b/Sources/MCPServer.swift @@ -43,6 +43,14 @@ final class MCPServer: @unchecked Sendable { private var onJoined: (@MainActor () -> Void)? /// Fired on @MainActor when no MCP activity for `leaveTimeoutSeconds`. private var onLeft: (@MainActor () -> Void)? + /// Invite a plugin/tool as a call participant: (id, name, systemImage). + private var onInviteParticipant: (@MainActor (String, String, String) -> Void)? + /// Set a participant's state: (id, stateString). + private var onSetParticipantState: (@MainActor (String, String) -> Void)? + /// Append a feed message attributed to a participant: (id, name, text). + private var onParticipantMessage: (@MainActor (String, String, String) -> Void)? + /// Remove a participant from the call: (id). + private var onRemoveParticipant: (@MainActor (String) -> Void)? /// Last time any MCP request was received from a Claude Code session. private var lastActivityDate: Date? 
@@ -57,14 +65,22 @@ final class MCPServer: @unchecked Sendable { isPausedProvider: (@MainActor () -> Bool)? = nil, canvasWriter: (@MainActor (String) -> Void)? = nil, onJoined: (@MainActor () -> Void)? = nil, - onLeft: (@MainActor () -> Void)? = nil) { + onLeft: (@MainActor () -> Void)? = nil, + onInviteParticipant: (@MainActor (String, String, String) -> Void)? = nil, + onSetParticipantState: (@MainActor (String, String) -> Void)? = nil, + onParticipantMessage: (@MainActor (String, String, String) -> Void)? = nil, + onRemoveParticipant: (@MainActor (String) -> Void)? = nil) { guard listener == nil else { return } - self.screenGrab = screenGrab - self.transcriptProvider = transcriptProvider - self.isPausedProvider = isPausedProvider - self.canvasWriter = canvasWriter - self.onJoined = onJoined - self.onLeft = onLeft + self.screenGrab = screenGrab + self.transcriptProvider = transcriptProvider + self.isPausedProvider = isPausedProvider + self.canvasWriter = canvasWriter + self.onJoined = onJoined + self.onLeft = onLeft + self.onInviteParticipant = onInviteParticipant + self.onSetParticipantState = onSetParticipantState + self.onParticipantMessage = onParticipantMessage + self.onRemoveParticipant = onRemoveParticipant do { let params = NWParameters.tcp @@ -344,6 +360,75 @@ final class MCPServer: @unchecked Sendable { ], "required": ["text"] ] as [String: Any] + ], + [ + "name": "autoclawd_invite_participant", + "description": """ + Add a plugin, tool, or service as a visible participant tile in the Call Mode room. \ + Use this when you start using an external service (GitHub, Gmail, Calendar, \ + Remotion, web search, etc.) so the user can see it join the call. \ + Participants appear as named tiles with icons and animated states. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "id": ["type": "string", + "description": "Unique stable ID (e.g. 
\"github\", \"gmail\")."], + "name": ["type": "string", + "description": "Display name shown on the tile."], + "system_image": ["type": "string", + "description": "SF Symbol name (e.g. \"envelope.fill\", \"globe\")."] + ], + "required": ["id", "name", "system_image"] + ] as [String: Any] + ], + [ + "name": "autoclawd_set_participant_state", + "description": """ + Update the visual state of a call participant tile. \ + Use "thinking" when starting work, "streaming" while producing output, \ + "idle" when done, and "paused" to mute/suspend. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "id": ["type": "string", "description": "Participant ID to update."], + "state": ["type": "string", + "enum": ["idle", "thinking", "streaming", "paused"], + "description": "New visual state."] + ], + "required": ["id", "state"] + ] as [String: Any] + ], + [ + "name": "autoclawd_send_participant_message", + "description": """ + Post a message to the shared call feed attributed to a specific participant. \ + Use this to surface a plugin's output as a chat bubble from that participant. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "id": ["type": "string", "description": "Participant ID sending the message."], + "name": ["type": "string", "description": "Display name for the message header."], + "text": ["type": "string", "description": "Message content to show in the feed."] + ], + "required": ["id", "name", "text"] + ] as [String: Any] + ], + [ + "name": "autoclawd_remove_participant", + "description": """ + Remove a plugin participant from the call room when you are done using it. \ + The tile will disappear from the participants row. 
+ """, + "inputSchema": [ + "type": "object", + "properties": [ + "id": ["type": "string", "description": "Participant ID to remove."] + ], + "required": ["id"] + ] as [String: Any] ] ] } @@ -418,6 +503,47 @@ final class MCPServer: @unchecked Sendable { content = [["type": "text", "text": "No text provided or canvas unavailable."]] } + case "autoclawd_invite_participant": + let id = args["id"] as? String ?? "" + let name = args["name"] as? String ?? "" + let icon = args["system_image"] as? String ?? "cable.connector" + if !id.isEmpty, let cb = onInviteParticipant { + Task { @MainActor in cb(id, name, icon) } + content = [["type": "text", "text": "\(name) joined the call."]] + } else { + content = [["type": "text", "text": "Could not invite participant."]] + } + + case "autoclawd_set_participant_state": + let id = args["id"] as? String ?? "" + let stateStr = args["state"] as? String ?? "idle" + if !id.isEmpty, let cb = onSetParticipantState { + Task { @MainActor in cb(id, stateStr) } + content = [["type": "text", "text": "State updated."]] + } else { + content = [["type": "text", "text": "Could not update participant state."]] + } + + case "autoclawd_send_participant_message": + let id = args["id"] as? String ?? "" + let name = args["name"] as? String ?? "" + let text = args["text"] as? String ?? "" + if !id.isEmpty, !text.isEmpty, let cb = onParticipantMessage { + Task { @MainActor in cb(id, name, text) } + content = [["type": "text", "text": "Message posted."]] + } else { + content = [["type": "text", "text": "Could not post participant message."]] + } + + case "autoclawd_remove_participant": + let id = args["id"] as? String ?? 
"" + if !id.isEmpty, let cb = onRemoveParticipant { + Task { @MainActor in cb(id) } + content = [["type": "text", "text": "Participant removed."]] + } else { + content = [["type": "text", "text": "Could not remove participant."]] + } + default: content = [["type": "text", "text": "Unknown tool: \(name)"]] } diff --git a/Sources/MainPanelView.swift b/Sources/MainPanelView.swift index 0c448dc..2789118 100644 --- a/Sources/MainPanelView.swift +++ b/Sources/MainPanelView.swift @@ -276,21 +276,23 @@ private struct CallZoomMessageRow: View { private var roleColor: Color { switch message.role { - case .user: return .white - case .assistant: return .cyan - case .tool: return .yellow - case .error: return .red - case .external: return .green + case .user: return .white + case .assistant: return .cyan + case .tool: return .yellow + case .error: return .red + case .external: return .green + case .participant: return .purple } } private var roleLabel: String { switch message.role { - case .user: return "YOU" - case .assistant: return "CLAUDE" - case .tool: return "TOOL" - case .error: return "ERR" - case .external: return "CC" + case .user: return "YOU" + case .assistant: return "CLAUDE" + case .tool: return "TOOL" + case .error: return "ERR" + case .external: return "CC" + case .participant: return message.participantName?.uppercased() ?? 
"PLUGIN" } } } diff --git a/Sources/WidgetCanvasViews.swift b/Sources/WidgetCanvasViews.swift index 5d49c4d..3bab5dd 100644 --- a/Sources/WidgetCanvasViews.swift +++ b/Sources/WidgetCanvasViews.swift @@ -524,21 +524,23 @@ private struct CallMessageBubble: View { private var roleColor: Color { switch message.role { - case .user: return .white.opacity(0.45) - case .assistant: return .cyan - case .tool: return .yellow.opacity(0.60) - case .error: return .red - case .external: return .green + case .user: return .white.opacity(0.45) + case .assistant: return .cyan + case .tool: return .yellow.opacity(0.60) + case .error: return .red + case .external: return .green + case .participant: return .purple } } private var textColor: Color { switch message.role { - case .user: return .white.opacity(0.72) - case .assistant: return .white.opacity(0.88) - case .tool: return .white.opacity(0.38) - case .error: return .red.opacity(0.80) - case .external: return .green.opacity(0.85) + case .user: return .white.opacity(0.72) + case .assistant: return .white.opacity(0.88) + case .tool: return .white.opacity(0.38) + case .error: return .red.opacity(0.80) + case .external: return .green.opacity(0.85) + case .participant: return .white.opacity(0.85) } } } From bf1d5050c412b0947a274afc304d5872df797cf4 Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 14:16:01 +0530 Subject: [PATCH 4/6] fix: remove participant tile tools from MCP tool list These tools (invite/set_state/send_message/remove) would have appeared in Claude Code's tools/list and been called during normal work, burning credits on pure UI bookkeeping. Callbacks and CallRoom plumbing remain for future UI-driven or stream-inferred participant management. 
Co-Authored-By: Claude Sonnet 4.6 --- Sources/MCPServer.swift | 74 +++-------------------------------------- 1 file changed, 5 insertions(+), 69 deletions(-) diff --git a/Sources/MCPServer.swift b/Sources/MCPServer.swift index 53aee55..1c9ddad 100644 --- a/Sources/MCPServer.swift +++ b/Sources/MCPServer.swift @@ -361,78 +361,14 @@ final class MCPServer: @unchecked Sendable { "required": ["text"] ] as [String: Any] ], - [ - "name": "autoclawd_invite_participant", - "description": """ - Add a plugin, tool, or service as a visible participant tile in the Call Mode room. \ - Use this when you start using an external service (GitHub, Gmail, Calendar, \ - Remotion, web search, etc.) so the user can see it join the call. \ - Participants appear as named tiles with icons and animated states. - """, - "inputSchema": [ - "type": "object", - "properties": [ - "id": ["type": "string", - "description": "Unique stable ID (e.g. \"github\", \"gmail\")."], - "name": ["type": "string", - "description": "Display name shown on the tile."], - "system_image": ["type": "string", - "description": "SF Symbol name (e.g. \"envelope.fill\", \"globe\")."] - ], - "required": ["id", "name", "system_image"] - ] as [String: Any] - ], - [ - "name": "autoclawd_set_participant_state", - "description": """ - Update the visual state of a call participant tile. \ - Use "thinking" when starting work, "streaming" while producing output, \ - "idle" when done, and "paused" to mute/suspend. - """, - "inputSchema": [ - "type": "object", - "properties": [ - "id": ["type": "string", "description": "Participant ID to update."], - "state": ["type": "string", - "enum": ["idle", "thinking", "streaming", "paused"], - "description": "New visual state."] - ], - "required": ["id", "state"] - ] as [String: Any] - ], - [ - "name": "autoclawd_send_participant_message", - "description": """ - Post a message to the shared call feed attributed to a specific participant. 
\ - Use this to surface a plugin's output as a chat bubble from that participant. - """, - "inputSchema": [ - "type": "object", - "properties": [ - "id": ["type": "string", "description": "Participant ID sending the message."], - "name": ["type": "string", "description": "Display name for the message header."], - "text": ["type": "string", "description": "Message content to show in the feed."] - ], - "required": ["id", "name", "text"] - ] as [String: Any] - ], - [ - "name": "autoclawd_remove_participant", - "description": """ - Remove a plugin participant from the call room when you are done using it. \ - The tile will disappear from the participants row. - """, - "inputSchema": [ - "type": "object", - "properties": [ - "id": ["type": "string", "description": "Participant ID to remove."] - ], - "required": ["id"] - ] as [String: Any] - ] ] } + // NOTE: Participant tile tools (invite/set_state/send_message/remove) are intentionally + // NOT exposed via tools/list. Tile management is UI-driven (user taps Invite) or will + // be inferred from Claude Code's existing tool-call stream — never by making Claude Code + // call extra MCP tools that burn tokens on pure UI bookkeeping. + // MARK: - Tool Execution private func callTool(name: String, args: [String: Any]) async -> [String: Any] { From a8a675cf894af5c206983795f922170ff56ed82d Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 14:34:47 +0530 Subject: [PATCH 5/6] feat: Claude Code hooks integration with real-time Llama narration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - HookNarrationService: parses PostToolUse/Stop hook events; calls Llama to produce a 1-sentence natural-language narration (e.g. 
"Claw'd is reading package.json to understand the project"); falls back to template descriptions if Ollama is unavailable - MCPServer: adds POST /hook endpoint — Claude Code hooks curl here on every tool event; fires onHookEvent callback on @MainActor - MCPConfigManager: writeHooksConfig() merges PostToolUse + Stop hook entries into ~/.claude/settings.json automatically on launch - AppDelegate: wires onHookEvent → narrate → appendParticipantMessage so every Claude Code tool call appears as a Claw'd feed bubble in the call room; also calls writeHooksConfig() at startup The call room now acts as a live video-call window into a Claude Code session — each tool use becomes a narrated story beat in the feed. Co-Authored-By: Claude Sonnet 4.6 --- Sources/AppDelegate.swift | 28 +++++ Sources/HookNarrationService.swift | 177 +++++++++++++++++++++++++++++ Sources/MCPConfigManager.swift | 48 ++++++++ Sources/MCPServer.swift | 18 ++- 4 files changed, 270 insertions(+), 1 deletion(-) create mode 100644 Sources/HookNarrationService.swift diff --git a/Sources/AppDelegate.swift b/Sources/AppDelegate.swift index 5837206..7355299 100644 --- a/Sources/AppDelegate.swift +++ b/Sources/AppDelegate.swift @@ -62,9 +62,37 @@ final class AppDelegate: NSObject, NSApplicationDelegate { }, onRemoveParticipant: { [weak self] id in self?.appState.callRoom.remove(id: id) + }, + onHookEvent: { [weak self] event in + guard let self else { return } + // When Claude Code fires a hook, narrate it via Llama and post + // the sentence to the call-room feed attributed to the Claw'd tile. + let room = self.appState.callRoom + let session = self.appState.callModeSession + + // Ensure Claw'd is in the room (hooks can arrive before MCP initialize). + room.claudeCodeJoined() + + // Show "thinking" state while narration is being generated. 
+ room.setState(.thinking, for: "claude-code") + + Task { + let narration = await HookNarrationService().narrate(event) + await MainActor.run { + session.appendParticipantMessage( + id: "claude-code", + name: "Claw'd", + text: narration + ) + room.setState(event.isStop ? .idle : .streaming, for: "claude-code") + } + } } ) + // Auto-register Claude Code hooks so PostToolUse events arrive at /hook. + MCPConfigManager.writeHooksConfig() + // Configure call mode session with the same transcript provider. appState.callModeSession.configure( transcriptProvider: { [weak self] in self?.appState.liveTranscriptText ?? "" } diff --git a/Sources/HookNarrationService.swift b/Sources/HookNarrationService.swift new file mode 100644 index 0000000..afd4834 --- /dev/null +++ b/Sources/HookNarrationService.swift @@ -0,0 +1,177 @@ +import Foundation + +// MARK: - HookEvent + +/// A parsed Claude Code hook event (PostToolUse, Stop, PreToolUse, etc.). +struct HookEvent { + let eventName: String // "PostToolUse", "Stop", "PreToolUse", … + let toolName: String? + let toolInput: [String: Any]? + let toolResponse: [String: Any]? + let sessionID: String? + let rawJSON: [String: Any] + + /// True when the hook signals the session has finished. + var isStop: Bool { eventName == "Stop" } + + /// True when the hook signals a tool is about to run (pre-tool). + var isPreTool: Bool { eventName == "PreToolUse" } +} + +// MARK: - HookNarrationService + +/// Translates raw Claude Code hook events into one-sentence human-readable narratives. +/// +/// Pipeline: +/// 1. Parse the raw JSON from `/hook` into a `HookEvent`. +/// 2. `narrate(_:)` builds a compact summary string. +/// 3. If Ollama is reachable, sends a short prompt to Llama and returns its response. +/// 4. If Ollama is unavailable or slow, falls back to the template summary. 
+/// +/// The narrated sentence is then shown in the call-room feed attributed to the +/// Claude Code participant tile — turning the call view into a live running +/// commentary of what Claude Code is doing. +final class HookNarrationService: @unchecked Sendable { + + private let ollama = OllamaService() + + // MARK: - Parse + + /// Parse raw hook JSON into a HookEvent. + static func parse(_ json: [String: Any]) -> HookEvent { + HookEvent( + eventName: json["hook_event_name"] as? String + ?? json["type"] as? String + ?? "Unknown", + toolName: json["tool_name"] as? String, + toolInput: json["tool_input"] as? [String: Any], + toolResponse: json["tool_response"] as? [String: Any], + sessionID: json["session_id"] as? String, + rawJSON: json + ) + } + + // MARK: - Narrate + + /// Returns a short, natural-language sentence describing what just happened. + /// Uses Llama if available; falls back to a template description otherwise. + func narrate(_ event: HookEvent) async -> String { + // Stop events don't need an LLM call + if event.isStop { + return "Claw'd finished the task." + } + + let summary = templateSummary(event) + + do { + let prompt = """ + You are a narrator watching an AI coding assistant named "Claw'd" work. \ + In one short, casual sentence (10–15 words max), narrate what it just did. \ + Be specific. Never start with "The AI" — always use "Claw'd". \ + Don't add quotes around the sentence. + + Event: \(summary) + + Narration: + """ + var narration = try await ollama.generate(prompt: prompt, numPredict: 60) + // Strip any trailing artefacts that Llama sometimes adds + narration = narration + .components(separatedBy: "\n").first ?? narration + narration = narration + .trimmingCharacters(in: .whitespacesAndNewlines) + .trimmingCharacters(in: CharacterSet(charactersIn: "\"'")) + return narration.isEmpty ? 
summary : narration + } catch { + // Ollama not running — use template + return summary + } + } + + // MARK: - Template Fallback + + private func templateSummary(_ event: HookEvent) -> String { + guard let tool = event.toolName else { + return "Working…" + } + let input = event.toolInput ?? [:] + + switch tool { + + case "Read": + if let path = input["file_path"] as? String { + return "Reading \(fileName(path))" + } + return "Reading a file" + + case "Write": + if let path = input["file_path"] as? String { + return "Writing \(fileName(path))" + } + return "Writing a file" + + case "Edit": + if let path = input["file_path"] as? String { + return "Editing \(fileName(path))" + } + return "Editing a file" + + case "MultiEdit": + if let path = input["file_path"] as? String { + return "Multi-editing \(fileName(path))" + } + return "Applying multiple edits" + + case "Bash": + if let cmd = input["command"] as? String { + let short = String(cmd.prefix(50)) + return "Running: \(short)\(cmd.count > 50 ? "…" : "")" + } + return "Running a shell command" + + case "Glob": + if let pattern = input["pattern"] as? String { + return "Searching for files matching '\(pattern)'" + } + return "Searching files" + + case "Grep": + if let pattern = input["pattern"] as? String { + return "Searching code for '\(String(pattern.prefix(40)))'" + } + return "Searching in files" + + case "Task": + if let desc = input["description"] as? String { + return "Spawning sub-agent: \(String(desc.prefix(40)))" + } + return "Launching a sub-agent" + + case "WebFetch": + if let urlStr = input["url"] as? String, + let host = URL(string: urlStr)?.host { + return "Fetching \(host)" + } + return "Fetching a web page" + + case "WebSearch": + if let query = input["query"] as? 
String { + return "Searching the web for '\(String(query.prefix(40)))'" + } + return "Searching the web" + + case "TodoWrite": + return "Updating the task list" + + case "NotebookEdit": + return "Editing a notebook cell" + + default: + return "Using \(tool)" + } + } + + private func fileName(_ path: String) -> String { + URL(fileURLWithPath: path).lastPathComponent + } +} diff --git a/Sources/MCPConfigManager.swift b/Sources/MCPConfigManager.swift index 52efb2c..64b40d0 100644 --- a/Sources/MCPConfigManager.swift +++ b/Sources/MCPConfigManager.swift @@ -33,6 +33,54 @@ enum MCPConfigManager { } } + // MARK: - Claude Code Hooks Config + + /// Write (or merge) AutoClawd hook entries into ~/.claude/settings.json. + /// + /// Registers `PostToolUse` and `Stop` hooks that curl the AutoClawd hook + /// endpoint on every Claude Code tool event. This lets AutoClawd narrate + /// what Claude Code is doing in real time inside the call room feed. + /// + /// Safe to call repeatedly — only the `hooks` key is overwritten; all + /// other existing settings are preserved. + static func writeHooksConfig() { + let settingsURL = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".claude") + .appendingPathComponent("settings.json") + + // Load existing settings or start fresh + var settings: [String: Any] = [:] + if let data = try? Data(contentsOf: settingsURL), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] { + settings = json + } + + let hookCommand = "curl -s -X POST http://localhost:7892/hook -H 'Content-Type: application/json' -d @-" + let hookEntry: [String: Any] = ["type": "command", "command": hookCommand] + let hookGroup: [String: Any] = ["matcher": "", "hooks": [hookEntry]] + + var hooks = settings["hooks"] as? [String: Any] ?? 
[:] + hooks["PostToolUse"] = [hookGroup] + hooks["Stop"] = [hookGroup] + settings["hooks"] = hooks + + do { + try FileManager.default.createDirectory( + at: settingsURL.deletingLastPathComponent(), + withIntermediateDirectories: true, + attributes: nil + ) + let data = try JSONSerialization.data( + withJSONObject: settings, + options: [.prettyPrinted, .sortedKeys] + ) + try data.write(to: settingsURL, options: .atomic) + Log.info(.system, "MCPConfigManager: hooks written to \(settingsURL.path)") + } catch { + Log.warn(.system, "MCPConfigManager: failed to write hooks config — \(error)") + } + } + private static func findMCPBinary() -> String? { // 1. Inside the app bundle (distribution) let bundleBinary = Bundle.main.bundleURL diff --git a/Sources/MCPServer.swift b/Sources/MCPServer.swift index 1c9ddad..2694d2e 100644 --- a/Sources/MCPServer.swift +++ b/Sources/MCPServer.swift @@ -51,6 +51,8 @@ final class MCPServer: @unchecked Sendable { private var onParticipantMessage: (@MainActor (String, String, String) -> Void)? /// Remove a participant from the call: (id). private var onRemoveParticipant: (@MainActor (String) -> Void)? + /// Fired on @MainActor when a Claude Code hook event arrives on POST /hook. + private var onHookEvent: (@MainActor (HookEvent) -> Void)? /// Last time any MCP request was received from a Claude Code session. private var lastActivityDate: Date? @@ -69,7 +71,8 @@ final class MCPServer: @unchecked Sendable { onInviteParticipant: (@MainActor (String, String, String) -> Void)? = nil, onSetParticipantState: (@MainActor (String, String) -> Void)? = nil, onParticipantMessage: (@MainActor (String, String, String) -> Void)? = nil, - onRemoveParticipant: (@MainActor (String) -> Void)? = nil) { + onRemoveParticipant: (@MainActor (String) -> Void)? = nil, + onHookEvent: (@MainActor (HookEvent) -> Void)? 
= nil) { guard listener == nil else { return } self.screenGrab = screenGrab self.transcriptProvider = transcriptProvider @@ -81,6 +84,7 @@ final class MCPServer: @unchecked Sendable { self.onSetParticipantState = onSetParticipantState self.onParticipantMessage = onParticipantMessage self.onRemoveParticipant = onRemoveParticipant + self.onHookEvent = onHookEvent do { let params = NWParameters.tcp @@ -224,6 +228,18 @@ final class MCPServer: @unchecked Sendable { return } + // Claude Code hook events — fired by PostToolUse / Stop hooks + if path == "/hook" { + if let json = try? JSONSerialization.jsonObject(with: body) as? [String: Any] { + let event = HookNarrationService.parse(json) + if let cb = onHookEvent { + Task { @MainActor in cb(event) } + } + } + sendHTTP(Data("{}".utf8), connection: connection) + return + } + guard path == "/mcp" || path.hasPrefix("/mcp?") else { sendHTTP(rpcError(code: -32_600, message: "Not found"), status: 404, connection: connection) return From b7bf9e4af19cd2af9d0de7f796570a7c6db42a43 Mon Sep 17 00:00:00 2001 From: Sameep Rehlan Date: Sun, 8 Mar 2026 16:41:47 +0530 Subject: [PATCH 6/6] feat: brutalist Call Stream Widget with storytelling feed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CallStreamWidget.swift — always-on-top floating NSPanel (420×560) that overlays any session during Call Mode; animates in/out with slide+fade; draggable from anywhere; snaps to bottom-right by default - Add CallStreamWidgetView.swift — full brutalist redesign: · No circles, no soft rounding — sharp rectangular geometry throughout · Header: REC dot + CALL STREAM label + session timer + close button · Mission bar: orange left-accent shows user's spoken goal (first message) · Participant strip: sharp tiles with thick colored top-border accent (no circles) Square mascot icon with NSImage(named: "mascot-{id}") fallback to SF Symbol · Task queue: top 3 pending StructuredTodos; active task highlighted orange · 
Stream feed: brutalist group-chat format — NAME ─── time / message / image Generated/reaction messages render at 70% opacity with ~ marker · Spotlight panel: auto-surfaces latest image (full-width) or filename detected via regex from stream messages · Bottom bar: animated waveform + event count + END CALL red square button - CallModeSession.swift: add createdAt: Date to CallMessage for timestamps; add imageData and isGenerated fields; extend appendParticipantMessage - HookNarrationService.swift: complete implementation — parse hook events, extract ToolParticipant from mcp__server__tool names, extract image data from 3 tool_response payload patterns, Llama narration with template fallback, optional AutoClawd reaction for MCP/image events - CallModeRoomView.swift: CallFeedMessageRow with 2px left-border style; solid gradient for real messages, dashed segments for generated ones; inline image support via imageData - AppDelegate.swift: wire $pillMode observer to show/hide CallStreamWidget; lazy-init on first show; animate out on mode change; onHookEvent handler auto-joins tool participants and posts structured feed messages - SettingsManager.swift: add callStreamWidgetEnabled (default true) Co-Authored-By: Claude Sonnet 4.6 --- Sources/AppDelegate.swift | 80 +++- Sources/CallModeRoomView.swift | 97 +++- Sources/CallModeSession.swift | 13 +- Sources/CallStreamWidget.swift | 119 +++++ Sources/CallStreamWidgetView.swift | 719 +++++++++++++++++++++++++++++ Sources/HookNarrationService.swift | 185 +++++++- Sources/SettingsManager.swift | 12 +- 7 files changed, 1171 insertions(+), 54 deletions(-) create mode 100644 Sources/CallStreamWidget.swift create mode 100644 Sources/CallStreamWidgetView.swift diff --git a/Sources/AppDelegate.swift b/Sources/AppDelegate.swift index 7355299..55805cf 100644 --- a/Sources/AppDelegate.swift +++ b/Sources/AppDelegate.swift @@ -12,6 +12,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate { private var mainPanel: MainPanelWindow? 
private var toastWindow: ToastWindow? private var setupWindow: SetupWindow? + private var callStreamWidget: CallStreamWidget? private var toastDismissWork: DispatchWorkItem? private var cancellables = Set() @@ -65,26 +66,61 @@ final class AppDelegate: NSObject, NSApplicationDelegate { }, onHookEvent: { [weak self] event in guard let self else { return } - // When Claude Code fires a hook, narrate it via Llama and post - // the sentence to the call-room feed attributed to the Claw'd tile. + // When Claude Code fires a hook, build a NarrationBundle via Llama and post + // a two-sided conversation to the call-room feed: + // 1. Claw'd tile — real tool narration (solid border) + // 2. Tool participant tile — auto-joined; optional image response + // 3. AutoClawd tile — generated reaction comment (dashed/faint border) let room = self.appState.callRoom let session = self.appState.callModeSession // Ensure Claw'd is in the room (hooks can arrive before MCP initialize). room.claudeCodeJoined() - - // Show "thinking" state while narration is being generated. room.setState(.thinking, for: "claude-code") Task { - let narration = await HookNarrationService().narrate(event) + let bundle = await HookNarrationService().narrate(event) await MainActor.run { + // Auto-join MCP tool participant (e.g. "pencil", "figma") + if let tp = bundle.toolParticipant { + room.connectionJoined(id: tp.id, name: tp.name, systemImage: tp.systemImage) + } + + // Post Claw'd's real narration (solid left bar) session.appendParticipantMessage( - id: "claude-code", - name: "Claw'd", - text: narration + id: "claude-code", + name: "Claw'd", + text: bundle.narration, + isGenerated: false ) room.setState(event.isStop ? 
.idle : .streaming, for: "claude-code") + + // Post tool response if it includes an image or text + if let tp = bundle.toolParticipant { + if bundle.imageData != nil || bundle.toolResponseText != nil { + room.setState(.streaming, for: tp.id) + session.appendParticipantMessage( + id: tp.id, + name: tp.name, + text: bundle.toolResponseText ?? "", + imageData: bundle.imageData, + isGenerated: false + ) + room.setState(.idle, for: tp.id) + } else { + room.updateLastActivity(id: tp.id) + } + } + + // Post AutoClawd's generated reaction (dashed/faint left bar) + if let reaction = bundle.autoClawdReaction { + session.appendParticipantMessage( + id: "llama", + name: "AutoClawd", + text: reaction, + isGenerated: true + ) + } } } } @@ -120,6 +156,34 @@ final class AppDelegate: NSObject, NSApplicationDelegate { } } .store(in: &cancellables) + + // Show/hide the Call Stream Widget when pill mode changes. + appState.$pillMode + .receive(on: DispatchQueue.main) + .sink { [weak self] mode in + guard let self else { return } + if mode == .callMode && SettingsManager.shared.callStreamWidgetEnabled { + self.showCallStreamWidget() + } else { + self.callStreamWidget?.animateOut() + } + } + .store(in: &cancellables) + } + + // MARK: - Call Stream Widget + + private func showCallStreamWidget() { + if callStreamWidget == nil { + let widget = CallStreamWidget() + let view = CallStreamWidgetView(appState: appState) { [weak self] in + self?.appState.pillMode = .ambientIntelligence + self?.callStreamWidget?.animateOut() + } + widget.setContent(view) + callStreamWidget = widget + } + callStreamWidget?.animateIn() } func applicationWillTerminate(_ notification: Notification) { diff --git a/Sources/CallModeRoomView.swift b/Sources/CallModeRoomView.swift index cd85354..300d76a 100644 --- a/Sources/CallModeRoomView.swift +++ b/Sources/CallModeRoomView.swift @@ -89,9 +89,11 @@ struct CallModeRoomView: View { ForEach(appState.callModeSession.messages) { msg in CallFeedMessageRow( participantName: 
msg.participantName ?? feedLabel(for: msg.role), - color: participantFeedColor(for: msg), - icon: participantFeedIcon(for: msg), - text: msg.text + color: participantFeedColor(for: msg), + icon: participantFeedIcon(for: msg), + text: msg.text, + imageData: msg.imageData, + isGenerated: msg.isGenerated ) .id(msg.id) } @@ -539,28 +541,79 @@ private struct ThinkingDotsView: View { private struct CallFeedMessageRow: View { let participantName: String - let color: Color - let icon: String - let text: String + let color: Color + let icon: String + let text: String + var imageData: Data? = nil + /// True for AI-generated narrative messages (AutoClawd reactions). + /// Rendered at reduced opacity with a dashed left bar. + var isGenerated: Bool = false var body: some View { - HStack(alignment: .top, spacing: 8) { - Image(systemName: icon) - .font(.system(size: 9)) - .foregroundColor(color.opacity(0.6)) - .frame(width: 14) - .padding(.top, 2) - - VStack(alignment: .leading, spacing: 2) { - Text(participantName.uppercased()) - .font(.system(size: 8, weight: .bold, design: .monospaced)) - .foregroundColor(color.opacity(0.5)) - Text(text) - .font(.system(size: 12)) - .foregroundColor(.white.opacity(0.85)) - .textSelection(.enabled) - .fixedSize(horizontal: false, vertical: true) + HStack(alignment: .top, spacing: 0) { + // 2px colored left bar — solid for real events, dashed-look for generated + if isGenerated { + // Dashed effect: two small rectangles with gap + VStack(spacing: 0) { + ForEach(0..<8, id: \.self) { i in + Rectangle() + .fill(color.opacity(i % 2 == 0 ? 
0.45 : 0.0)) + .frame(width: 2, height: 4) + } + } + .frame(width: 2) + .frame(maxHeight: .infinity, alignment: .top) + .padding(.top, 4) + } else { + Rectangle() + .fill(color) + .frame(width: 2) + } + + // Content + VStack(alignment: .leading, spacing: 5) { + // Header: icon + name (lowercase monospace) + generated marker + HStack(spacing: 5) { + Image(systemName: icon) + .font(.system(size: 9)) + .foregroundColor(color.opacity(isGenerated ? 0.5 : 0.7)) + Text(participantName.lowercased()) + .font(.system(size: 9, weight: .semibold, design: .monospaced)) + .foregroundColor(color.opacity(isGenerated ? 0.45 : 0.65)) + if isGenerated { + Text("~") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(color.opacity(0.3)) + } + Spacer() + } + + // Body text (skip if empty and we have an image) + if !text.isEmpty { + Text(text) + .font(.system(size: 12, design: .monospaced)) + .foregroundColor(.white.opacity(isGenerated ? 0.55 : 0.85)) + .textSelection(.enabled) + .fixedSize(horizontal: false, vertical: true) + } + + // Inline image (Pencil screenshot, ScreenGrab, etc.) + if let data = imageData, let nsImage = NSImage(data: data) { + Image(nsImage: nsImage) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: 280, maxHeight: 200) + .cornerRadius(6) + .overlay( + RoundedRectangle(cornerRadius: 6) + .stroke(color.opacity(0.25), lineWidth: 0.5) + ) + } } + .padding(.leading, 10) + .padding(.vertical, 8) + .padding(.trailing, 4) } + .opacity(isGenerated ? 0.75 : 1.0) } } diff --git a/Sources/CallModeSession.swift b/Sources/CallModeSession.swift index 61d74b7..0301c00 100644 --- a/Sources/CallModeSession.swift +++ b/Sources/CallModeSession.swift @@ -73,9 +73,11 @@ final class CallModeSession: ObservableObject { } /// Append a message attributed to a named plugin participant (via autoclawd_send_participant_message). 
- func appendParticipantMessage(id: String, name: String, text: String) { + func appendParticipantMessage(id: String, name: String, text: String, + imageData: Data? = nil, isGenerated: Bool = false) { messages.append(CallMessage(role: .participant, text: text, - participantID: id, participantName: name)) + participantID: id, participantName: name, + imageData: imageData, isGenerated: isGenerated)) } // MARK: - Agent Loop @@ -303,9 +305,16 @@ struct CallMessage: Identifiable { let id: UUID = UUID() let role: Role let text: String + /// Timestamp — used to show time labels in the call feed. + let createdAt: Date = Date() /// Set when the message comes from a plugin participant (autoclawd_send_participant_message). var participantID: String? = nil var participantName: String? = nil + /// Inline image attached to this message (e.g. Pencil screenshot, ScreenGrab). + var imageData: Data? = nil + /// True for AI-generated narrative messages (AutoClawd reactions, fake conversation). + /// These render with a dashed/faint border vs solid for real tool events. + var isGenerated: Bool = false enum Role { case user, assistant, tool, error, external, participant } } diff --git a/Sources/CallStreamWidget.swift b/Sources/CallStreamWidget.swift new file mode 100644 index 0000000..d156f6a --- /dev/null +++ b/Sources/CallStreamWidget.swift @@ -0,0 +1,119 @@ +import AppKit +import SwiftUI + +// MARK: - CallStreamWidget + +/// Always-on-top floating widget that shows the live call stream — agent tiles, +/// camera feed, and conversation feed. Separate from the main pill so it can +/// overlay any screen or app during a Claude Code session. +/// +/// Activated when pillMode == .callMode and the setting is enabled. +/// Draggable from anywhere; snaps to the bottom-right corner by default. +final class CallStreamWidget: NSPanel { + + static let defaultWidth: CGFloat = 420 + static let defaultHeight: CGFloat = 560 + + private var hostingView: NSHostingView? 
+ + // Smooth drag + private var initialMouseLoc: NSPoint = .zero + private var initialOrigin: NSPoint = .zero + + init() { + super.init( + contentRect: NSRect(x: 0, y: 0, + width: Self.defaultWidth, + height: Self.defaultHeight), + styleMask: [.borderless, .nonactivatingPanel, .utilityWindow], + backing: .buffered, + defer: false + ) + configure() + } + + private func configure() { + isOpaque = false + backgroundColor = .clear + hasShadow = true + level = .floating + collectionBehavior = [.canJoinAllSpaces, .stationary, .ignoresCycle] + isMovableByWindowBackground = false + animationBehavior = .utilityWindow + + // Default position: bottom-right, 20pt inset from visible frame + if let screen = NSScreen.main { + let vf = screen.visibleFrame + let x = vf.maxX - Self.defaultWidth - 20 + let y = vf.minY + 20 + setFrameOrigin(NSPoint(x: x, y: y)) + } + } + + func setContent(_ view: V) { + let hosting = NSHostingView(rootView: AnyView(view)) + hosting.frame = contentView?.bounds ?? .zero + hosting.autoresizingMask = [.width, .height] + hosting.wantsLayer = true + hosting.layer?.backgroundColor = CGColor.clear + hosting.layer?.isOpaque = false + contentView = hosting + hostingView = hosting + } + + // MARK: - Dragging + + override func mouseDown(with event: NSEvent) { + initialMouseLoc = NSEvent.mouseLocation + initialOrigin = frame.origin + } + + override func mouseDragged(with event: NSEvent) { + let cur = NSEvent.mouseLocation + setFrameOrigin(NSPoint( + x: initialOrigin.x + (cur.x - initialMouseLoc.x), + y: initialOrigin.y + (cur.y - initialMouseLoc.y) + )) + } + + // MARK: - Visibility + + func show() { + orderFront(nil) + } + + func hide() { + orderOut(nil) + } + + /// Animate in from bottom (slide + fade). 
+ func animateIn() { + alphaValue = 0 + let target = frame + var start = target + start.origin.y -= 24 + setFrame(start, display: false) + + NSAnimationContext.runAnimationGroup { ctx in + ctx.duration = 0.28 + ctx.timingFunction = CAMediaTimingFunction(name: .easeOut) + animator().setFrame(target, display: true) + animator().alphaValue = 1 + } + orderFront(nil) + } + + /// Animate out (slide + fade). + func animateOut(completion: (() -> Void)? = nil) { + NSAnimationContext.runAnimationGroup { ctx in + ctx.duration = 0.2 + ctx.timingFunction = CAMediaTimingFunction(name: .easeIn) + var end = frame; end.origin.y -= 24 + animator().setFrame(end, display: true) + animator().alphaValue = 0 + } completionHandler: { + self.orderOut(nil) + completion?() + } + } +} diff --git a/Sources/CallStreamWidgetView.swift b/Sources/CallStreamWidgetView.swift new file mode 100644 index 0000000..2977165 --- /dev/null +++ b/Sources/CallStreamWidgetView.swift @@ -0,0 +1,719 @@ +import SwiftUI +import AppKit + +// MARK: - CallStreamWidgetView +// +// Brutalist design language — no circles, no soft rounding, sharp geometric. +// The feed is a story, not a log. Sections: +// +// ┌─ HEADER: CALL STREAM + timer + close ─────────────────┐ +// │ MISSION: user's goal (first spoken message) │ +// ├─ PARTICIPANTS: rectangular tiles, thick top accent ────┤ +// │ TASKS: pending todo queue (top 3) │ +// ├─ STREAM ──────────────────────────────────────────────┤ +// │ Group chat — NAME ─── time / message / image │ +// ├─ SPOTLIGHT: current file or image (auto-shown) ────────┤ +// │ END CALL bar │ +// └────────────────────────────────────────────────────────┘ + +struct CallStreamWidgetView: View { + @ObservedObject var appState: AppState + let onClose: () -> Void + + @State private var sessionSeconds: Int = 0 + @State private var spotlightImage: NSImage? = nil + @State private var spotlightFile: String? 
= nil + + private let sessionTimer = Timer.publish(every: 1, on: .main, in: .common).autoconnect() + + // Palette + private let bg = Color(red: 0.067, green: 0.067, blue: 0.067) + private let surf = Color(red: 0.102, green: 0.102, blue: 0.102) + private let border = Color.white.opacity(0.07) + + var body: some View { + VStack(spacing: 0) { + header + rowDivider + + if let goal = missionGoal { + missionBar(goal) + rowDivider + } + + participantStrip + rowDivider + + let tasks = pendingTasks + if !tasks.isEmpty { + taskSection(tasks) + rowDivider + } + + streamHeader + streamFeed + + if spotlightImage != nil || spotlightFile != nil { + rowDivider + spotlightPanel + } + + rowDivider + bottomBar + } + .frame(width: 420) + .background(bg) + .clipShape(RoundedRectangle(cornerRadius: 3, style: .continuous)) + .overlay( + RoundedRectangle(cornerRadius: 3, style: .continuous) + .stroke(Color.white.opacity(0.09), lineWidth: 0.5) + ) + .shadow(color: .black.opacity(0.65), radius: 36, y: 16) + .shadow(color: .black.opacity(0.20), radius: 6, y: 2) + .onReceive(sessionTimer) { _ in sessionSeconds += 1 } + .onChange(of: appState.callModeSession.messages.count) { _ in + updateSpotlight() + } + } + + // MARK: - Header + + private var header: some View { + HStack(spacing: 10) { + // REC dot + Circle() + .fill(Color.red) + .frame(width: 6, height: 6) + .shadow(color: .red.opacity(0.9), radius: 5) + + Text("CALL STREAM") + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.45)) + .tracking(2) + + Spacer() + + Text(formatDuration(sessionSeconds)) + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + .monospacedDigit() + + closeButton + } + .padding(.horizontal, 14) + .padding(.vertical, 9) + .background(surf) + } + + private var closeButton: some View { + Button(action: onClose) { + Image(systemName: "xmark") + .font(.system(size: 8, weight: .semibold)) + .foregroundColor(.white.opacity(0.3)) + 
.frame(width: 18, height: 18) + .background(Rectangle().fill(.white.opacity(0.07))) + } + .buttonStyle(.plain) + } + + // MARK: - Mission + + private var missionGoal: String? { + appState.callModeSession.messages.first(where: { $0.role == .user })?.text + } + + private func missionBar(_ goal: String) -> some View { + HStack(alignment: .top, spacing: 12) { + Rectangle() + .fill(Color.orange) + .frame(width: 2) + .frame(maxHeight: .infinity) + + VStack(alignment: .leading, spacing: 3) { + sectionLabel("MISSION") + Text(goal) + .font(.system(size: 11)) + .foregroundColor(.white.opacity(0.72)) + .lineLimit(3) + } + Spacer() + } + .padding(.horizontal, 14) + .padding(.vertical, 9) + .fixedSize(horizontal: false, vertical: true) + } + + // MARK: - Participant strip (brutalist — no circles) + + private var participantStrip: some View { + HStack(spacing: 1) { + userTile + ForEach(appState.callRoom.participants) { p in + ParticipantBrutalistTile( + participant: p, + isActive: p.id == appState.callRoom.activeParticipantID, + onTap: { appState.callRoom.activeParticipantID = p.id } + ) + } + } + .frame(maxWidth: .infinity, minHeight: 76) + .background(surf) + } + + private var userTile: some View { + VStack(spacing: 5) { + ZStack { + Rectangle() + .fill(.white.opacity(0.05)) + .frame(width: 32, height: 32) + if appState.cameraEnabled && appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(width: 32, height: 32) + .clipped() + } else { + Image(systemName: "person.fill") + .font(.system(size: 13)) + .foregroundColor(.white.opacity(0.35)) + } + } + + Text("YOU") + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + .tracking(1) + + // Mic bars + HStack(spacing: 2) { + ForEach(0..<3, id: \.self) { _ in + Rectangle() + .fill(Color.white.opacity(appState.isListening ? 
0.5 : 0.1)) + .frame(width: 2, height: 5) + } + } + } + .frame(maxWidth: .infinity) + .frame(height: 76) + .overlay(alignment: .top) { + Rectangle().fill(.white.opacity(0.2)).frame(height: 2) + } + } + + // MARK: - Tasks + + private var pendingTasks: [StructuredTodo] { + Array(appState.structuredTodos.filter { !$0.isExecuted }.prefix(3)) + } + + private func taskSection(_ tasks: [StructuredTodo]) -> some View { + VStack(spacing: 0) { + HStack { + sectionLabel("TASKS") + .padding(.horizontal, 14) + .padding(.vertical, 7) + Spacer() + Text("\(tasks.count) pending") + .font(.system(size: 7, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + .padding(.trailing, 14) + } + .background(surf) + + rowDivider + + VStack(spacing: 0) { + ForEach(Array(tasks.enumerated()), id: \.element.id) { i, todo in + HStack(spacing: 10) { + Rectangle() + .fill(i == 0 ? Color.orange : .white.opacity(0.12)) + .frame(width: 2, height: 14) + Image(systemName: i == 0 ? "arrow.right" : "circle") + .font(.system(size: 8)) + .foregroundColor(i == 0 ? .orange : .white.opacity(0.2)) + Text(todo.content) + .font(.system(size: 10)) + .foregroundColor(i == 0 ? .white.opacity(0.82) : .white.opacity(0.35)) + .lineLimit(1) + Spacer() + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(i == 0 ? 
Color.orange.opacity(0.04) : Color.clear) + + if i < tasks.count - 1 { + Rectangle() + .fill(border) + .frame(maxWidth: .infinity, maxHeight: 0.5) + .padding(.leading, 14) + } + } + } + } + } + + // MARK: - Stream + + private var streamHeader: some View { + HStack { + sectionLabel("STREAM") + Spacer() + if appState.callModeSession.isProcessing { + HStack(spacing: 5) { + ProgressView().controlSize(.mini).tint(.teal) + Text("thinking...") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.teal.opacity(0.65)) + } + } + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(surf) + } + + private var streamFeed: some View { + ScrollViewReader { proxy in + ScrollView { + LazyVStack(spacing: 0) { + ForEach(appState.callModeSession.messages) { msg in + BrutalistChatMessage( + msg: msg, + color: participantColor(for: msg), + name: participantName(for: msg) + ) + .id(msg.id) + } + + // Live user speech + if !appState.liveTranscriptText.isEmpty { + BrutalistChatMessage( + msg: CallMessage( + role: .user, + text: appState.liveTranscriptText, + participantName: "you" + ), + color: .white, + name: "YOU" + ) + .id("live") + } + + // Empty state + if appState.callModeSession.messages.isEmpty && appState.liveTranscriptText.isEmpty { + VStack(spacing: 8) { + Rectangle() + .fill(.white.opacity(0.04)) + .frame(width: 1, height: 32) + .padding(.top, 24) + Text("speak or start a claude code session") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.14)) + Rectangle() + .fill(.white.opacity(0.04)) + .frame(width: 1, height: 24) + } + .frame(maxWidth: .infinity) + .padding(.bottom, 24) + } + } + } + .frame(maxHeight: .infinity) + .onChange(of: appState.callModeSession.messages.count) { _ in + withAnimation(.easeOut(duration: 0.2)) { + if let last = appState.callModeSession.messages.last { + proxy.scrollTo(last.id, anchor: .bottom) + } + } + } + .onChange(of: appState.liveTranscriptText) { _ in + withAnimation { 
proxy.scrollTo("live", anchor: .bottom) } + } + } + } + + // MARK: - Spotlight + + @ViewBuilder + private var spotlightPanel: some View { + VStack(spacing: 0) { + HStack(spacing: 7) { + Rectangle() + .fill(Color.yellow.opacity(0.6)) + .frame(width: 2, height: 10) + sectionLabel("SPOTLIGHT") + Spacer() + Button(action: { withAnimation { spotlightImage = nil; spotlightFile = nil } }) { + Image(systemName: "xmark") + .font(.system(size: 7)) + .foregroundColor(.white.opacity(0.2)) + } + .buttonStyle(.plain) + .padding(.trailing, 14) + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(surf) + + rowDivider + + if let img = spotlightImage { + Image(nsImage: img) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: 190) + .background(Color.black) + } else if let file = spotlightFile { + HStack(spacing: 12) { + Rectangle() + .fill(Color.yellow.opacity(0.08)) + .frame(width: 32, height: 32) + .overlay( + Image(systemName: fileIcon(file)) + .font(.system(size: 14)) + .foregroundColor(.yellow.opacity(0.55)) + ) + + VStack(alignment: .leading, spacing: 3) { + Text(fileName(file)) + .font(.system(size: 11, design: .monospaced)) + .foregroundColor(.white.opacity(0.75)) + Text(fileCategory(file)) + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + } + + Spacer() + + Text("reading") + .font(.system(size: 8, weight: .semibold, design: .monospaced)) + .foregroundColor(.orange) + .padding(.horizontal, 7) + .padding(.vertical, 4) + .background(Rectangle().fill(Color.orange.opacity(0.10))) + } + .padding(.horizontal, 14) + .padding(.vertical, 10) + } + } + .transition(.asymmetric( + insertion: .move(edge: .bottom).combined(with: .opacity), + removal: .opacity + )) + .animation(.easeOut(duration: 0.2), value: spotlightFile) + .animation(.easeOut(duration: 0.2), value: spotlightImage != nil) + } + + // MARK: - Bottom bar + + private var bottomBar: some View { + HStack(spacing: 14) { + // Waveform 
+ HStack(spacing: 2) { + ForEach(0..<10, id: \.self) { i in + Rectangle() + .fill(Color.green.opacity(appState.isListening ? 0.7 : 0.18)) + .frame(width: 2, height: waveH(i)) + .animation(.easeInOut(duration: 0.1), value: appState.audioLevel) + } + } + .frame(width: 30) + + Spacer() + + if !appState.callModeSession.messages.isEmpty { + Text("\(appState.callModeSession.messages.count) events") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + } + + // End call — brutalist: red square + Button { + appState.stopListening() + appState.pillMode = .ambientIntelligence + } label: { + HStack(spacing: 6) { + Rectangle() + .fill(.white) + .frame(width: 7, height: 7) + Text("END CALL") + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .tracking(1) + } + .foregroundColor(.white) + .padding(.horizontal, 12) + .padding(.vertical, 7) + .background(Rectangle().fill(Color.red.opacity(0.85))) + } + .buttonStyle(.plain) + } + .padding(.horizontal, 14) + .padding(.vertical, 10) + .background(surf) + } + + // MARK: - Shared helpers + + private var rowDivider: some View { + Rectangle() + .fill(border) + .frame(maxWidth: .infinity, maxHeight: 1) + } + + private func sectionLabel(_ text: String) -> some View { + Text(text) + .font(.system(size: 8, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.25)) + .tracking(2) + } + + private func participantColor(for msg: CallMessage) -> Color { + if let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.tileColor + } + switch msg.role { + case .user: return .white + case .assistant: return .teal + case .external: return .orange + case .tool: return .yellow + case .error: return .red + case .participant: return .purple + } + } + + private func participantName(for msg: CallMessage) -> String { + if let name = msg.participantName { return name.uppercased() } + switch msg.role { + case .user: return "YOU" + case 
.assistant: return "AUTOCLAWD" + case .external: return "CLAWD" + case .tool: return "TOOL" + case .error: return "ERROR" + case .participant: return "AGENT" + } + } + + // MARK: - Spotlight helpers + + private func updateSpotlight() { + let msgs = appState.callModeSession.messages + for msg in msgs.suffix(6).reversed() { + if let data = msg.imageData, let img = NSImage(data: data) { + withAnimation { spotlightImage = img; spotlightFile = nil } + return + } + } + for msg in msgs.suffix(6).reversed() { + if let file = extractFilename(from: msg.text) { + withAnimation { spotlightFile = file; spotlightImage = nil } + return + } + } + } + + private func extractFilename(from text: String) -> String? { + // Matches things like "Sources/Foo.swift" or "foo.ts" etc. + let pat = #"[\w\-./]+\.(swift|ts|tsx|py|js|json|md|yaml|yml|sh|go|rs|kt|css|html)"# + guard let range = text.range(of: pat, options: .regularExpression), + text[range].count > 5 else { return nil } + return String(text[range]) + } + + private func fileIcon(_ path: String) -> String { + switch URL(fileURLWithPath: path).pathExtension.lowercased() { + case "swift": return "swift" + case "ts", "tsx": return "t.square" + case "py": return "terminal" + case "js": return "j.square" + case "json": return "curlybraces" + case "md": return "text.alignleft" + case "sh": return "terminal.fill" + default: return "doc.text" + } + } + + private func fileName(_ path: String) -> String { URL(fileURLWithPath: path).lastPathComponent } + + private func fileCategory(_ path: String) -> String { + switch URL(fileURLWithPath: path).pathExtension.lowercased() { + case "swift": return "swift source" + case "ts", "tsx": return "typescript" + case "py": return "python" + case "js": return "javascript" + case "json": return "configuration" + case "md": return "documentation" + default: return "source file" + } + } + + private func waveH(_ i: Int) -> CGFloat { + guard appState.isListening else { return 3 } + let ph = Double(i) * 0.8 + 
return 3 + (sin(ph + Double(appState.audioLevel) * 6) * 0.5 + 0.5) + * CGFloat(appState.audioLevel) * 12 + } + + private func formatDuration(_ sec: Int) -> String { + String(format: "%d:%02d", sec / 60, sec % 60) + } +} + +// MARK: - ParticipantBrutalistTile + +private struct ParticipantBrutalistTile: View { + let participant: CallParticipant + let isActive: Bool + let onTap: () -> Void + + private var color: Color { participant.tileColor } + + var body: some View { + Button(action: onTap) { + VStack(spacing: 5) { + // Square icon/mascot — NO circles + ZStack { + Rectangle() + .fill(color.opacity(isActive ? 0.12 : 0.06)) + .frame(width: 34, height: 34) + + if let ns = NSImage(named: "mascot-\(participant.id)") { + Image(nsImage: ns) + .resizable() + .scaledToFit() + .frame(width: 24, height: 24) + } else { + Image(systemName: participant.mascotSystemImage) + .font(.system(size: 14, weight: .medium)) + .foregroundColor(participant.isPaused ? .gray.opacity(0.3) : color) + } + + // Activity border (square pulse instead of circle) + if participant.state == .streaming || participant.state == .thinking { + Rectangle() + .stroke(color.opacity(0.55), lineWidth: 1) + .frame(width: 38, height: 38) + } + } + + Text(participant.displayName.uppercased()) + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(isActive ? color : .white.opacity(0.28)) + .tracking(0.5) + .lineLimit(1) + + Text(stateText) + .font(.system(size: 7, design: .monospaced)) + .foregroundColor(stateColor.opacity(0.7)) + } + .frame(maxWidth: .infinity) + .frame(height: 76) + .background(isActive ? color.opacity(0.055) : Color.clear) + // Thick top accent bar + .overlay(alignment: .top) { + Rectangle() + .fill(color) + .frame(height: isActive ? 3 : 1) + .opacity(isActive ? 
1.0 : 0.3) + } + // Side/bottom border + .overlay( + Rectangle() + .stroke(Color.white.opacity(0.05), lineWidth: 0.5) + ) + } + .buttonStyle(.plain) + } + + private var stateText: String { + if participant.isPaused { return "paused" } + switch participant.state { + case .idle: return "idle" + case .thinking: return "thinking" + case .streaming: return "working" + case .paused: return "paused" + } + } + + private var stateColor: Color { + switch participant.state { + case .streaming: return color + case .thinking: return .yellow + default: return .white.opacity(0.3) + } + } +} + +// MARK: - BrutalistChatMessage + +private struct BrutalistChatMessage: View { + let msg: CallMessage + let color: Color + let name: String + + private static let timeFmt: DateFormatter = { + let f = DateFormatter() + f.dateFormat = "H:mm" + return f + }() + + var body: some View { + VStack(alignment: .leading, spacing: 0) { + // NAME ─────────────── time + HStack(spacing: 8) { + Text(name) + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(color.opacity(msg.isGenerated ? 0.55 : 1.0)) + + if msg.isGenerated { + Text("~") + .font(.system(size: 9, design: .monospaced)) + .italic() + .foregroundColor(color.opacity(0.4)) + } + + Rectangle() + .fill(color.opacity(msg.isGenerated ? 0.12 : 0.22)) + .frame(maxWidth: .infinity, maxHeight: 1) + + Text(Self.timeFmt.string(from: msg.createdAt)) + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + .monospacedDigit() + } + .padding(.horizontal, 14) + .padding(.top, 12) + + // Message text + if !msg.text.isEmpty { + Text(msg.text) + .font(.system(size: 12)) + .foregroundColor(.white.opacity(msg.isGenerated ? 0.50 : 0.82)) + .fixedSize(horizontal: false, vertical: true) + .textSelection(.enabled) + .padding(.horizontal, 14) + .padding(.top, 5) + .padding(.bottom, msg.imageData != nil ? 6 : 12) + } + + // Inline image (Pencil screenshot, ScreenGrab, etc.) 
+ if let data = msg.imageData, let ns = NSImage(data: data) { + Image(nsImage: ns) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: 200) + .background(Color.black) + .overlay( + Rectangle() + .stroke(color.opacity(0.22), lineWidth: 0.5) + ) + .padding(.horizontal, 14) + .padding(.bottom, 12) + } + + // Bottom rule + Rectangle() + .fill(Color.white.opacity(0.04)) + .frame(maxWidth: .infinity, maxHeight: 1) + } + .opacity(msg.isGenerated ? 0.70 : 1.0) + } +} diff --git a/Sources/HookNarrationService.swift b/Sources/HookNarrationService.swift index afd4834..1ee76e6 100644 --- a/Sources/HookNarrationService.swift +++ b/Sources/HookNarrationService.swift @@ -18,19 +18,45 @@ struct HookEvent { var isPreTool: Bool { eventName == "PreToolUse" } } +// MARK: - ToolParticipant + +/// A tool (MCP server) that is used as a participant in the call room feed. +struct ToolParticipant { + let id: String // e.g. "pencil" + let name: String // e.g. "Pencil" + let systemImage: String // SF symbol +} + +// MARK: - NarrationBundle + +/// Rich narration package returned for each hook event. +/// Contains everything needed to post a multi-message "conversation" in the call feed. +struct NarrationBundle { + /// What Claw'd says (real, solid-border message). + let narration: String + /// The tool participant derived from the MCP tool name (nil for built-in tools). + let toolParticipant: ToolParticipant? + /// Short summary of the tool's response text (optional). + let toolResponseText: String? + /// Image bytes extracted from the tool response (e.g. Pencil screenshot). + let imageData: Data? + /// AutoClawd's generated reaction — rendered as a dashed/faint generated message. + let autoClawdReaction: String? +} + // MARK: - HookNarrationService -/// Translates raw Claude Code hook events into one-sentence human-readable narratives. +/// Translates raw Claude Code hook events into NarrationBundles for the call-room feed. /// /// Pipeline: /// 1.
Parse the raw JSON from `/hook` into a `HookEvent`. -/// 2. `narrate(_:)` builds a compact summary string. -/// 3. If Ollama is reachable, sends a short prompt to Llama and returns its response. -/// 4. If Ollama is unavailable or slow, falls back to the template summary. +/// 2. Extract tool participant (for MCP tools like `mcp__pencil__*`). +/// 3. Extract image data from the tool response if present. +/// 4. `narrate(_:)` calls Ollama Llama for a short narration; falls back to template. +/// 5. Optionally generates an AutoClawd reaction via a second Ollama call. /// -/// The narrated sentence is then shown in the call-room feed attributed to the -/// Claude Code participant tile — turning the call view into a live running -/// commentary of what Claude Code is doing. +/// The bundle is then used by AppDelegate to auto-join tool participants and post +/// a two-sided conversation in the feed (Claw'd speaks, tool responds, AutoClawd reacts). final class HookNarrationService: @unchecked Sendable { private let ollama = OllamaService() @@ -53,16 +79,24 @@ final class HookNarrationService: @unchecked Sendable { // MARK: - Narrate - /// Returns a short, natural-language sentence describing what just happened. - /// Uses Llama if available; falls back to a template description otherwise. - func narrate(_ event: HookEvent) async -> String { - // Stop events don't need an LLM call + /// Returns a NarrationBundle describing what just happened. + func narrate(_ event: HookEvent) async -> NarrationBundle { if event.isStop { - return "Claw'd finished the task." 
+ return NarrationBundle( + narration: "Claw'd finished the task.", + toolParticipant: nil, + toolResponseText: nil, + imageData: nil, + autoClawdReaction: nil + ) } - let summary = templateSummary(event) + let tp = HookNarrationService.toolParticipant(from: event.toolName) + let imageData = HookNarrationService.extractImageData(from: event.toolResponse) + let summary = templateSummary(event) + // Ask Llama for a casual narration sentence + var narration = summary do { let prompt = """ You are a narrator watching an AI coding assistant named "Claw'd" work. \ @@ -74,17 +108,119 @@ final class HookNarrationService: @unchecked Sendable { Narration: """ - var narration = try await ollama.generate(prompt: prompt, numPredict: 60) - // Strip any trailing artefacts that Llama sometimes adds - narration = narration - .components(separatedBy: "\n").first ?? narration - narration = narration + var llm = try await ollama.generate(prompt: prompt, numPredict: 60) + llm = llm.components(separatedBy: "\n").first ?? llm + llm = llm + .trimmingCharacters(in: .whitespacesAndNewlines) + .trimmingCharacters(in: CharacterSet(charactersIn: "\"'")) + if !llm.isEmpty { narration = llm } + } catch {} + + // Generate AutoClawd reaction for interesting MCP events or image results + var reaction: String? = nil + if tp != nil || imageData != nil { + reaction = await generateAutoClawdReaction(for: narration, hasImage: imageData != nil) + } + + return NarrationBundle( + narration: narration, + toolParticipant: tp, + toolResponseText: nil, + imageData: imageData, + autoClawdReaction: reaction + ) + } + + // MARK: - Tool Participant Extraction + + /// Parses `mcp__server__tool` tool names and returns a ToolParticipant. + static func toolParticipant(from toolName: String?) -> ToolParticipant? 
{ + guard let name = toolName, name.hasPrefix("mcp__") else { return nil } + let parts = name.components(separatedBy: "__") + guard parts.count >= 2 else { return nil } + let serverID = parts[1] + return ToolParticipant( + id: serverID, + name: serverID.capitalized, + systemImage: systemImageForServer(serverID) + ) + } + + private static func systemImageForServer(_ server: String) -> String { + switch server { + case "pencil": return "paintpalette" + case "figma": return "rectangle.3.group" + case "github": return "cat" + case "linear": return "square.and.pencil" + case "notion": return "doc.text" + case "googlesheets": return "tablecells" + default: return "cable.connector" + } + } + + // MARK: - Image Extraction + + /// Extracts raw JPEG/PNG bytes from a tool_response JSON payload. + /// Handles several content formats used by MCP tools. + static func extractImageData(from toolResponse: [String: Any]?) -> Data? { + guard let resp = toolResponse else { return nil } + + // Pattern 1: {"type": "image", "data": "base64..."} (direct) + if resp["type"] as? String == "image", + let dataStr = resp["data"] as? String, + let d = Data(base64Encoded: dataStr) { + return d + } + + // Pattern 2: {"content": [{"type": "image", "data": "..."}]} + if let content = resp["content"] as? [[String: Any]] { + for item in content { + if item["type"] as? String == "image" { + if let dataStr = item["data"] as? String, + let d = Data(base64Encoded: dataStr) { return d } + if let source = item["source"] as? [String: Any], + let dataStr = source["data"] as? String, + let d = Data(base64Encoded: dataStr) { return d } + } + } + } + + // Pattern 3: {"result": [{"type": "image", "source": {"data": "..."}}]} + if let result = resp["result"] as? [[String: Any]] { + for item in result { + if item["type"] as? String == "image", + let source = item["source"] as? [String: Any], + let dataStr = source["data"] as? 
String, + let d = Data(base64Encoded: dataStr) { return d } + } + } + + return nil + } + + // MARK: - AutoClawd Reaction + + private func generateAutoClawdReaction(for narration: String, hasImage: Bool) async -> String? { + let imageHint = hasImage ? " A screenshot or image was returned." : "" + let prompt = """ + You are AutoClawd, an AI project manager watching Claw'd (your coding AI) work. + In one casual, short sentence (8–12 words), react to what just happened.\(imageHint) + Be observant — sometimes curious, sometimes encouraging, sometimes dry. + Never start with "I". Don't add quotes. + + What happened: \(narration) + + Your reaction: + """ + do { + var reaction = try await ollama.generate(prompt: prompt, numPredict: 40) + reaction = reaction.components(separatedBy: "\n").first ?? reaction + reaction = reaction .trimmingCharacters(in: .whitespacesAndNewlines) .trimmingCharacters(in: CharacterSet(charactersIn: "\"'")) - return narration.isEmpty ? summary : narration + return reaction.isEmpty ? 
nil : reaction } catch { - // Ollama not running — use template - return summary + return nil } } @@ -167,6 +303,13 @@ final class HookNarrationService: @unchecked Sendable { return "Editing a notebook cell" default: + // MCP tools: mcp__server__action -> "Using Pencil: batch_design" + if tool.hasPrefix("mcp__") { + let parts = tool.components(separatedBy: "__") + if parts.count >= 3 { + return "Using \(parts[1].capitalized): \(parts[2...].joined(separator: "_"))" + } + } return "Using \(tool)" } } diff --git a/Sources/SettingsManager.swift b/Sources/SettingsManager.swift index ae5b35f..9d2e78c 100644 --- a/Sources/SettingsManager.swift +++ b/Sources/SettingsManager.swift @@ -138,7 +138,8 @@ final class SettingsManager: @unchecked Sendable { private let kFaceTrackingEnabled = "face_tracking_enabled" private let kGestureHoldDuration = "gesture_hold_duration" private let kCameraAnalysisFPS = "camera_analysis_fps" - private let kSelectedCameraDeviceID = "selected_camera_device_id" + private let kSelectedCameraDeviceID = "selected_camera_device_id" + private let kCallStreamWidgetEnabled = "call_stream_widget_enabled" // MARK: - Properties @@ -388,5 +389,14 @@ final class SettingsManager: @unchecked Sendable { set { defaults.set(newValue, forKey: kSelectedCameraDeviceID) } } + // MARK: - Call Stream Widget + + /// When true, the floating Call Stream Widget appears whenever call mode is active. + /// Toggle in Settings → Call Mode. + var callStreamWidgetEnabled: Bool { + get { defaults.object(forKey: kCallStreamWidgetEnabled) as? Bool ?? true } + set { defaults.set(newValue, forKey: kCallStreamWidgetEnabled) } + } + private init() {} }