diff --git a/Sources/AppDelegate.swift b/Sources/AppDelegate.swift index 8ba6fcc..55805cf 100644 --- a/Sources/AppDelegate.swift +++ b/Sources/AppDelegate.swift @@ -12,6 +12,7 @@ final class AppDelegate: NSObject, NSApplicationDelegate { private var mainPanel: MainPanelWindow? private var toastWindow: ToastWindow? private var setupWindow: SetupWindow? + private var callStreamWidget: CallStreamWidget? private var toastDismissWork: DispatchWorkItem? private var cancellables = Set() @@ -33,6 +34,106 @@ final class AppDelegate: NSObject, NSApplicationDelegate { // recording attempt — preventing the first chunk from silently failing. requestPermissionsUpfront() + // Start embedded MCP server so any Claude Code session can call screen-grab tools. + // Configure Claude Code with: { "mcpServers": { "autoclawd": { "type": "http", "url": "http://localhost:7892/mcp" } } } + let screenGrab = ScreenGrabService() + MCPServer.shared.start( + screenGrab: screenGrab, + transcriptProvider: { [weak self] in self?.appState.liveTranscriptText ?? "" }, + isPausedProvider: { [weak self] in !(self?.appState.callRoom.claudeCodeIsActive ?? 
true) }, + canvasWriter: { [weak self] text in self?.appState.callModeSession.appendExternalMessage(text) }, + onJoined: { [weak self] in self?.appState.callRoom.claudeCodeJoined() }, + onLeft: { [weak self] in self?.appState.callRoom.claudeCodeLeft() }, + onInviteParticipant: { [weak self] id, name, icon in + self?.appState.callRoom.connectionJoined(id: id, name: name, systemImage: icon) + }, + onSetParticipantState: { [weak self] id, stateStr in + guard let room = self?.appState.callRoom else { return } + let state: ParticipantState + switch stateStr { + case "thinking": state = .thinking + case "streaming": state = .streaming + case "paused": state = .paused + default: state = .idle + } + room.setState(state, for: id) + }, + onParticipantMessage: { [weak self] id, name, text in + self?.appState.callModeSession.appendParticipantMessage(id: id, name: name, text: text) + }, + onRemoveParticipant: { [weak self] id in + self?.appState.callRoom.remove(id: id) + }, + onHookEvent: { [weak self] event in + guard let self else { return } + // When Claude Code fires a hook, build a NarrationBundle via Llama and post + // a two-sided conversation to the call-room feed: + // 1. Claw'd tile — real tool narration (solid border) + // 2. Tool participant tile — auto-joined; optional image response + // 3. AutoClawd tile — generated reaction comment (dashed/faint border) + let room = self.appState.callRoom + let session = self.appState.callModeSession + + // Ensure Claw'd is in the room (hooks can arrive before MCP initialize). + room.claudeCodeJoined() + room.setState(.thinking, for: "claude-code") + + Task { + let bundle = await HookNarrationService().narrate(event) + await MainActor.run { + // Auto-join MCP tool participant (e.g. 
"pencil", "figma") + if let tp = bundle.toolParticipant { + room.connectionJoined(id: tp.id, name: tp.name, systemImage: tp.systemImage) + } + + // Post Claw'd's real narration (solid left bar) + session.appendParticipantMessage( + id: "claude-code", + name: "Claw'd", + text: bundle.narration, + isGenerated: false + ) + room.setState(event.isStop ? .idle : .streaming, for: "claude-code") + + // Post tool response if it includes an image or text + if let tp = bundle.toolParticipant { + if bundle.imageData != nil || bundle.toolResponseText != nil { + room.setState(.streaming, for: tp.id) + session.appendParticipantMessage( + id: tp.id, + name: tp.name, + text: bundle.toolResponseText ?? "", + imageData: bundle.imageData, + isGenerated: false + ) + room.setState(.idle, for: tp.id) + } else { + room.updateLastActivity(id: tp.id) + } + } + + // Post AutoClawd's generated reaction (dashed/faint left bar) + if let reaction = bundle.autoClawdReaction { + session.appendParticipantMessage( + id: "llama", + name: "AutoClawd", + text: reaction, + isGenerated: true + ) + } + } + } + } + ) + + // Auto-register Claude Code hooks so PostToolUse events arrive at /hook. + MCPConfigManager.writeHooksConfig() + + // Configure call mode session with the same transcript provider. + appState.callModeSession.configure( + transcriptProvider: { [weak self] in self?.appState.liveTranscriptText ?? "" } + ) + // Toast window disabled — logs are now shown inline inside the widget. // AutoClawdLogger.toastPublisher // .receive(on: DispatchQueue.main) @@ -55,6 +156,34 @@ final class AppDelegate: NSObject, NSApplicationDelegate { } } .store(in: &cancellables) + + // Show/hide the Call Stream Widget when pill mode changes. 
+ appState.$pillMode + .receive(on: DispatchQueue.main) + .sink { [weak self] mode in + guard let self else { return } + if mode == .callMode && SettingsManager.shared.callStreamWidgetEnabled { + self.showCallStreamWidget() + } else { + self.callStreamWidget?.animateOut() + } + } + .store(in: &cancellables) + } + + // MARK: - Call Stream Widget + + private func showCallStreamWidget() { + if callStreamWidget == nil { + let widget = CallStreamWidget() + let view = CallStreamWidgetView(appState: appState) { [weak self] in + self?.appState.pillMode = .ambientIntelligence + self?.callStreamWidget?.animateOut() + } + widget.setContent(view) + callStreamWidget = widget + } + callStreamWidget?.animateIn() } func applicationWillTerminate(_ notification: Notification) { @@ -721,6 +850,9 @@ struct PillContentView: View { typedText: typed ) return AnyView(v) + case .callMode: + let v = CallModeCanvasView(session: appState.callModeSession) + return AnyView(v) } } } diff --git a/Sources/AppState.swift b/Sources/AppState.swift index ddf491e..1512838 100644 --- a/Sources/AppState.swift +++ b/Sources/AppState.swift @@ -209,6 +209,10 @@ final class AppState: ObservableObject { @Published var sessionLifecycle: SessionLifecycleState = .undefined @Published var sessionConfig: SessionConfig? + // Call mode state + let callModeSession = CallModeSession() + let callRoom = CallRoom() + // Code widget state @Published var codeWidgetStep: CodeWidgetStep = .projectSelect @Published var codeSelectedProject: Project? 
= nil @@ -845,6 +849,10 @@ final class AppState: ObservableObject { // Select project during review: 0 = None, 1..N = project by position selectReviewProjectByIndex(count) Log.info(.camera, "Gesture: review project index \(count) selected") + } else if pillMode == .callMode { + // In call mode, finger count addresses a participant + callRoom.selectByGesture(fingerCount: count) + Log.info(.camera, "Gesture: call mode participant \(count) selected") } else if showOptionSelector { selectOption(index: count) Log.info(.camera, "Gesture: option \(count) selected (left fingers)") diff --git a/Sources/CallModeRoomView.swift b/Sources/CallModeRoomView.swift new file mode 100644 index 0000000..300d76a --- /dev/null +++ b/Sources/CallModeRoomView.swift @@ -0,0 +1,619 @@ +import SwiftUI +import Combine + +// MARK: - CallModeRoomView + +/// Full-panel Call Mode UI — participant tiles, shared feed, session controls. +/// Replaces PixelWorldView in the World tab when pillMode == .callMode. +struct CallModeRoomView: View { + @ObservedObject var appState: AppState + + var body: some View { + VStack(spacing: 0) { + participantsRow + Divider().background(Color.white.opacity(0.07)) + callFeed + Divider().background(Color.white.opacity(0.07)) + bottomBar + } + .background(Color.black) + // Route left-hand finger count → participant selection + .onReceive(appState.$lastConfirmedGesture.compactMap { $0 }) { gesture in + if case .leftFingerCount(let count) = gesture { + appState.callRoom.selectByGesture(fingerCount: count) + } + } + } + + // MARK: - Participants Row + + private var participantsRow: some View { + ScrollView(.horizontal, showsIndicators: false) { + HStack(alignment: .top, spacing: 10) { + ForEach(appState.callRoom.participants) { participant in + ParticipantTileView( + participant: participant, + isActive: participant.id == appState.callRoom.activeParticipantID, + onTap: { appState.callRoom.activeParticipantID = participant.id }, + onPause: { 
appState.callRoom.togglePause(id: participant.id) }, + onRemove: participant.kind == .llama ? nil + : { appState.callRoom.remove(id: participant.id) } + ) + } + inviteButton + } + .padding(.horizontal, 16) + .padding(.vertical, 12) + } + .frame(height: 148) + } + + private var inviteButton: some View { + VStack(spacing: 6) { + Spacer() + ZStack { + Circle() + .fill(Color.white.opacity(0.05)) + .frame(width: 52, height: 52) + .overlay(Circle().stroke(Color.white.opacity(0.15), lineWidth: 0.8)) + Image(systemName: "plus") + .font(.system(size: 16, weight: .medium)) + .foregroundColor(.white.opacity(0.3)) + } + Text("Invite") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + Spacer() + } + .frame(width: 80, height: 124) + } + + // MARK: - Call Feed + + private var callFeed: some View { + ZStack(alignment: .bottom) { + ScrollViewReader { proxy in + ScrollView { + LazyVStack(alignment: .leading, spacing: 8) { + // Transcript from user's voice — live session text + if !appState.liveTranscriptText.isEmpty { + CallFeedMessageRow( + participantName: "You", + color: .white, + icon: "mic.fill", + text: appState.liveTranscriptText + ) + .id("transcript") + } + // Messages from participants (Claude Code / external / plugin) + ForEach(appState.callModeSession.messages) { msg in + CallFeedMessageRow( + participantName: msg.participantName ?? 
feedLabel(for: msg.role), + color: participantFeedColor(for: msg), + icon: participantFeedIcon(for: msg), + text: msg.text, + imageData: msg.imageData, + isGenerated: msg.isGenerated + ) + .id(msg.id) + } + } + .padding(14) + } + .onChange(of: appState.callModeSession.messages.count) { _ in + if let last = appState.callModeSession.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + .onChange(of: appState.liveTranscriptText) { _ in + withAnimation { proxy.scrollTo("transcript", anchor: .bottom) } + } + } + + // Processing indicator + if appState.callModeSession.isProcessing { + HStack(spacing: 6) { + ProgressView().controlSize(.mini).tint(.cyan) + Text("Thinking…") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.7)) + } + .padding(.horizontal, 12) + .padding(.vertical, 6) + .background(Capsule().fill(Color.black.opacity(0.8))) + .padding(.bottom, 8) + } + + // Empty state + if appState.callModeSession.messages.isEmpty && appState.liveTranscriptText.isEmpty { + VStack(spacing: 8) { + Image(systemName: "waveform") + .font(.system(size: 28)) + .foregroundColor(.white.opacity(0.1)) + Text("CALL ACTIVE") + .font(.system(size: 10, weight: .semibold, design: .monospaced)) + .foregroundColor(.white.opacity(0.15)) + Text("Speak to start — use left fingers to address participants") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.1)) + .multilineTextAlignment(.center) + } + .padding(.horizontal, 32) + .frame(maxWidth: .infinity) + .padding(.vertical, 32) + } + } + } + + // MARK: - Bottom Bar + + private var bottomBar: some View { + HStack(spacing: 12) { + // Camera preview (small) + cameraThumb + // Screen preview (small) + screenThumb + Spacer() + // Session controls + sessionControls + Spacer() + // Addressing indicator + addressingIndicator + } + .padding(.horizontal, 16) + .padding(.vertical, 10) + .background(Color.white.opacity(0.025)) + } + + private var cameraThumb: 
some View { + ZStack { + RoundedRectangle(cornerRadius: 8) + .fill(Color.black) + .frame(width: 64, height: 44) + if appState.cameraEnabled && appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(width: 64, height: 44) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } else { + Image(systemName: "camera.slash") + .font(.system(size: 12)) + .foregroundColor(.white.opacity(0.2)) + } + } + .overlay(RoundedRectangle(cornerRadius: 8).stroke(Color.white.opacity(0.1), lineWidth: 0.5)) + } + + private var screenThumb: some View { + ZStack { + RoundedRectangle(cornerRadius: 8) + .fill(Color.black) + .frame(width: 64, height: 44) + if let img = appState.screenPreviewImage { + Image(decorative: img, scale: 1.0) + .resizable() + .aspectRatio(contentMode: .fill) + .frame(width: 64, height: 44) + .clipShape(RoundedRectangle(cornerRadius: 8)) + } else { + Image(systemName: "rectangle.on.rectangle") + .font(.system(size: 12)) + .foregroundColor(.white.opacity(0.2)) + } + } + .overlay(RoundedRectangle(cornerRadius: 8).stroke(Color.white.opacity(0.1), lineWidth: 0.5)) + } + + private var sessionControls: some View { + HStack(spacing: 14) { + // Play / Pause toggle + Button { + if appState.isListening { appState.stopListening() } + else { appState.startListening() } + } label: { + Image(systemName: appState.isListening ? "pause.fill" : "play.fill") + .font(.system(size: 14, weight: .semibold)) + .foregroundColor(appState.isListening ? 
.white : .green) + .frame(width: 32, height: 32) + .background(Circle().fill(Color.white.opacity(0.08))) + } + .buttonStyle(.plain) + + // Stop — end call + Button { + appState.stopListening() + appState.pillMode = .ambientIntelligence + } label: { + Image(systemName: "stop.fill") + .font(.system(size: 12, weight: .semibold)) + .foregroundColor(.red.opacity(0.8)) + .frame(width: 32, height: 32) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + } + + private var addressingIndicator: some View { + HStack(spacing: 5) { + if let active = appState.callRoom.activeParticipant { + Text(slotEmoji(active.gestureSlot)) + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.3)) + Image(systemName: active.mascotSystemImage) + .font(.system(size: 10)) + .foregroundColor(active.tileColor) + Text(active.displayName) + .font(.system(size: 9, weight: .medium, design: .monospaced)) + .foregroundColor(.white.opacity(0.5)) + } + } + } + + // MARK: - Helpers + + private func slotEmoji(_ slot: Int) -> String { + let circled = ["①","②","③","④","⑤"] + guard slot >= 1, slot <= circled.count else { return "\(slot)" } + return circled[slot - 1] + } + + private func feedLabel(for role: CallMessage.Role) -> String { + switch role { + case .user: return "You" + case .assistant: return "AutoClawd" + case .tool: return "Tool" + case .error: return "Error" + case .external: return "Claw'd" + case .participant: return "Plugin" + } + } + + private func feedColor(for role: CallMessage.Role) -> Color { + switch role { + case .user: return .white + case .assistant: return .teal + case .tool: return .yellow + case .error: return .red + case .external: return .orange + case .participant: return .purple + } + } + + private func feedIcon(for role: CallMessage.Role) -> String { + switch role { + case .user: return "mic.fill" + case .assistant: return "brain" + case .tool: return "wrench.adjustable"
+ case .error: return "exclamationmark.triangle" + case .external: return "terminal" + case .participant: return "cable.connector" + } + } + + /// Feed color for a message — uses the participant's tile color when available. + private func participantFeedColor(for msg: CallMessage) -> Color { + if msg.role == .participant, let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.tileColor + } + return feedColor(for: msg.role) + } + + /// Feed icon for a message — uses the participant's mascot icon when available. + private func participantFeedIcon(for msg: CallMessage) -> String { + if msg.role == .participant, let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.mascotSystemImage + } + return feedIcon(for: msg.role) + } +} + +// MARK: - ParticipantTileView + +struct ParticipantTileView: View { + let participant: CallParticipant + let isActive: Bool + let onTap: () -> Void + let onPause: () -> Void + let onRemove: (() -> Void)? + + private var tileColor: Color { participant.tileColor } + + var body: some View { + VStack(spacing: 5) { + // Top row: gesture slot + controls + HStack(spacing: 4) { + slotBadge + Spacer() + pauseButton + if let rm = onRemove { removeButton(action: rm) } + } + + // Mascot + ParticipantMascotView( + kind: participant.kind, + state: participant.state, + isPaused: participant.isPaused, + tileColor: participant.tileColor + ) + .frame(width: 54, height: 54) + + // Name + Text(participant.displayName) + .font(.system(size: 9, weight: .semibold, design: .monospaced)) + .foregroundColor(isActive ? .white : .white.opacity(0.4)) + .lineLimit(1) + + // State label + stateLabel + } + .padding(.horizontal, 9) + .padding(.vertical, 8) + .frame(width: 104) + .background( + RoundedRectangle(cornerRadius: 14, style: .continuous) + .fill(Color.white.opacity(isActive ? 
0.07 : 0.02)) + .overlay( + RoundedRectangle(cornerRadius: 14, style: .continuous) + .stroke( + isActive ? tileColor.opacity(0.55) : Color.white.opacity(0.08), + lineWidth: isActive ? 1.5 : 0.5 + ) + ) + ) + .contentShape(Rectangle()) + .onTapGesture { onTap() } + .animation(.easeInOut(duration: 0.15), value: isActive) + .animation(.easeInOut(duration: 0.15), value: participant.state) + } + + private var slotBadge: some View { + Text(circledDigit(participant.gestureSlot)) + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(isActive ? tileColor : .white.opacity(0.25)) + } + + private var pauseButton: some View { + Button(action: onPause) { + Image(systemName: participant.isPaused ? "play.fill" : "pause.fill") + .font(.system(size: 7, weight: .bold)) + .foregroundColor(.white.opacity(0.45)) + .frame(width: 16, height: 16) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + + private func removeButton(action: @escaping () -> Void) -> some View { + Button(action: action) { + Image(systemName: "xmark") + .font(.system(size: 7, weight: .bold)) + .foregroundColor(.white.opacity(0.35)) + .frame(width: 16, height: 16) + .background(Circle().fill(Color.white.opacity(0.06))) + } + .buttonStyle(.plain) + } + + @ViewBuilder + private var stateLabel: some View { + if participant.isPaused { + Text("paused") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } else { + switch participant.state { + case .idle: + Text("idle") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + case .thinking: + ThinkingDotsView() + case .streaming: + Text("streaming") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(tileColor.opacity(0.7)) + case .paused: + Text("paused") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + } + + private func circledDigit(_ n: Int) -> String { + let circled = 
["①","②","③","④","⑤"] + guard n >= 1, n <= circled.count else { return "\(n)" } + return circled[n - 1] + } +} + +// MARK: - ParticipantMascotView + +struct ParticipantMascotView: View { + let kind: ParticipantKind + let state: ParticipantState + let isPaused: Bool + let tileColor: Color + + @State private var breathe = false + @State private var shimmer = false + + private var mascotIcon: String { + switch kind { + case .llama: return "brain" + case .claudeCode: return "hammer.fill" + case .connection(_, _, let icon): return icon + } + } + + private var mascotColor: Color { tileColor } + + var body: some View { + ZStack { + // Outer glow ring — pulses when streaming + Circle() + .stroke( + mascotColor.opacity(isPaused ? 0 : (state == .streaming ? 0.4 : 0.12)), + lineWidth: state == .streaming ? 2 : 1 + ) + .scaleEffect(shimmer ? 1.15 : 1.0) + .opacity(shimmer ? 0 : 1) + + // Background fill + Circle() + .fill(mascotColor.opacity(isPaused ? 0.04 : 0.12)) + + // Icon + Image(systemName: mascotIcon) + .font(.system(size: 22, weight: .medium)) + .foregroundColor(isPaused ? .gray.opacity(0.3) : mascotColor) + .scaleEffect(breathe ? 1.06 : 1.0) + + // Thinking spinner overlay + if state == .thinking && !isPaused { + Circle() + .trim(from: 0, to: 0.65) + .stroke(mascotColor.opacity(0.5), style: StrokeStyle(lineWidth: 1.5, lineCap: .round)) + .rotationEffect(.degrees(breathe ? 
360 : 0)) + .animation(.linear(duration: 0.9).repeatForever(autoreverses: false), value: breathe) + } + + // Paused badge + if isPaused { + Image(systemName: "pause.fill") + .font(.system(size: 9)) + .foregroundColor(.white.opacity(0.3)) + .offset(x: 14, y: 14) + } + } + .onAppear { animate() } + .onChange(of: state) { _ in animate() } + .onChange(of: isPaused) { _ in animate() } + } + + private func animate() { + switch state { + case .idle: + withAnimation(.easeInOut(duration: 2.4).repeatForever(autoreverses: true)) { + breathe = true + } + shimmer = false + case .thinking: + breathe = true // spinner uses this + case .streaming: + withAnimation(.easeInOut(duration: 0.6).repeatForever(autoreverses: false)) { + shimmer = true + } + withAnimation(.easeInOut(duration: 1.2).repeatForever(autoreverses: true)) { + breathe = true + } + case .paused: + breathe = false + shimmer = false + } + } +} + +// MARK: - ThinkingDotsView + +private struct ThinkingDotsView: View { + @State private var phase = 0 + + var body: some View { + HStack(spacing: 3) { + ForEach(0..<3, id: \.self) { i in + Circle() + .fill(Color.white.opacity(phase == i ? 0.7 : 0.2)) + .frame(width: 4, height: 4) + } + } + .onAppear { + Timer.scheduledTimer(withTimeInterval: 0.35, repeats: true) { _ in + phase = (phase + 1) % 3 + } + } + } +} + +// MARK: - CallFeedMessageRow + +private struct CallFeedMessageRow: View { + let participantName: String + let color: Color + let icon: String + let text: String + var imageData: Data? = nil + /// True for AI-generated narrative messages (AutoClawd reactions). + /// Rendered at reduced opacity with a dashed left bar. + var isGenerated: Bool = false + + var body: some View { + HStack(alignment: .top, spacing: 0) { + // 2px colored left bar — solid for real events, dashed-look for generated + if isGenerated { + // Dashed effect: two small rectangles with gap + VStack(spacing: 0) { + ForEach(0..<8, id: \.self) { i in + Rectangle() + .fill(color.opacity(i % 2 == 0 ? 
0.45 : 0.0)) + .frame(width: 2, height: 4) + } + } + .frame(width: 2) + .frame(maxHeight: .infinity, alignment: .top) + .padding(.top, 4) + } else { + Rectangle() + .fill(color) + .frame(width: 2) + } + + // Content + VStack(alignment: .leading, spacing: 5) { + // Header: icon + name (lowercase monospace) + generated marker + HStack(spacing: 5) { + Image(systemName: icon) + .font(.system(size: 9)) + .foregroundColor(color.opacity(isGenerated ? 0.5 : 0.7)) + Text(participantName.lowercased()) + .font(.system(size: 9, weight: .semibold, design: .monospaced)) + .foregroundColor(color.opacity(isGenerated ? 0.45 : 0.65)) + if isGenerated { + Text("~") + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(color.opacity(0.3)) + } + Spacer() + } + + // Body text (skip if empty and we have an image) + if !text.isEmpty { + Text(text) + .font(.system(size: 12, design: .monospaced)) + .foregroundColor(.white.opacity(isGenerated ? 0.55 : 0.85)) + .textSelection(.enabled) + .fixedSize(horizontal: false, vertical: true) + } + + // Inline image (Pencil screenshot, ScreenGrab, etc.) + if let data = imageData, let nsImage = NSImage(data: data) { + Image(nsImage: nsImage) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: 280, maxHeight: 200) + .cornerRadius(6) + .overlay( + RoundedRectangle(cornerRadius: 6) + .stroke(color.opacity(0.25), lineWidth: 0.5) + ) + } + } + .padding(.leading, 10) + .padding(.vertical, 8) + .padding(.trailing, 4) + } + .opacity(isGenerated ? 0.75 : 1.0) + } +} diff --git a/Sources/CallModeSession.swift b/Sources/CallModeSession.swift new file mode 100644 index 0000000..0301c00 --- /dev/null +++ b/Sources/CallModeSession.swift @@ -0,0 +1,332 @@ +import Foundation +import SwiftUI + +// MARK: - CallModeSession + +/// Direct Anthropic API conversation session for Call Mode. +/// +/// Bypasses Llama entirely — voice transcript → Claude directly. 
+/// Claude proactively calls screen/cursor/selection tools to see what the user is looking at. +/// +/// Pipeline for call mode: +/// Mic → SFSpeech/Groq transcript → send() → Anthropic messages API +/// ↑ Claude calls tools ↓ +/// ScreenGrabService.captureScreen / captureCursorContext / captureSelection +/// ↓ Claude responds +/// @Published messages → CallModeView +@MainActor +final class CallModeSession: ObservableObject { + + @Published var messages: [CallMessage] = [] + @Published var isProcessing: Bool = false + + private var history: [[String: Any]] = [] + private let screenGrab = ScreenGrabService() + private var transcriptProvider: (() -> String)? + + // MARK: - Configuration + + func configure(transcriptProvider: @escaping () -> String) { + self.transcriptProvider = transcriptProvider + } + + // MARK: - Send + + /// Send a user message (typically from voice transcript) to Claude. + /// Claude may call screen tools before responding. + func send(text: String) async { + let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) + guard !trimmed.isEmpty else { return } + + let apiKey = SettingsManager.shared.anthropicAPIKey + guard !apiKey.isEmpty else { + messages.append(CallMessage(role: .error, + text: "Anthropic API key not configured.")) + return + } + + messages.append(CallMessage(role: .user, text: trimmed)) + history.append(["role": "user", "content": trimmed]) + + isProcessing = true + defer { isProcessing = false } + + do { + let reply = try await runAgentLoop(apiKey: apiKey) + if !reply.isEmpty { + messages.append(CallMessage(role: .assistant, text: reply)) + history.append(["role": "assistant", "content": reply]) + } + } catch { + messages.append(CallMessage(role: .error, text: error.localizedDescription)) + } + } + + func clearHistory() { + messages.removeAll() + history.removeAll() + } + + /// Append a message from an external source (e.g. Claude Code via MCP autoclawd_set_canvas). 
+ func appendExternalMessage(_ text: String) { + messages.append(CallMessage(role: .external, text: text)) + } + + /// Append a message attributed to a named plugin participant (via autoclawd_send_participant_message). + func appendParticipantMessage(id: String, name: String, text: String, + imageData: Data? = nil, isGenerated: Bool = false) { + messages.append(CallMessage(role: .participant, text: text, + participantID: id, participantName: name, + imageData: imageData, isGenerated: isGenerated)) + } + + // MARK: - Agent Loop + + /// Tool-use loop: request → if tool_use → execute → continue → until end_turn. + private func runAgentLoop(apiKey: String) async throws -> String { + while true { + let body = makeRequestBody() + let response = try await callAnthropic(body: body, apiKey: apiKey) + + guard let stopReason = response["stop_reason"] as? String else { + throw CallModeError.invalidResponse + } + + let content = response["content"] as? [[String: Any]] ?? [] + + if stopReason == "end_turn" { + return content + .filter { $0["type"] as? String == "text" } + .compactMap { $0["text"] as? String } + .joined(separator: "\n") + } + + if stopReason == "tool_use" { + // Append Claude's tool-use turn to history + history.append(["role": "assistant", "content": content]) + + // Execute tool calls sequentially (each awaited in turn), collecting results + var results: [[String: Any]] = [] + for block in content where block["type"] as? String == "tool_use" { + guard let toolID = block["id"] as? String, + let toolName = block["name"] as? String + else { continue } + + let args = block["input"] as? [String: Any] ??
[:] + let output = await executeTool(name: toolName, args: args) + + // Show tool use in messages for transparency + messages.append(CallMessage( + role: .tool, + text: "[\(toolName)]" + )) + + results.append([ + "type": "tool_result", + "tool_use_id": toolID, + "content": output + ]) + } + + history.append(["role": "user", "content": results]) + continue + } + + // Unexpected stop reason — return whatever text we have + return content + .filter { $0["type"] as? String == "text" } + .compactMap { $0["text"] as? String } + .joined(separator: "\n") + } + } + + // MARK: - Tool Execution + + private func executeTool(name: String, args: [String: Any]) async -> [[String: Any]] { + switch name { + + case "get_screen": + var region: CGRect? + if let r = args["region"] as? [String: Any], + let x = r["x"] as? CGFloat, let y = r["y"] as? CGFloat, + let w = r["width"] as? CGFloat, let h = r["height"] as? CGFloat { + region = CGRect(x: x, y: y, width: w, height: h) + } + let grab = await screenGrab.captureScreen(region: region) + return imageBlocks(from: grab) + + case "get_cursor_context": + let grab = await screenGrab.captureCursorContext() + return imageBlocks(from: grab) + + case "get_selection": + let sel = await screenGrab.captureSelection() + if sel.selectedText.isEmpty && sel.contextImageJPEGData == nil { + return [["type": "text", "text": "No text currently selected."]] + } + var blocks: [[String: Any]] = [] + if !sel.selectedText.isEmpty { + blocks.append(["type": "text", + "text": "Selected text:\n\(sel.selectedText)"]) + } + if let jpeg = sel.contextImageJPEGData { + blocks.append(imageBlock(jpeg)) + } + return blocks + + case "get_audio_transcript": + let maxChars = args["max_chars"] as? Int ?? 2_000 + let transcript = transcriptProvider?() ?? "" + let trimmed = transcript.count > maxChars + ? String(transcript.suffix(maxChars)) + : transcript + return [["type": "text", + "text": trimmed.isEmpty ? "No transcript available." 
: trimmed]] + + default: + return [["type": "text", "text": "Unknown tool: \(name)"]] + } + } + + // MARK: - Content Block Helpers + + private func imageBlocks(from grab: ScreenGrab) -> [[String: Any]] { + var blocks: [[String: Any]] = [] + let textParts = [ + grab.metadata.isEmpty ? nil : grab.metadata, + grab.ocrText.isEmpty ? nil : "Screen text:\n\(grab.ocrText)" + ].compactMap { $0 } + if !textParts.isEmpty { + blocks.append(["type": "text", "text": textParts.joined(separator: "\n\n")]) + } + if let jpeg = grab.imageJPEGData { + blocks.append(imageBlock(jpeg)) + } + return blocks + } + + private func imageBlock(_ jpeg: Data) -> [String: Any] { + [ + "type": "image", + "source": [ + "type": "base64", + "media_type": "image/jpeg", + "data": jpeg.base64EncodedString() + ] + ] + } + + // MARK: - Anthropic API + + private func makeRequestBody() -> [String: Any] { + [ + "model": "claude-opus-4-6", + "max_tokens": 4096, + "system": """ + You are an AI assistant running inside AutoClawd with real-time access \ + to the user's screen and microphone. You can see their screen, read OCR text, \ + and grab screenshots. Always call get_screen at the start of a new topic to \ + orient yourself. Use get_cursor_context when the user says "this" or "here" \ + without specifying. Use get_selection whenever the user has highlighted text. \ + Be concise, direct, and action-oriented. + """, + "tools": toolDefinitions(), + "messages": history + ] + } + + private func toolDefinitions() -> [[String: Any]] { + [ + [ + "name": "get_screen", + "description": "Capture the screen with OCR text and a JPEG screenshot. 
Optionally crop to a region.", + "input_schema": [ + "type": "object", + "properties": [ + "region": [ + "type": "object", + "properties": [ + "x": ["type": "number"], + "y": ["type": "number"], + "width": ["type": "number"], + "height": ["type": "number"] + ] + ] + ] + ] as [String: Any] + ], + [ + "name": "get_cursor_context", + "description": "Capture 600×400 region around the cursor with OCR. Use when user points at something.", + "input_schema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "get_selection", + "description": "Get selected text and screenshot of selection. Use when user highlights something.", + "input_schema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "get_audio_transcript", + "description": "Get recent spoken audio transcript from the user's microphone.", + "input_schema": [ + "type": "object", + "properties": [ + "max_chars": ["type": "number"] + ] + ] as [String: Any] + ] + ] + } + + private func callAnthropic(body: [String: Any], apiKey: String) async throws -> [String: Any] { + let url = URL(string: "https://api.anthropic.com/v1/messages")! + var req = URLRequest(url: url) + req.httpMethod = "POST" + req.setValue("application/json", forHTTPHeaderField: "Content-Type") + req.setValue(apiKey, forHTTPHeaderField: "x-api-key") + req.setValue("2023-06-01", forHTTPHeaderField: "anthropic-version") + req.httpBody = try JSONSerialization.data(withJSONObject: body) + + let (data, response) = try await URLSession.shared.data(for: req) + guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { + let msg = String(data: data, encoding: .utf8) ?? "Unknown API error" + throw CallModeError.apiError(msg) + } + guard let json = try? JSONSerialization.jsonObject(with: data) as? 
[String: Any] else { + throw CallModeError.invalidResponse + } + return json + } +} + +// MARK: - Supporting Types + +struct CallMessage: Identifiable { + let id: UUID = UUID() + let role: Role + let text: String + /// Timestamp — used to show time labels in the call feed. + let createdAt: Date = Date() + /// Set when the message comes from a plugin participant (autoclawd_send_participant_message). + var participantID: String? = nil + var participantName: String? = nil + /// Inline image attached to this message (e.g. Pencil screenshot, ScreenGrab). + var imageData: Data? = nil + /// True for AI-generated narrative messages (AutoClawd reactions, fake conversation). + /// These render with a dashed/faint border vs solid for real tool events. + var isGenerated: Bool = false + + enum Role { case user, assistant, tool, error, external, participant } +} + +enum CallModeError: Error, LocalizedError { + case invalidResponse + case apiError(String) + + var errorDescription: String? { + switch self { + case .invalidResponse: return "Invalid API response from Anthropic." 
+ case .apiError(let m): return m + } + } +} diff --git a/Sources/CallRoom.swift b/Sources/CallRoom.swift new file mode 100644 index 0000000..e2c3d3c --- /dev/null +++ b/Sources/CallRoom.swift @@ -0,0 +1,183 @@ +import Foundation +import SwiftUI + +// MARK: - ParticipantKind + +enum ParticipantKind: Equatable { + case llama // AutoClawd PM — always present + case claudeCode // joins via MCP session + case connection(id: String, name: String, systemImage: String) // plugin/tool participants +} + +// MARK: - ParticipantState + +enum ParticipantState { + case idle // present, quiet + case thinking // processing (spinner) + case streaming // outputting text + case paused // muted — not receiving transcript context +} + +// MARK: - CallParticipant + +struct CallParticipant: Identifiable { + let id: String + let kind: ParticipantKind + var state: ParticipantState = .idle + var isPaused: Bool = false + var lastActivity: Date? + + var displayName: String { + switch kind { + case .llama: return "AutoClawd" + case .claudeCode: return "Claw'd" + case .connection(_, let name, _): return name + } + } + + var mascotSystemImage: String { + switch kind { + case .llama: return "brain" + case .claudeCode: return "hammer.fill" + case .connection(_, _, let icon): return icon + } + } + + /// Consistent color per participant — connections derive hue from their ID. + var tileColor: Color { + switch kind { + case .llama: return .teal + case .claudeCode: return .orange + case .connection(let id, _, _): + let hash = id.unicodeScalars.reduce(0) { ($0 &+ Int($1.value)) % 360 } + return Color(hue: Double(hash) / 360.0, saturation: 0.65, brightness: 0.95) + } + } + + /// Gesture finger slot (1-based) based on current participant order. + /// Updated externally by CallRoom when participants array changes. + var gestureSlot: Int = 1 +} + +// MARK: - CallRoom + +/// Manages the set of participants in the active call and which one the user is addressing. 
/// Llama is always participant[0] and cannot be removed.
@MainActor
final class CallRoom: ObservableObject {

    // MARK: Published State

    /// Participants in join order; index 0 is always the Llama ("AutoClawd") tile.
    @Published private(set) var participants: [CallParticipant] = []
    /// ID of the participant the user is currently addressing.
    @Published var activeParticipantID: String = "llama"

    // MARK: Init

    init() {
        var llama = CallParticipant(id: "llama", kind: .llama)
        llama.gestureSlot = 1
        participants = [llama]
    }

    // MARK: - Active Participant

    /// The participant currently being addressed, if still present.
    var activeParticipant: CallParticipant? {
        participants.first { $0.id == activeParticipantID }
    }

    /// Select participant by left-hand finger count (1-based index into participants array).
    func selectByGesture(fingerCount: Int) {
        let index = fingerCount - 1
        guard index >= 0, index < participants.count else { return }
        activeParticipantID = participants[index].id
    }

    // MARK: - Join / Leave

    /// Adds the Claude Code participant, or refreshes its activity timestamp when
    /// it is already in the room (MCP joins and hook events can race).
    func claudeCodeJoined() {
        guard !participants.contains(where: { $0.kind == .claudeCode }) else {
            // Already present — refresh lastActivity
            updateLastActivity(id: "claude-code")
            return
        }
        participants.append(CallParticipant(id: "claude-code", kind: .claudeCode, lastActivity: Date()))
        // rebuildSlots() renumbers every participant (including the new one), so no
        // manual gestureSlot bookkeeping is needed around the append.
        rebuildSlots()
    }

    func claudeCodeLeft() {
        participants.removeAll { $0.kind == .claudeCode }
        if activeParticipantID == "claude-code" { activeParticipantID = "llama" }
        rebuildSlots()
    }

    /// Adds a plugin/tool participant, or refreshes it if the ID already exists.
    func connectionJoined(id: String, name: String, systemImage: String = "cable.connector") {
        guard !participants.contains(where: { $0.id == id }) else {
            updateLastActivity(id: id)
            return
        }
        participants.append(CallParticipant(id: id, kind: .connection(id: id, name: name, systemImage: systemImage)))
        rebuildSlots()
    }

    func connectionLeft(id: String) {
        participants.removeAll { $0.id == id }
        if activeParticipantID == id { activeParticipantID = "llama" }
        rebuildSlots()
    }

    // MARK: - Pause / Resume / Remove

    /// Toggles mute for a participant; pausing forces state to `.paused`, and
    /// resuming restores `.idle` only if the participant was showing `.paused`.
    func togglePause(id: String) {
        guard let idx = participants.firstIndex(where: { $0.id == id }) else { return }
        participants[idx].isPaused.toggle()
        if participants[idx].isPaused { participants[idx].state = .paused }
        else if participants[idx].state == .paused { participants[idx].state = .idle }
    }

    /// Remove a participant (Llama cannot be removed).
    func remove(id: String) {
        guard id != "llama" else { return }
        participants.removeAll { $0.id == id }
        if activeParticipantID == id { activeParticipantID = "llama" }
        rebuildSlots()
    }

    // MARK: - State Updates

    /// Updates a participant's activity state; ignored while that participant is paused.
    func setState(_ state: ParticipantState, for id: String) {
        guard let idx = participants.firstIndex(where: { $0.id == id }) else { return }
        guard !participants[idx].isPaused else { return }
        participants[idx].state = state
        participants[idx].lastActivity = Date()
    }

    func updateLastActivity(id: String) {
        guard let idx = participants.firstIndex(where: { $0.id == id }) else { return }
        participants[idx].lastActivity = Date()
    }

    // MARK: - MCP Pause Gating

    /// True when Claude Code is in the room and NOT paused — MCP transcript is live.
    var claudeCodeIsActive: Bool {
        guard let p = participants.first(where: { $0.kind == .claudeCode }) else { return false }
        return !p.isPaused
    }

    var claudeCodeIsPresent: Bool {
        participants.contains { $0.kind == .claudeCode }
    }

    // MARK: - Helpers

    /// Renumbers gesture slots 1…n to match the current participant order.
    private func rebuildSlots() {
        for i in participants.indices {
            participants[i].gestureSlot = i + 1
        }
    }
}
diff --git a/Sources/CallStreamWidget.swift b/Sources/CallStreamWidget.swift
new file mode 100644
index 0000000..d156f6a
--- /dev/null
+++ b/Sources/CallStreamWidget.swift
@@ -0,0 +1,119 @@
+import AppKit
+import SwiftUI
+
+// MARK: - CallStreamWidget
+
+/// Always-on-top floating widget that shows the live call stream — agent tiles,
+/// camera feed, and conversation feed.
Separate from the main pill so it can +/// overlay any screen or app during a Claude Code session. +/// +/// Activated when pillMode == .callMode and the setting is enabled. +/// Draggable from anywhere; snaps to the bottom-right corner by default. +final class CallStreamWidget: NSPanel { + + static let defaultWidth: CGFloat = 420 + static let defaultHeight: CGFloat = 560 + + private var hostingView: NSHostingView? + + // Smooth drag + private var initialMouseLoc: NSPoint = .zero + private var initialOrigin: NSPoint = .zero + + init() { + super.init( + contentRect: NSRect(x: 0, y: 0, + width: Self.defaultWidth, + height: Self.defaultHeight), + styleMask: [.borderless, .nonactivatingPanel, .utilityWindow], + backing: .buffered, + defer: false + ) + configure() + } + + private func configure() { + isOpaque = false + backgroundColor = .clear + hasShadow = true + level = .floating + collectionBehavior = [.canJoinAllSpaces, .stationary, .ignoresCycle] + isMovableByWindowBackground = false + animationBehavior = .utilityWindow + + // Default position: bottom-right, 20pt inset from visible frame + if let screen = NSScreen.main { + let vf = screen.visibleFrame + let x = vf.maxX - Self.defaultWidth - 20 + let y = vf.minY + 20 + setFrameOrigin(NSPoint(x: x, y: y)) + } + } + + func setContent(_ view: V) { + let hosting = NSHostingView(rootView: AnyView(view)) + hosting.frame = contentView?.bounds ?? 
.zero + hosting.autoresizingMask = [.width, .height] + hosting.wantsLayer = true + hosting.layer?.backgroundColor = CGColor.clear + hosting.layer?.isOpaque = false + contentView = hosting + hostingView = hosting + } + + // MARK: - Dragging + + override func mouseDown(with event: NSEvent) { + initialMouseLoc = NSEvent.mouseLocation + initialOrigin = frame.origin + } + + override func mouseDragged(with event: NSEvent) { + let cur = NSEvent.mouseLocation + setFrameOrigin(NSPoint( + x: initialOrigin.x + (cur.x - initialMouseLoc.x), + y: initialOrigin.y + (cur.y - initialMouseLoc.y) + )) + } + + // MARK: - Visibility + + func show() { + orderFront(nil) + } + + func hide() { + orderOut(nil) + } + + /// Animate in from bottom (slide + fade). + func animateIn() { + alphaValue = 0 + let target = frame + var start = target + start.origin.y -= 24 + setFrame(start, display: false) + + NSAnimationContext.runAnimationGroup { ctx in + ctx.duration = 0.28 + ctx.timingFunction = CAMediaTimingFunction(name: .easeOut) + animator().setFrame(target, display: true) + animator().alphaValue = 1 + } + orderFront(nil) + } + + /// Animate out (slide + fade). + func animateOut(completion: (() -> Void)? = nil) { + NSAnimationContext.runAnimationGroup { ctx in + ctx.duration = 0.2 + ctx.timingFunction = CAMediaTimingFunction(name: .easeIn) + var end = frame; end.origin.y -= 24 + animator().setFrame(end, display: true) + animator().alphaValue = 0 + } completionHandler: { + self.orderOut(nil) + completion?() + } + } +} diff --git a/Sources/CallStreamWidgetView.swift b/Sources/CallStreamWidgetView.swift new file mode 100644 index 0000000..2977165 --- /dev/null +++ b/Sources/CallStreamWidgetView.swift @@ -0,0 +1,719 @@ +import SwiftUI +import AppKit + +// MARK: - CallStreamWidgetView +// +// Brutalist design language — no circles, no soft rounding, sharp geometric. +// The feed is a story, not a log. 
Sections: +// +// ┌─ HEADER: CALL STREAM + timer + close ─────────────────┐ +// │ MISSION: user's goal (first spoken message) │ +// ├─ PARTICIPANTS: rectangular tiles, thick top accent ────┤ +// │ TASKS: pending todo queue (top 3) │ +// ├─ STREAM ──────────────────────────────────────────────┤ +// │ Group chat — NAME ─── time / message / image │ +// ├─ SPOTLIGHT: current file or image (auto-shown) ────────┤ +// │ END CALL bar │ +// └────────────────────────────────────────────────────────┘ + +struct CallStreamWidgetView: View { + @ObservedObject var appState: AppState + let onClose: () -> Void + + @State private var sessionSeconds: Int = 0 + @State private var spotlightImage: NSImage? = nil + @State private var spotlightFile: String? = nil + + private let sessionTimer = Timer.publish(every: 1, on: .main, in: .common).autoconnect() + + // Palette + private let bg = Color(red: 0.067, green: 0.067, blue: 0.067) + private let surf = Color(red: 0.102, green: 0.102, blue: 0.102) + private let border = Color.white.opacity(0.07) + + var body: some View { + VStack(spacing: 0) { + header + rowDivider + + if let goal = missionGoal { + missionBar(goal) + rowDivider + } + + participantStrip + rowDivider + + let tasks = pendingTasks + if !tasks.isEmpty { + taskSection(tasks) + rowDivider + } + + streamHeader + streamFeed + + if spotlightImage != nil || spotlightFile != nil { + rowDivider + spotlightPanel + } + + rowDivider + bottomBar + } + .frame(width: 420) + .background(bg) + .clipShape(RoundedRectangle(cornerRadius: 3, style: .continuous)) + .overlay( + RoundedRectangle(cornerRadius: 3, style: .continuous) + .stroke(Color.white.opacity(0.09), lineWidth: 0.5) + ) + .shadow(color: .black.opacity(0.65), radius: 36, y: 16) + .shadow(color: .black.opacity(0.20), radius: 6, y: 2) + .onReceive(sessionTimer) { _ in sessionSeconds += 1 } + .onChange(of: appState.callModeSession.messages.count) { _ in + updateSpotlight() + } + } + + // MARK: - Header + + private var header: some View 
{ + HStack(spacing: 10) { + // REC dot + Circle() + .fill(Color.red) + .frame(width: 6, height: 6) + .shadow(color: .red.opacity(0.9), radius: 5) + + Text("CALL STREAM") + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.45)) + .tracking(2) + + Spacer() + + Text(formatDuration(sessionSeconds)) + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + .monospacedDigit() + + closeButton + } + .padding(.horizontal, 14) + .padding(.vertical, 9) + .background(surf) + } + + private var closeButton: some View { + Button(action: onClose) { + Image(systemName: "xmark") + .font(.system(size: 8, weight: .semibold)) + .foregroundColor(.white.opacity(0.3)) + .frame(width: 18, height: 18) + .background(Rectangle().fill(.white.opacity(0.07))) + } + .buttonStyle(.plain) + } + + // MARK: - Mission + + private var missionGoal: String? { + appState.callModeSession.messages.first(where: { $0.role == .user })?.text + } + + private func missionBar(_ goal: String) -> some View { + HStack(alignment: .top, spacing: 12) { + Rectangle() + .fill(Color.orange) + .frame(width: 2) + .frame(maxHeight: .infinity) + + VStack(alignment: .leading, spacing: 3) { + sectionLabel("MISSION") + Text(goal) + .font(.system(size: 11)) + .foregroundColor(.white.opacity(0.72)) + .lineLimit(3) + } + Spacer() + } + .padding(.horizontal, 14) + .padding(.vertical, 9) + .fixedSize(horizontal: false, vertical: true) + } + + // MARK: - Participant strip (brutalist — no circles) + + private var participantStrip: some View { + HStack(spacing: 1) { + userTile + ForEach(appState.callRoom.participants) { p in + ParticipantBrutalistTile( + participant: p, + isActive: p.id == appState.callRoom.activeParticipantID, + onTap: { appState.callRoom.activeParticipantID = p.id } + ) + } + } + .frame(maxWidth: .infinity, minHeight: 76) + .background(surf) + } + + private var userTile: some View { + VStack(spacing: 5) { + ZStack { + Rectangle() + 
.fill(.white.opacity(0.05)) + .frame(width: 32, height: 32) + if appState.cameraEnabled && appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(width: 32, height: 32) + .clipped() + } else { + Image(systemName: "person.fill") + .font(.system(size: 13)) + .foregroundColor(.white.opacity(0.35)) + } + } + + Text("YOU") + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + .tracking(1) + + // Mic bars + HStack(spacing: 2) { + ForEach(0..<3, id: \.self) { _ in + Rectangle() + .fill(Color.white.opacity(appState.isListening ? 0.5 : 0.1)) + .frame(width: 2, height: 5) + } + } + } + .frame(maxWidth: .infinity) + .frame(height: 76) + .overlay(alignment: .top) { + Rectangle().fill(.white.opacity(0.2)).frame(height: 2) + } + } + + // MARK: - Tasks + + private var pendingTasks: [StructuredTodo] { + Array(appState.structuredTodos.filter { !$0.isExecuted }.prefix(3)) + } + + private func taskSection(_ tasks: [StructuredTodo]) -> some View { + VStack(spacing: 0) { + HStack { + sectionLabel("TASKS") + .padding(.horizontal, 14) + .padding(.vertical, 7) + Spacer() + Text("\(tasks.count) pending") + .font(.system(size: 7, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + .padding(.trailing, 14) + } + .background(surf) + + rowDivider + + VStack(spacing: 0) { + ForEach(Array(tasks.enumerated()), id: \.element.id) { i, todo in + HStack(spacing: 10) { + Rectangle() + .fill(i == 0 ? Color.orange : .white.opacity(0.12)) + .frame(width: 2, height: 14) + Image(systemName: i == 0 ? "arrow.right" : "circle") + .font(.system(size: 8)) + .foregroundColor(i == 0 ? .orange : .white.opacity(0.2)) + Text(todo.content) + .font(.system(size: 10)) + .foregroundColor(i == 0 ? .white.opacity(0.82) : .white.opacity(0.35)) + .lineLimit(1) + Spacer() + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(i == 0 ? 
Color.orange.opacity(0.04) : Color.clear) + + if i < tasks.count - 1 { + Rectangle() + .fill(border) + .frame(maxWidth: .infinity, maxHeight: 0.5) + .padding(.leading, 14) + } + } + } + } + } + + // MARK: - Stream + + private var streamHeader: some View { + HStack { + sectionLabel("STREAM") + Spacer() + if appState.callModeSession.isProcessing { + HStack(spacing: 5) { + ProgressView().controlSize(.mini).tint(.teal) + Text("thinking...") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.teal.opacity(0.65)) + } + } + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(surf) + } + + private var streamFeed: some View { + ScrollViewReader { proxy in + ScrollView { + LazyVStack(spacing: 0) { + ForEach(appState.callModeSession.messages) { msg in + BrutalistChatMessage( + msg: msg, + color: participantColor(for: msg), + name: participantName(for: msg) + ) + .id(msg.id) + } + + // Live user speech + if !appState.liveTranscriptText.isEmpty { + BrutalistChatMessage( + msg: CallMessage( + role: .user, + text: appState.liveTranscriptText, + participantName: "you" + ), + color: .white, + name: "YOU" + ) + .id("live") + } + + // Empty state + if appState.callModeSession.messages.isEmpty && appState.liveTranscriptText.isEmpty { + VStack(spacing: 8) { + Rectangle() + .fill(.white.opacity(0.04)) + .frame(width: 1, height: 32) + .padding(.top, 24) + Text("speak or start a claude code session") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.14)) + Rectangle() + .fill(.white.opacity(0.04)) + .frame(width: 1, height: 24) + } + .frame(maxWidth: .infinity) + .padding(.bottom, 24) + } + } + } + .frame(maxHeight: .infinity) + .onChange(of: appState.callModeSession.messages.count) { _ in + withAnimation(.easeOut(duration: 0.2)) { + if let last = appState.callModeSession.messages.last { + proxy.scrollTo(last.id, anchor: .bottom) + } + } + } + .onChange(of: appState.liveTranscriptText) { _ in + withAnimation { 
proxy.scrollTo("live", anchor: .bottom) } + } + } + } + + // MARK: - Spotlight + + @ViewBuilder + private var spotlightPanel: some View { + VStack(spacing: 0) { + HStack(spacing: 7) { + Rectangle() + .fill(Color.yellow.opacity(0.6)) + .frame(width: 2, height: 10) + sectionLabel("SPOTLIGHT") + Spacer() + Button(action: { withAnimation { spotlightImage = nil; spotlightFile = nil } }) { + Image(systemName: "xmark") + .font(.system(size: 7)) + .foregroundColor(.white.opacity(0.2)) + } + .buttonStyle(.plain) + .padding(.trailing, 14) + } + .padding(.horizontal, 14) + .padding(.vertical, 7) + .background(surf) + + rowDivider + + if let img = spotlightImage { + Image(nsImage: img) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: 190) + .background(Color.black) + } else if let file = spotlightFile { + HStack(spacing: 12) { + Rectangle() + .fill(Color.yellow.opacity(0.08)) + .frame(width: 32, height: 32) + .overlay( + Image(systemName: fileIcon(file)) + .font(.system(size: 14)) + .foregroundColor(.yellow.opacity(0.55)) + ) + + VStack(alignment: .leading, spacing: 3) { + Text(fileName(file)) + .font(.system(size: 11, design: .monospaced)) + .foregroundColor(.white.opacity(0.75)) + Text(fileCategory(file)) + .font(.system(size: 9, design: .monospaced)) + .foregroundColor(.white.opacity(0.28)) + } + + Spacer() + + Text("reading") + .font(.system(size: 8, weight: .semibold, design: .monospaced)) + .foregroundColor(.orange) + .padding(.horizontal, 7) + .padding(.vertical, 4) + .background(Rectangle().fill(Color.orange.opacity(0.10))) + } + .padding(.horizontal, 14) + .padding(.vertical, 10) + } + } + .transition(.asymmetric( + insertion: .move(edge: .bottom).combined(with: .opacity), + removal: .opacity + )) + .animation(.easeOut(duration: 0.2), value: spotlightFile) + .animation(.easeOut(duration: 0.2), value: spotlightImage != nil) + } + + // MARK: - Bottom bar + + private var bottomBar: some View { + HStack(spacing: 14) { + // Waveform 
+ HStack(spacing: 2) { + ForEach(0..<10, id: \.self) { i in + Rectangle() + .fill(Color.green.opacity(appState.isListening ? 0.7 : 0.18)) + .frame(width: 2, height: waveH(i)) + .animation(.easeInOut(duration: 0.1), value: appState.audioLevel) + } + } + .frame(width: 30) + + Spacer() + + if !appState.callModeSession.messages.isEmpty { + Text("\(appState.callModeSession.messages.count) events") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + } + + // End call — brutalist: red square + Button { + appState.stopListening() + appState.pillMode = .ambientIntelligence + } label: { + HStack(spacing: 6) { + Rectangle() + .fill(.white) + .frame(width: 7, height: 7) + Text("END CALL") + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .tracking(1) + } + .foregroundColor(.white) + .padding(.horizontal, 12) + .padding(.vertical, 7) + .background(Rectangle().fill(Color.red.opacity(0.85))) + } + .buttonStyle(.plain) + } + .padding(.horizontal, 14) + .padding(.vertical, 10) + .background(surf) + } + + // MARK: - Shared helpers + + private var rowDivider: some View { + Rectangle() + .fill(border) + .frame(maxWidth: .infinity, maxHeight: 1) + } + + private func sectionLabel(_ text: String) -> some View { + Text(text) + .font(.system(size: 8, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.25)) + .tracking(2) + } + + private func participantColor(for msg: CallMessage) -> Color { + if let pid = msg.participantID, + let p = appState.callRoom.participants.first(where: { $0.id == pid }) { + return p.tileColor + } + switch msg.role { + case .user: return .white + case .assistant: return .teal + case .external: return .orange + case .tool: return .yellow + case .error: return .red + case .participant: return .purple + } + } + + private func participantName(for msg: CallMessage) -> String { + if let name = msg.participantName { return name.uppercased() } + switch msg.role { + case .user: return "YOU" + case 
.assistant: return "AUTOCLAWD" + case .external: return "CLAWD" + case .tool: return "TOOL" + case .error: return "ERROR" + case .participant: return "AGENT" + } + } + + // MARK: - Spotlight helpers + + private func updateSpotlight() { + let msgs = appState.callModeSession.messages + for msg in msgs.suffix(6).reversed() { + if let data = msg.imageData, let img = NSImage(data: data) { + withAnimation { spotlightImage = img; spotlightFile = nil } + return + } + } + for msg in msgs.suffix(6).reversed() { + if let file = extractFilename(from: msg.text) { + withAnimation { spotlightFile = file; spotlightImage = nil } + return + } + } + } + + private func extractFilename(from text: String) -> String? { + // Matches things like "Sources/Foo.swift" or "foo.ts" etc. + let pat = #"[\w\-./]+\.(swift|ts|tsx|py|js|json|md|yaml|yml|sh|go|rs|kt|css|html)"# + guard let range = text.range(of: pat, options: .regularExpression), + text[range].count > 5 else { return nil } + return String(text[range]) + } + + private func fileIcon(_ path: String) -> String { + switch URL(fileURLWithPath: path).pathExtension.lowercased() { + case "swift": return "swift" + case "ts", "tsx": return "t.square" + case "py": return "terminal" + case "js": return "j.square" + case "json": return "curlybraces" + case "md": return "text.alignleft" + case "sh": return "terminal.fill" + default: return "doc.text" + } + } + + private func fileName(_ path: String) -> String { URL(fileURLWithPath: path).lastPathComponent } + + private func fileCategory(_ path: String) -> String { + switch URL(fileURLWithPath: path).pathExtension.lowercased() { + case "swift": return "swift source" + case "ts", "tsx": return "typescript" + case "py": return "python" + case "js": return "javascript" + case "json": return "configuration" + case "md": return "documentation" + default: return "source file" + } + } + + private func waveH(_ i: Int) -> CGFloat { + guard appState.isListening else { return 3 } + let ph = Double(i) * 0.8 + 
return 3 + (sin(ph + Double(appState.audioLevel) * 6) * 0.5 + 0.5) + * CGFloat(appState.audioLevel) * 12 + } + + private func formatDuration(_ sec: Int) -> String { + String(format: "%d:%02d", sec / 60, sec % 60) + } +} + +// MARK: - ParticipantBrutalistTile + +private struct ParticipantBrutalistTile: View { + let participant: CallParticipant + let isActive: Bool + let onTap: () -> Void + + private var color: Color { participant.tileColor } + + var body: some View { + Button(action: onTap) { + VStack(spacing: 5) { + // Square icon/mascot — NO circles + ZStack { + Rectangle() + .fill(color.opacity(isActive ? 0.12 : 0.06)) + .frame(width: 34, height: 34) + + if let ns = NSImage(named: "mascot-\(participant.id)") { + Image(nsImage: ns) + .resizable() + .scaledToFit() + .frame(width: 24, height: 24) + } else { + Image(systemName: participant.mascotSystemImage) + .font(.system(size: 14, weight: .medium)) + .foregroundColor(participant.isPaused ? .gray.opacity(0.3) : color) + } + + // Activity border (square pulse instead of circle) + if participant.state == .streaming || participant.state == .thinking { + Rectangle() + .stroke(color.opacity(0.55), lineWidth: 1) + .frame(width: 38, height: 38) + } + } + + Text(participant.displayName.uppercased()) + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(isActive ? color : .white.opacity(0.28)) + .tracking(0.5) + .lineLimit(1) + + Text(stateText) + .font(.system(size: 7, design: .monospaced)) + .foregroundColor(stateColor.opacity(0.7)) + } + .frame(maxWidth: .infinity) + .frame(height: 76) + .background(isActive ? color.opacity(0.055) : Color.clear) + // Thick top accent bar + .overlay(alignment: .top) { + Rectangle() + .fill(color) + .frame(height: isActive ? 3 : 1) + .opacity(isActive ? 
1.0 : 0.3) + } + // Side/bottom border + .overlay( + Rectangle() + .stroke(Color.white.opacity(0.05), lineWidth: 0.5) + ) + } + .buttonStyle(.plain) + } + + private var stateText: String { + if participant.isPaused { return "paused" } + switch participant.state { + case .idle: return "idle" + case .thinking: return "thinking" + case .streaming: return "working" + case .paused: return "paused" + } + } + + private var stateColor: Color { + switch participant.state { + case .streaming: return color + case .thinking: return .yellow + default: return .white.opacity(0.3) + } + } +} + +// MARK: - BrutalistChatMessage + +private struct BrutalistChatMessage: View { + let msg: CallMessage + let color: Color + let name: String + + private static let timeFmt: DateFormatter = { + let f = DateFormatter() + f.dateFormat = "H:mm" + return f + }() + + var body: some View { + VStack(alignment: .leading, spacing: 0) { + // NAME ─────────────── time + HStack(spacing: 8) { + Text(name) + .font(.system(size: 9, weight: .bold, design: .monospaced)) + .foregroundColor(color.opacity(msg.isGenerated ? 0.55 : 1.0)) + + if msg.isGenerated { + Text("~") + .font(.system(size: 9, design: .monospaced)) + .italic() + .foregroundColor(color.opacity(0.4)) + } + + Rectangle() + .fill(color.opacity(msg.isGenerated ? 0.12 : 0.22)) + .frame(maxWidth: .infinity, maxHeight: 1) + + Text(Self.timeFmt.string(from: msg.createdAt)) + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.18)) + .monospacedDigit() + } + .padding(.horizontal, 14) + .padding(.top, 12) + + // Message text + if !msg.text.isEmpty { + Text(msg.text) + .font(.system(size: 12)) + .foregroundColor(.white.opacity(msg.isGenerated ? 0.50 : 0.82)) + .fixedSize(horizontal: false, vertical: true) + .textSelection(.enabled) + .padding(.horizontal, 14) + .padding(.top, 5) + .padding(.bottom, msg.imageData != nil ? 6 : 12) + } + + // Inline image (Pencil screenshot, ScreenGrab, etc.) 
+ if let data = msg.imageData, let ns = NSImage(data: data) { + Image(nsImage: ns) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: 200) + .background(Color.black) + .overlay( + Rectangle() + .stroke(color.opacity(0.22), lineWidth: 0.5) + ) + .padding(.horizontal, 14) + .padding(.bottom, 12) + } + + // Bottom rule + Rectangle() + .fill(Color.white.opacity(0.04)) + .frame(maxWidth: .infinity, maxHeight: 1) + } + .opacity(msg.isGenerated ? 0.70 : 1.0) + } +} diff --git a/Sources/CameraPreviewView.swift b/Sources/CameraPreviewView.swift index 480c171..fb463c0 100644 --- a/Sources/CameraPreviewView.swift +++ b/Sources/CameraPreviewView.swift @@ -187,6 +187,7 @@ struct CameraFeedWidget: View { Image(decorative: image, scale: 1.0) .resizable() .aspectRatio(contentMode: .fill) + .frame(maxWidth: .infinity, maxHeight: .infinity) .clipShape(RoundedRectangle(cornerRadius: 16, style: .continuous)) } else { VStack(spacing: 6) { diff --git a/Sources/ChunkManager.swift b/Sources/ChunkManager.swift index fb41b54..9f7a17e 100644 --- a/Sources/ChunkManager.swift +++ b/Sources/ChunkManager.swift @@ -465,6 +465,15 @@ final class ChunkManager: ObservableObject { } Log.info(.pipeline, "Chunk \(index) [sess:\(label)]: raw text accumulated (pipeline deferred to session end)") + case .callMode: + // Bypass all Llama stages — send directly to Claude via CallModeSession. 
+            let chunk = transcript
+            await MainActor.run {
+                guard let appState = self.appState else { return }
+                Task { await appState.callModeSession.send(text: chunk) }
+            }
+            Log.info(.pipeline, "Chunk \(index) [call]: forwarded to CallModeSession")
+
         case .aiSearch:
             guard let qaService, let qaStore else { break }
             do {
diff --git a/Sources/HookNarrationService.swift b/Sources/HookNarrationService.swift
new file mode 100644
index 0000000..1ee76e6
--- /dev/null
+++ b/Sources/HookNarrationService.swift
@@ -0,0 +1,320 @@
+import Foundation
+
+// MARK: - HookEvent
+
+/// A parsed Claude Code hook event (PostToolUse, Stop, PreToolUse, etc.).
+struct HookEvent {
+    let eventName: String            // "PostToolUse", "Stop", "PreToolUse", …
+    let toolName: String?            // e.g. "mcp__pencil__screenshot"; nil when the event has no tool
+    let toolInput: [String: Any]?    // raw tool_input payload, if present
+    let toolResponse: [String: Any]? // raw tool_response payload, if present
+    let sessionID: String?
+    let rawJSON: [String: Any]       // the full, unparsed hook payload
+
+    /// True when the hook signals the session has finished.
+    var isStop: Bool { eventName == "Stop" }
+
+    /// True when the hook signals a tool is about to run (pre-tool).
+    var isPreTool: Bool { eventName == "PreToolUse" }
+}
+
+// MARK: - ToolParticipant
+
+/// A tool (MCP server) that is used as a participant in the call room feed.
+struct ToolParticipant {
+    let id: String          // e.g. "pencil"
+    let name: String        // e.g. "Pencil"
+    let systemImage: String // SF symbol
+}
+
+// MARK: - NarrationBundle
+
+/// Rich narration package returned for each hook event.
+/// Contains everything needed to post a multi-message "conversation" in the call feed.
+struct NarrationBundle {
+    /// What Claw'd says (real, solid-border message).
+    let narration: String
+    /// The tool participant derived from the MCP tool name (nil for built-in tools).
+    let toolParticipant: ToolParticipant?
+    /// Short summary of the tool's response text (optional).
+    let toolResponseText: String?
+    /// Image bytes extracted from the tool response (e.g. Pencil screenshot).
+    let imageData: Data?
+ /// AutoClawd's generated reaction — rendered as a dashed/faint generated message. + let autoClawdReaction: String? +} + +// MARK: - HookNarrationService + +/// Translates raw Claude Code hook events into NarrationBundles for the call-room feed. +/// +/// Pipeline: +/// 1. Parse the raw JSON from `/hook` into a `HookEvent`. +/// 2. Extract tool participant (for MCP tools like `mcp__pencil__*`). +/// 3. Extract image data from the tool response if present. +/// 4. `narrate(_:)` calls Ollama Llama for a short narration; falls back to template. +/// 5. Optionally generates an AutoClawd reaction via a second Ollama call. +/// +/// The bundle is then used by AppDelegate to auto-join tool participants and post +/// a two-sided conversation in the feed (Claw'd speaks, tool responds, AutoClawd reacts). +final class HookNarrationService: @unchecked Sendable { + + private let ollama = OllamaService() + + // MARK: - Parse + + /// Parse raw hook JSON into a HookEvent. + static func parse(_ json: [String: Any]) -> HookEvent { + HookEvent( + eventName: json["hook_event_name"] as? String + ?? json["type"] as? String + ?? "Unknown", + toolName: json["tool_name"] as? String, + toolInput: json["tool_input"] as? [String: Any], + toolResponse: json["tool_response"] as? [String: Any], + sessionID: json["session_id"] as? String, + rawJSON: json + ) + } + + // MARK: - Narrate + + /// Returns a NarrationBundle describing what just happened. 
+ func narrate(_ event: HookEvent) async -> NarrationBundle { + if event.isStop { + return NarrationBundle( + narration: "Claw'd finished the task.", + toolParticipant: nil, + toolResponseText: nil, + imageData: nil, + autoClawdReaction: nil + ) + } + + let tp = HookNarrationService.toolParticipant(from: event.toolName) + let imageData = HookNarrationService.extractImageData(from: event.toolResponse) + let summary = templateSummary(event) + + // Ask Llama for a casual narration sentence + var narration = summary + do { + let prompt = """ + You are a narrator watching an AI coding assistant named "Claw'd" work. \ + In one short, casual sentence (10–15 words max), narrate what it just did. \ + Be specific. Never start with "The AI" — always use "Claw'd". \ + Don't add quotes around the sentence. + + Event: \(summary) + + Narration: + """ + var llm = try await ollama.generate(prompt: prompt, numPredict: 60) + llm = llm.components(separatedBy: "\n").first ?? llm + llm = llm + .trimmingCharacters(in: .whitespacesAndNewlines) + .trimmingCharacters(in: CharacterSet(charactersIn: "\"'")) + if !llm.isEmpty { narration = llm } + } catch {} + + // Generate AutoClawd reaction for interesting MCP events or image results + var reaction: String? = nil + if tp != nil || imageData != nil { + reaction = await generateAutoClawdReaction(for: narration, hasImage: imageData != nil) + } + + return NarrationBundle( + narration: narration, + toolParticipant: tp, + toolResponseText: nil, + imageData: imageData, + autoClawdReaction: reaction + ) + } + + // MARK: - Tool Participant Extraction + + /// Parses `mcp__server__tool` tool names and returns a ToolParticipant. + static func toolParticipant(from toolName: String?) -> ToolParticipant? 
{ + guard let name = toolName, name.hasPrefix("mcp__") else { return nil } + let parts = name.components(separatedBy: "__") + guard parts.count >= 2 else { return nil } + let serverID = parts[1] + return ToolParticipant( + id: serverID, + name: serverID.capitalized, + systemImage: systemImageForServer(serverID) + ) + } + + private static func systemImageForServer(_ server: String) -> String { + switch server { + case "pencil": return "paintpalette" + case "figma": return "rectangle.3.group" + case "github": return "cat" + case "linear": return "square.and.pencil" + case "notion": return "doc.text" + case "googlesheets": return "tablecells" + default: return "cable.connector" + } + } + + // MARK: - Image Extraction + + /// Extracts raw JPEG/PNG bytes from a tool_response JSON payload. + /// Handles several content formats used by MCP tools. + static func extractImageData(from toolResponse: [String: Any]?) -> Data? { + guard let resp = toolResponse else { return nil } + + // Pattern 1: {"type": "image", "data": "base64..."} (direct) + if resp["type"] as? String == "image", + let dataStr = resp["data"] as? String, + let d = Data(base64Encoded: dataStr) { + return d + } + + // Pattern 2: {"content": [{"type": "image", "data": "..."}]} + if let content = resp["content"] as? [[String: Any]] { + for item in content { + if item["type"] as? String == "image" { + if let dataStr = item["data"] as? String, + let d = Data(base64Encoded: dataStr) { return d } + if let source = item["source"] as? [String: Any], + let dataStr = source["data"] as? String, + let d = Data(base64Encoded: dataStr) { return d } + } + } + } + + // Pattern 3: {"result": [{"type": "image", "source": {"data": "..."}}]} + if let result = resp["result"] as? [[String: Any]] { + for item in result { + if item["type"] as? String == "image", + let source = item["source"] as? [String: Any], + let dataStr = source["data"] as? 
String, + let d = Data(base64Encoded: dataStr) { return d } + } + } + + return nil + } + + // MARK: - AutoClawd Reaction + + private func generateAutoClawdReaction(for narration: String, hasImage: Bool) async -> String? { + let imageHint = hasImage ? " A screenshot or image was returned." : "" + let prompt = """ + You are AutoClawd, an AI project manager watching Claw'd (your coding AI) work. + In one casual, short sentence (8–12 words), react to what just happened.\(imageHint) + Be observant — sometimes curious, sometimes encouraging, sometimes dry. + Never start with "I". Don't add quotes. + + What happened: \(narration) + + Your reaction: + """ + do { + var reaction = try await ollama.generate(prompt: prompt, numPredict: 40) + reaction = reaction.components(separatedBy: "\n").first ?? reaction + reaction = reaction + .trimmingCharacters(in: .whitespacesAndNewlines) + .trimmingCharacters(in: CharacterSet(charactersIn: "\"'")) + return reaction.isEmpty ? nil : reaction + } catch { + return nil + } + } + + // MARK: - Template Fallback + + private func templateSummary(_ event: HookEvent) -> String { + guard let tool = event.toolName else { + return "Working…" + } + let input = event.toolInput ?? [:] + + switch tool { + + case "Read": + if let path = input["file_path"] as? String { + return "Reading \(fileName(path))" + } + return "Reading a file" + + case "Write": + if let path = input["file_path"] as? String { + return "Writing \(fileName(path))" + } + return "Writing a file" + + case "Edit": + if let path = input["file_path"] as? String { + return "Editing \(fileName(path))" + } + return "Editing a file" + + case "MultiEdit": + if let path = input["file_path"] as? String { + return "Multi-editing \(fileName(path))" + } + return "Applying multiple edits" + + case "Bash": + if let cmd = input["command"] as? String { + let short = String(cmd.prefix(50)) + return "Running: \(short)\(cmd.count > 50 ? 
"…" : "")" + } + return "Running a shell command" + + case "Glob": + if let pattern = input["pattern"] as? String { + return "Searching for files matching '\(pattern)'" + } + return "Searching files" + + case "Grep": + if let pattern = input["pattern"] as? String { + return "Searching code for '\(String(pattern.prefix(40)))'" + } + return "Searching in files" + + case "Task": + if let desc = input["description"] as? String { + return "Spawning sub-agent: \(String(desc.prefix(40)))" + } + return "Launching a sub-agent" + + case "WebFetch": + if let urlStr = input["url"] as? String, + let host = URL(string: urlStr)?.host { + return "Fetching \(host)" + } + return "Fetching a web page" + + case "WebSearch": + if let query = input["query"] as? String { + return "Searching the web for '\(String(query.prefix(40)))'" + } + return "Searching the web" + + case "TodoWrite": + return "Updating the task list" + + case "NotebookEdit": + return "Editing a notebook cell" + + default: + // MCP tools: mcp__server__action -> "Using Pencil: batch_design" + if tool.hasPrefix("mcp__") { + let parts = tool.components(separatedBy: "__") + if parts.count >= 3 { + return "Using \(parts[1].capitalized): \(parts[2...].joined(separator: "_"))" + } + } + return "Using \(tool)" + } + } + + private func fileName(_ path: String) -> String { + URL(fileURLWithPath: path).lastPathComponent + } +} diff --git a/Sources/MCPConfigManager.swift b/Sources/MCPConfigManager.swift index 52efb2c..64b40d0 100644 --- a/Sources/MCPConfigManager.swift +++ b/Sources/MCPConfigManager.swift @@ -33,6 +33,54 @@ enum MCPConfigManager { } } + // MARK: - Claude Code Hooks Config + + /// Write (or merge) AutoClawd hook entries into ~/.claude/settings.json. + /// + /// Registers `PostToolUse` and `Stop` hooks that curl the AutoClawd hook + /// endpoint on every Claude Code tool event. This lets AutoClawd narrate + /// what Claude Code is doing in real time inside the call room feed. 
+ /// + /// Safe to call repeatedly — only the `hooks` key is overwritten; all + /// other existing settings are preserved. + static func writeHooksConfig() { + let settingsURL = FileManager.default.homeDirectoryForCurrentUser + .appendingPathComponent(".claude") + .appendingPathComponent("settings.json") + + // Load existing settings or start fresh + var settings: [String: Any] = [:] + if let data = try? Data(contentsOf: settingsURL), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any] { + settings = json + } + + let hookCommand = "curl -s -X POST http://localhost:7892/hook -H 'Content-Type: application/json' -d @-" + let hookEntry: [String: Any] = ["type": "command", "command": hookCommand] + let hookGroup: [String: Any] = ["matcher": "", "hooks": [hookEntry]] + + var hooks = settings["hooks"] as? [String: Any] ?? [:] + hooks["PostToolUse"] = [hookGroup] + hooks["Stop"] = [hookGroup] + settings["hooks"] = hooks + + do { + try FileManager.default.createDirectory( + at: settingsURL.deletingLastPathComponent(), + withIntermediateDirectories: true, + attributes: nil + ) + let data = try JSONSerialization.data( + withJSONObject: settings, + options: [.prettyPrinted, .sortedKeys] + ) + try data.write(to: settingsURL, options: .atomic) + Log.info(.system, "MCPConfigManager: hooks written to \(settingsURL.path)") + } catch { + Log.warn(.system, "MCPConfigManager: failed to write hooks config — \(error)") + } + } + private static func findMCPBinary() -> String? { // 1. Inside the app bundle (distribution) let bundleBinary = Bundle.main.bundleURL diff --git a/Sources/MCPServer.swift b/Sources/MCPServer.swift new file mode 100644 index 0000000..2694d2e --- /dev/null +++ b/Sources/MCPServer.swift @@ -0,0 +1,539 @@ +import Foundation +import Network + +// MARK: - MCPServer + +/// Embeds a lightweight HTTP MCP server into AutoClawd so any Claude Code session +/// can call screen-grab tools directly — without a separate process. 
+/// +/// Configure Claude Code once (in ~/.claude/mcp.json or .mcp.json in project root): +/// ```json +/// { +/// "mcpServers": { +/// "autoclawd": { +/// "type": "http", +/// "url": "http://localhost:7892/mcp" +/// } +/// } +/// } +/// ``` +/// +/// Available tools: +/// autoclawd_get_screen — full screen or region: OCR text + JPEG screenshot +/// autoclawd_get_cursor_context — 600×400 crop around cursor: OCR + AX element +/// autoclawd_get_selection — currently selected text + selection screenshot +/// autoclawd_get_audio_transcript— rolling mic transcript buffer +/// +/// Transport: HTTP/1.1, JSON-RPC 2.0, single-response (no SSE needed for these tools). +final class MCPServer: @unchecked Sendable { + + static let shared = MCPServer() + + let port: UInt16 = 7892 + + private var listener: NWListener? + private var screenGrab: ScreenGrabService? + /// Called on @MainActor — returns the current session transcript text. + private var transcriptProvider: (@MainActor () -> String)? + /// Called on @MainActor — true if Claude Code participant is paused; gates transcript. + private var isPausedProvider: (@MainActor () -> Bool)? + /// Called on @MainActor — pushes text to the call mode canvas. + private var canvasWriter: (@MainActor (String) -> Void)? + /// Fired on @MainActor when a Claude Code session calls `initialize`. + private var onJoined: (@MainActor () -> Void)? + /// Fired on @MainActor when no MCP activity for `leaveTimeoutSeconds`. + private var onLeft: (@MainActor () -> Void)? + /// Invite a plugin/tool as a call participant: (id, name, systemImage). + private var onInviteParticipant: (@MainActor (String, String, String) -> Void)? + /// Set a participant's state: (id, stateString). + private var onSetParticipantState: (@MainActor (String, String) -> Void)? + /// Append a feed message attributed to a participant: (id, name, text). + private var onParticipantMessage: (@MainActor (String, String, String) -> Void)? 
+ /// Remove a participant from the call: (id). + private var onRemoveParticipant: (@MainActor (String) -> Void)? + /// Fired on @MainActor when a Claude Code hook event arrives on POST /hook. + private var onHookEvent: (@MainActor (HookEvent) -> Void)? + + /// Last time any MCP request was received from a Claude Code session. + private var lastActivityDate: Date? + private var leaveTimer: Timer? + private let leaveTimeoutSeconds: TimeInterval = 60 + + // MARK: - Lifecycle + + /// Start the server. Idempotent — safe to call multiple times. + func start(screenGrab: ScreenGrabService, + transcriptProvider: @escaping @MainActor () -> String, + isPausedProvider: (@MainActor () -> Bool)? = nil, + canvasWriter: (@MainActor (String) -> Void)? = nil, + onJoined: (@MainActor () -> Void)? = nil, + onLeft: (@MainActor () -> Void)? = nil, + onInviteParticipant: (@MainActor (String, String, String) -> Void)? = nil, + onSetParticipantState: (@MainActor (String, String) -> Void)? = nil, + onParticipantMessage: (@MainActor (String, String, String) -> Void)? = nil, + onRemoveParticipant: (@MainActor (String) -> Void)? = nil, + onHookEvent: (@MainActor (HookEvent) -> Void)? = nil) { + guard listener == nil else { return } + self.screenGrab = screenGrab + self.transcriptProvider = transcriptProvider + self.isPausedProvider = isPausedProvider + self.canvasWriter = canvasWriter + self.onJoined = onJoined + self.onLeft = onLeft + self.onInviteParticipant = onInviteParticipant + self.onSetParticipantState = onSetParticipantState + self.onParticipantMessage = onParticipantMessage + self.onRemoveParticipant = onRemoveParticipant + self.onHookEvent = onHookEvent + + do { + let params = NWParameters.tcp + params.allowLocalEndpointReuse = true + listener = try NWListener(using: params, on: NWEndpoint.Port(rawValue: port)!) 
+ listener?.newConnectionHandler = { [weak self] conn in self?.accept(conn) } + listener?.stateUpdateHandler = { [weak self] state in + guard let self else { return } + switch state { + case .ready: + Log.info(.system, "MCPServer: ready — http://localhost:\(self.port)/mcp") + case .failed(let err): + Log.warn(.system, "MCPServer: listener failed — \(err)") + default: + break + } + } + listener?.start(queue: .global(qos: .utility)) + } catch { + Log.warn(.system, "MCPServer: could not bind port \(port) — \(error)") + } + } + + func stop() { + listener?.cancel() + listener = nil + leaveTimer?.invalidate() + leaveTimer = nil + } + + // MARK: - Activity Tracking + + /// Called on every inbound MCP request so we can detect session disconnection via timeout. + private func recordActivity() { + lastActivityDate = Date() + // Reset the leave timer each time activity is seen. + leaveTimer?.invalidate() + leaveTimer = Timer.scheduledTimer(withTimeInterval: leaveTimeoutSeconds, + repeats: false) { [weak self] _ in + self?.fireLeft() + } + RunLoop.main.add(leaveTimer!, forMode: .common) + } + + private func fireLeft() { + lastActivityDate = nil + leaveTimer = nil + guard let cb = onLeft else { return } + Task { @MainActor in cb() } + } + + // MARK: - Connection Handling + + private func accept(_ connection: NWConnection) { + connection.start(queue: .global(qos: .utility)) + receiveHTTP(connection: connection, buffer: Data()) + } + + /// Accumulate received bytes until we have a complete HTTP request, then process it. 
+ private func receiveHTTP(connection: NWConnection, buffer: Data) { + connection.receive(minimumIncompleteLength: 1, maximumLength: 65_536) { [weak self] chunk, _, isComplete, error in + guard let self else { return } + if let err = error { + Log.warn(.system, "MCPServer: receive error \(err)") + return + } + var buf = buffer + if let chunk { buf.append(chunk) } + + if let (method, path, body) = self.parseHTTPRequest(buf) { + self.handleRequest(method: method, path: path, body: body, connection: connection) + } else if !isComplete { + self.receiveHTTP(connection: connection, buffer: buf) + } + } + } + + // MARK: - HTTP Parsing + + /// Returns (method, path, body) once a complete HTTP/1.1 request is buffered. + private func parseHTTPRequest(_ data: Data) -> (String, String, Data)? { + // Headers end at \r\n\r\n + let sep = Data([0x0d, 0x0a, 0x0d, 0x0a]) + guard let sepRange = data.range(of: sep) else { return nil } + + let headerData = data[data.startIndex..= 2 else { return nil } + let method = String(parts[0]) + let path = String(parts[1]) + + // Content-Length + var contentLength = 0 + for line in lines.dropFirst() { + let kv = line.split(separator: ":", maxSplits: 1) + if kv.count == 2, + kv[0].lowercased().trimmingCharacters(in: .whitespaces) == "content-length" { + contentLength = Int(kv[1].trimmingCharacters(in: .whitespaces)) ?? 0 + } + } + + let bodyStart = sepRange.upperBound + let available = data.distance(from: bodyStart, to: data.endIndex) + guard available >= contentLength else { return nil } // wait for more data + + let bodyEnd = data.index(bodyStart, offsetBy: contentLength) + return (method, path, Data(data[bodyStart.. Any { + switch method { + + case "initialize": + // Fire join callback — a Claude Code session just connected. 
+ if let cb = onJoined { + Task { @MainActor in cb() } + } + return [ + "protocolVersion": "2024-11-05", + "capabilities": ["tools": [:] as [String: Any]], + "serverInfo": ["name": "autoclawd", "version": "1.0"] + ] as [String: Any] + + case "notifications/initialized": + return [:] as [String: Any] + + case "tools/list": + return ["tools": toolDefinitions()] + + case "tools/call": + let name = params["name"] as? String ?? "" + let args = params["arguments"] as? [String: Any] ?? [:] + return await callTool(name: name, args: args) + + default: + return ["error": "Unknown method: \(method)"] as [String: Any] + } + } + + // MARK: - Tool Definitions + + private func toolDefinitions() -> [[String: Any]] { + [ + [ + "name": "autoclawd_get_screen", + "description": """ + Capture the current screen with Vision OCR text and a JPEG screenshot. \ + Optionally crop to a pixel region (screen-space, top-left origin). \ + Returns structured text for reading plus the raw image for visual inspection. \ + Call this first to get overall screen context. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "region": [ + "type": "object", + "description": "Pixel region to crop. Omit for full screen.", + "properties": [ + "x": ["type": "number"], + "y": ["type": "number"], + "width": ["type": "number"], + "height": ["type": "number"] + ] + ] + ] + ] as [String: Any] + ], + [ + "name": "autoclawd_get_cursor_context", + "description": """ + Capture a 600×400 screenshot centred on the user's current cursor position, \ + with OCR text and the UI element under the cursor. \ + Use this when the user points at something on screen without naming it. + """, + "inputSchema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "autoclawd_get_selection", + "description": """ + Get the user's currently highlighted/selected text and a screenshot \ + of that selection region. 
Use this when the user selects code, \ + an error message, a file path, or any text before speaking. + """, + "inputSchema": ["type": "object", "properties": [:] as [String: Any]] + ], + [ + "name": "autoclawd_get_audio_transcript", + "description": """ + Get the recent spoken audio transcript from the user's microphone session. \ + Useful to review what was said in the last few minutes without the user \ + having to repeat themselves. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "max_chars": [ + "type": "number", + "description": "Maximum characters to return (default 2000, most-recent)." + ] + ] + ] as [String: Any] + ], + [ + "name": "autoclawd_set_canvas", + "description": """ + Push a text message to the AutoClawd widget canvas so the user can see it \ + on the floating pill. Use this to announce your presence ("Claude Code joined \ + the call"), stream responses, or display status updates directly on the widget. + """, + "inputSchema": [ + "type": "object", + "properties": [ + "text": [ + "type": "string", + "description": "The message to display on the call mode canvas." + ] + ], + "required": ["text"] + ] as [String: Any] + ], + ] + } + + // NOTE: Participant tile tools (invite/set_state/send_message/remove) are intentionally + // NOT exposed via tools/list. Tile management is UI-driven (user taps Invite) or will + // be inferred from Claude Code's existing tool-call stream — never by making Claude Code + // call extra MCP tools that burn tokens on pure UI bookkeeping. + + // MARK: - Tool Execution + + private func callTool(name: String, args: [String: Any]) async -> [String: Any] { + var content: [[String: Any]] = [] + + switch name { + + case "autoclawd_get_screen": + var region: CGRect? + if let r = args["region"] as? [String: Any], + let x = r["x"] as? CGFloat, let y = r["y"] as? CGFloat, + let w = r["width"] as? CGFloat, let h = r["height"] as? 
CGFloat { + region = CGRect(x: x, y: y, width: w, height: h) + } + let grab = await screenGrab?.captureScreen(region: region) + ?? ScreenGrab(ocrText: "", metadata: "Screen service unavailable", + imageJPEGData: nil, capturedAt: Date()) + content = screenGrabBlocks(grab) + + case "autoclawd_get_cursor_context": + let grab = await screenGrab?.captureCursorContext() + ?? ScreenGrab(ocrText: "", metadata: "Screen service unavailable", + imageJPEGData: nil, capturedAt: Date()) + content = screenGrabBlocks(grab) + + case "autoclawd_get_selection": + let sel = await screenGrab?.captureSelection() + ?? SelectionGrab(selectedText: "", contextImageJPEGData: nil, capturedAt: Date()) + if sel.selectedText.isEmpty && sel.contextImageJPEGData == nil { + content = [["type": "text", "text": "No text currently selected."]] + } else { + if !sel.selectedText.isEmpty { + content.append(["type": "text", + "text": "Selected text:\n\(sel.selectedText)"]) + } + if let jpeg = sel.contextImageJPEGData { + content.append(["type": "image", + "data": jpeg.base64EncodedString(), + "mimeType": "image/jpeg"]) + } + } + + case "autoclawd_get_audio_transcript": + let maxChars = args["max_chars"] as? Int ?? 2_000 + let provider = transcriptProvider + let paused = isPausedProvider + let (transcript, isPaused) = await MainActor.run { + (provider?() ?? "", paused?() ?? false) + } + if isPaused { + content = [["type": "text", + "text": "Transcript paused — user and AutoClawd are planning. Stand by."]] + } else { + let trimmed = transcript.count > maxChars + ? String(transcript.suffix(maxChars)) + : transcript + content = [["type": "text", + "text": trimmed.isEmpty ? "No transcript available." : trimmed]] + } + + case "autoclawd_set_canvas": + let text = args["text"] as? String ?? 
"" + if !text.isEmpty, let writer = canvasWriter { + let w = writer + Task { @MainActor in w(text) } + content = [["type": "text", "text": "Canvas updated."]] + } else { + content = [["type": "text", "text": "No text provided or canvas unavailable."]] + } + + case "autoclawd_invite_participant": + let id = args["id"] as? String ?? "" + let name = args["name"] as? String ?? "" + let icon = args["system_image"] as? String ?? "cable.connector" + if !id.isEmpty, let cb = onInviteParticipant { + Task { @MainActor in cb(id, name, icon) } + content = [["type": "text", "text": "\(name) joined the call."]] + } else { + content = [["type": "text", "text": "Could not invite participant."]] + } + + case "autoclawd_set_participant_state": + let id = args["id"] as? String ?? "" + let stateStr = args["state"] as? String ?? "idle" + if !id.isEmpty, let cb = onSetParticipantState { + Task { @MainActor in cb(id, stateStr) } + content = [["type": "text", "text": "State updated."]] + } else { + content = [["type": "text", "text": "Could not update participant state."]] + } + + case "autoclawd_send_participant_message": + let id = args["id"] as? String ?? "" + let name = args["name"] as? String ?? "" + let text = args["text"] as? String ?? "" + if !id.isEmpty, !text.isEmpty, let cb = onParticipantMessage { + Task { @MainActor in cb(id, name, text) } + content = [["type": "text", "text": "Message posted."]] + } else { + content = [["type": "text", "text": "Could not post participant message."]] + } + + case "autoclawd_remove_participant": + let id = args["id"] as? String ?? 
"" + if !id.isEmpty, let cb = onRemoveParticipant { + Task { @MainActor in cb(id) } + content = [["type": "text", "text": "Participant removed."]] + } else { + content = [["type": "text", "text": "Could not remove participant."]] + } + + default: + content = [["type": "text", "text": "Unknown tool: \(name)"]] + } + + return ["content": content] + } + + /// Build MCP content blocks from a ScreenGrab (text + optional image). + private func screenGrabBlocks(_ grab: ScreenGrab) -> [[String: Any]] { + var blocks: [[String: Any]] = [] + let textParts = [ + grab.metadata.isEmpty ? nil : grab.metadata, + grab.ocrText.isEmpty ? nil : "Screen text:\n\(grab.ocrText)" + ].compactMap { $0 } + if !textParts.isEmpty { + blocks.append(["type": "text", "text": textParts.joined(separator: "\n\n")]) + } + if let jpeg = grab.imageJPEGData { + blocks.append(["type": "image", + "data": jpeg.base64EncodedString(), + "mimeType": "image/jpeg"]) + } + return blocks + } + + // MARK: - JSON-RPC Helpers + + private func rpcSuccess(id: Int?, result: Any) -> Data { + var obj: [String: Any] = ["jsonrpc": "2.0", "result": result] + if let id { obj["id"] = id } + return (try? JSONSerialization.data(withJSONObject: obj)) ?? Data() + } + + private func rpcError(code: Int, message: String) -> Data { + let obj: [String: Any] = [ + "jsonrpc": "2.0", + "error": ["code": code, "message": message] + ] + return (try? JSONSerialization.data(withJSONObject: obj)) ?? Data() + } +} diff --git a/Sources/MainPanelView.swift b/Sources/MainPanelView.swift index 2f1e7da..2789118 100644 --- a/Sources/MainPanelView.swift +++ b/Sources/MainPanelView.swift @@ -61,10 +61,18 @@ struct MainPanelView: View { @ViewBuilder private var content: some View { ZStack { + // PixelWorldView is always alive to preserve WKWebView state, + // but hidden during Call Mode (replaced by zoom-call view). PixelWorldView(appState: appState) .frame(maxWidth: .infinity, maxHeight: .infinity) - .opacity(selectedTab == .world ? 
1 : 0) - .allowsHitTesting(selectedTab == .world) + .opacity(selectedTab == .world && appState.pillMode != .callMode ? 1 : 0) + .allowsHitTesting(selectedTab == .world && appState.pillMode != .callMode) + + // Call Mode room: replaces HQ view when Call Mode is active. + CallModeRoomView(appState: appState) + .frame(maxWidth: .infinity, maxHeight: .infinity) + .opacity(selectedTab == .world && appState.pillMode == .callMode ? 1 : 0) + .allowsHitTesting(selectedTab == .world && appState.pillMode == .callMode) ProjectsListView(appState: appState) .frame(maxWidth: .infinity, maxHeight: .infinity) @@ -86,6 +94,209 @@ struct MainPanelView: View { } +// MARK: - CallModeZoomView + +/// 3-panel "zoom call" layout shown in the panel's World tab when Call Mode is active. +/// Top: scrollable Claude message thread. Bottom: camera (left) + screen preview (right). +struct CallModeZoomView: View { + @ObservedObject var appState: AppState + + var body: some View { + VStack(spacing: 1) { + // Top: Claude messages thread + messagesPanel + .frame(maxWidth: .infinity, maxHeight: .infinity) + + // Bottom: camera + screen side by side + HStack(spacing: 1) { + cameraPanel + screenPanel + } + .frame(maxWidth: .infinity, minHeight: 200, maxHeight: 220) + } + .background(Color.black) + } + + // MARK: - Messages Panel + + private var messagesPanel: some View { + ZStack(alignment: .topLeading) { + Color(nsColor: .windowBackgroundColor).opacity(0.05) + + ScrollViewReader { proxy in + ScrollView { + LazyVStack(alignment: .leading, spacing: 10) { + ForEach(appState.callModeSession.messages) { msg in + CallZoomMessageRow(message: msg) + .id(msg.id) + } + } + .padding(16) + } + .onChange(of: appState.callModeSession.messages.count) { _ in + if let last = appState.callModeSession.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + } + + // Processing indicator + if appState.callModeSession.isProcessing { + VStack { + Spacer() + HStack(spacing: 6) { + 
ProgressView().controlSize(.mini).tint(.cyan) + Text("Claude thinking…") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.7)) + } + .padding(.horizontal, 16) + .padding(.vertical, 8) + } + } + + // Empty state + if appState.callModeSession.messages.isEmpty { + VStack(spacing: 8) { + Image(systemName: "phone.bubble") + .font(.system(size: 32)) + .foregroundColor(.cyan.opacity(0.25)) + Text("CALL MODE ACTIVE") + .font(.system(size: 11, weight: .semibold, design: .monospaced)) + .foregroundColor(.cyan.opacity(0.3)) + Text("Speak to start the call") + .font(.system(size: 10, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } + } + } + + // MARK: - Camera Panel + + private var cameraPanel: some View { + ZStack { + Color.black + if appState.cameraEnabled && appState.cameraService.isRunning { + CameraPreviewView(session: appState.cameraService.captureSession) + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + VStack(spacing: 4) { + Image(systemName: "camera.fill") + .font(.system(size: 18)) + .foregroundColor(.white.opacity(0.15)) + Text("Camera Off") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + VStack { + Spacer() + HStack { + feedBadge(label: "LIVE", color: .red) + Spacer() + } + .padding(8) + } + } + .clipShape(RoundedRectangle(cornerRadius: 0)) + } + + // MARK: - Screen Panel + + private var screenPanel: some View { + ZStack { + Color.black + if let img = appState.screenPreviewImage { + Image(decorative: img, scale: 1.0) + .resizable() + .aspectRatio(contentMode: .fit) + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + VStack(spacing: 4) { + Image(systemName: "rectangle.on.rectangle") + .font(.system(size: 18)) + .foregroundColor(.white.opacity(0.15)) + Text("No Screen") + .font(.system(size: 8, design: .monospaced)) + .foregroundColor(.white.opacity(0.2)) + } + } + VStack { + Spacer() 
+ HStack { + feedBadge(label: "SCREEN", color: .cyan) + Spacer() + } + .padding(8) + } + } + } + + // MARK: - Badge + + private func feedBadge(label: String, color: Color) -> some View { + HStack(spacing: 3) { + Circle().fill(color).frame(width: 5, height: 5) + Text(label) + .font(.system(size: 7, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.8)) + } + .padding(.horizontal, 5) + .padding(.vertical, 2) + .background(Capsule().fill(Color.black.opacity(0.6))) + } +} + +// MARK: - CallZoomMessageRow + +private struct CallZoomMessageRow: View { + let message: CallMessage + + var body: some View { + HStack(alignment: .top, spacing: 8) { + Circle() + .fill(roleColor) + .frame(width: 7, height: 7) + .padding(.top, 4) + VStack(alignment: .leading, spacing: 3) { + Text(roleLabel) + .font(.system(size: 9, weight: .semibold, design: .monospaced)) + .foregroundColor(roleColor.opacity(0.6)) + Text(message.text) + .font(.system(size: 12, + design: message.role == .tool ? .monospaced : .default)) + .foregroundColor(.primary.opacity(message.role == .user ? 1.0 : 0.85)) + .textSelection(.enabled) + .fixedSize(horizontal: false, vertical: true) + } + } + } + + private var roleColor: Color { + switch message.role { + case .user: return .white + case .assistant: return .cyan + case .tool: return .yellow + case .error: return .red + case .external: return .green + case .participant: return .purple + } + } + + private var roleLabel: String { + switch message.role { + case .user: return "YOU" + case .assistant: return "CLAUDE" + case .tool: return "TOOL" + case .error: return "ERR" + case .external: return "CC" + case .participant: return message.participantName?.uppercased() ?? 
"PLUGIN" + } + } +} + // MARK: - ExecutionOutputView struct ExecutionOutputView: View { diff --git a/Sources/PillMode.swift b/Sources/PillMode.swift index 2181484..8bfe461 100644 --- a/Sources/PillMode.swift +++ b/Sources/PillMode.swift @@ -6,6 +6,7 @@ enum PillMode: String, CaseIterable { case transcription = "transcription" case aiSearch = "aiSearch" case meeting = "meeting" // Meeting notes: accumulate → analyse at end + case callMode = "callMode" // Direct Claude via Anthropic API; Llama bypassed var displayName: String { switch self { @@ -13,6 +14,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "Transcribe" case .aiSearch: return "AI Search" case .meeting: return "Meeting" + case .callMode: return "Call" } } @@ -22,6 +24,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "text.cursor" case .aiSearch: return "magnifyingglass" case .meeting: return "person.2.wave.2" + case .callMode: return "phone.bubble" } } @@ -31,6 +34,7 @@ enum PillMode: String, CaseIterable { case .transcription: return "[TRS]" case .aiSearch: return "[SRC]" case .meeting: return "[MTG]" + case .callMode: return "[CLL]" } } @@ -40,6 +44,7 @@ enum PillMode: String, CaseIterable { case .transcription: return .accentColor case .aiSearch: return .accentColor case .meeting: return .purple + case .callMode: return .cyan } } diff --git a/Sources/PipelineModels.swift b/Sources/PipelineModels.swift index 84b8882..40010ce 100644 --- a/Sources/PipelineModels.swift +++ b/Sources/PipelineModels.swift @@ -16,6 +16,10 @@ enum PipelineSource: String, Codable { case code /// WhatsApp self-chat → full pipeline + QA reply case whatsapp + /// Call mode — direct to Claude via Anthropic API, Llama bypassed entirely. + /// Audio transcript is handed straight to CallModeSession; MCP tools serve + /// visual context (screen, cursor, selection) on demand. 
+ case callMode } // MARK: - Cleaned Transcript diff --git a/Sources/ScreenGrabService.swift b/Sources/ScreenGrabService.swift new file mode 100644 index 0000000..dd40645 --- /dev/null +++ b/Sources/ScreenGrabService.swift @@ -0,0 +1,251 @@ +import AppKit +import ApplicationServices +import CoreGraphics +import Foundation +import Vision + +// MARK: - Result Types + +/// Full screen or region capture result. +struct ScreenGrab { + let ocrText: String + let metadata: String // app name, window title, etc. + let imageJPEGData: Data? // nil if capture failed + let capturedAt: Date +} + +/// Selected text + context screenshot. +struct SelectionGrab { + let selectedText: String + let contextImageJPEGData: Data? + let capturedAt: Date +} + +// MARK: - ScreenGrabService + +/// On-demand screen / cursor / selection capture for Call Mode MCP tools. +/// +/// - `captureScreen(region:)` — full screen or cropped region, Vision OCR + JPEG +/// - `captureCursorContext()` — 600×400 crop around current cursor, OCR + AX element info +/// - `captureSelection()` — AX selected text + screenshot of selection bounds +/// +/// Wraps `ScreenVisionAnalyzer` for full-screen grabs; adds cursor and AX selection on top. +/// Thread-safe: all async methods dispatch to the right actor internally. +final class ScreenGrabService: @unchecked Sendable { + + private let visionAnalyzer = ScreenVisionAnalyzer() + + // MARK: - Full Screen / Region + + /// Capture the full screen (or a specific pixel region) with Vision OCR and a JPEG screenshot. + func captureScreen(region: CGRect? = nil) async -> ScreenGrab { + guard let snapshot = await visionAnalyzer.captureNow() else { + return ScreenGrab(ocrText: "", metadata: "Screen capture unavailable", + imageJPEGData: nil, capturedAt: Date()) + } + + let finalSnapshot: ScreenSnapshot + if let region { + let screen = await MainActor.run { NSScreen.main?.frame ?? 
.zero } + guard screen.width > 0, screen.height > 0 else { + return makeGrab(from: snapshot) + } + let normalized = CGRect( + x: region.minX / screen.width, + y: region.minY / screen.height, + width: region.width / screen.width, + height: region.height / screen.height + ) + finalSnapshot = await visionAnalyzer.applySelection(normalizedRect: normalized, to: snapshot) + } else { + finalSnapshot = snapshot + } + + return makeGrab(from: finalSnapshot) + } + + private func makeGrab(from snapshot: ScreenSnapshot) -> ScreenGrab { + let imageData = snapshot.savedImagePath.flatMap { path -> Data? in + guard let png = try? Data(contentsOf: URL(fileURLWithPath: path)) else { return nil } + return jpegFromPNG(png) + } + let parts = [ + snapshot.appName.map { "App: \($0)" }, + snapshot.windowTitle.map { "Window: \($0)" }, + snapshot.hasDialog ? "Modal/dialog visible" : nil + ].compactMap { $0 } + + return ScreenGrab( + ocrText: snapshot.croppedText ?? snapshot.extractedText, + metadata: parts.joined(separator: " | "), + imageJPEGData: imageData, + capturedAt: snapshot.capturedAt + ) + } + + // MARK: - Cursor Context + + /// Capture a 600×400 screenshot centred on the current cursor with OCR and AX element info. + /// Use this when the user points at something without saying explicitly what it is. + func captureCursorContext() async -> ScreenGrab { + let (mouseLocation, screenHeight) = await MainActor.run { + (NSEvent.mouseLocation, NSScreen.main?.frame.height ?? 900.0) + } + + // AppKit uses bottom-left origin; CGWindowListCreateImage uses top-left. 
+ let cgX = mouseLocation.x + let cgY = screenHeight - mouseLocation.y + + let grabW: CGFloat = 600 + let grabH: CGFloat = 400 + let region = CGRect( + x: max(0, cgX - grabW / 2), + y: max(0, cgY - grabH / 2), + width: grabW, + height: grabH + ) + + guard let cgImage = CGWindowListCreateImage( + region, .optionOnScreenOnly, kCGNullWindowID, .bestResolution + ) else { + return ScreenGrab(ocrText: "", metadata: "Cursor capture failed", + imageJPEGData: nil, capturedAt: Date()) + } + + let ocrText = runOCR(on: cgImage) + let imageData = jpegFromCGImage(cgImage) + let elemInfo = axElementInfo(at: mouseLocation, screenHeight: screenHeight) + + let metadata = "Cursor at (\(Int(mouseLocation.x)), \(Int(mouseLocation.y)))" + + (elemInfo.isEmpty ? "" : " | \(elemInfo)") + + return ScreenGrab(ocrText: ocrText, metadata: metadata, + imageJPEGData: imageData, capturedAt: Date()) + } + + // MARK: - Selection + + /// Get the user's currently highlighted text and a screenshot of the selection bounds. + /// Use this when the user selects code, an error, or any text. + func captureSelection() async -> SelectionGrab { + let (text, bounds) = await MainActor.run { axSelectedTextAndBounds() } + let screenHeight = await MainActor.run { NSScreen.main?.frame.height ?? 900.0 } + + var imageData: Data? + if let b = bounds, b.width > 0, b.height > 0 { + // AX bounds use top-left origin on macOS (screen-space, not AppKit-space). + // Add padding around the selection. + let padded = CGRect( + x: b.minX - 24, + y: b.minY - 12, + width: b.width + 48, + height: b.height + 24 + ) + // CGWindowListCreateImage also uses top-left origin, so no flip needed here. + if let img = CGWindowListCreateImage( + padded, .optionOnScreenOnly, kCGNullWindowID, .bestResolution + ) { + imageData = jpegFromCGImage(img) + } + } + + return SelectionGrab( + selectedText: text ?? 
"", + contextImageJPEGData: imageData, + capturedAt: Date() + ) + } + + // MARK: - Accessibility Helpers + + /// Returns (selectedText, selectionBounds) for the focused UI element. + /// AX bounds are in screen-space top-left coordinates (same as CGWindow). + private func axSelectedTextAndBounds() -> (String?, CGRect?) { + let systemWide = AXUIElementCreateSystemWide() + + // Focused element + var focusedRef: CFTypeRef? + guard AXUIElementCopyAttributeValue( + systemWide, kAXFocusedUIElementAttribute as CFString, &focusedRef + ) == .success, let focused = focusedRef, + CFGetTypeID(focused) == AXUIElementGetTypeID() + else { return (nil, nil) } + + let element = focused as! AXUIElement // safe: type ID verified above + + // Selected text + var textRef: CFTypeRef? + _ = AXUIElementCopyAttributeValue(element, kAXSelectedTextAttribute as CFString, &textRef) + let text = textRef as? String + + // Selected text range + var rangeRef: CFTypeRef? + guard AXUIElementCopyAttributeValue( + element, kAXSelectedTextRangeAttribute as CFString, &rangeRef + ) == .success, let rangeVal = rangeRef else { return (text, nil) } + + // Bounding rect for that range + var boundsRef: CFTypeRef? + guard AXUIElementCopyParameterizedAttributeValue( + element, + kAXBoundsForRangeParameterizedAttribute as CFString, + rangeVal, + &boundsRef + ) == .success, let bVal = boundsRef, + CFGetTypeID(bVal) == AXValueGetTypeID() + else { return (text, nil) } + + var rect = CGRect.zero + AXValueGetValue(bVal as! AXValue, .cgRect, &rect) // safe: type ID verified + return (text, rect.width > 0 ? rect : nil) + } + + /// Description of the UI element under the cursor (role + title). + private func axElementInfo(at point: NSPoint, screenHeight: CGFloat) -> String { + let systemWide = AXUIElementCreateSystemWide() + var elementRef: AXUIElement? 
+ // AX position uses top-left origin → flip AppKit Y + let axY = Float(screenHeight - point.y) + guard AXUIElementCopyElementAtPosition( + systemWide, Float(point.x), axY, &elementRef + ) == .success, let element = elementRef else { return "" } + + var roleRef: CFTypeRef? + AXUIElementCopyAttributeValue(element, kAXRoleAttribute as CFString, &roleRef) + let role = (roleRef as? String) ?? "" + + var titleRef: CFTypeRef? + AXUIElementCopyAttributeValue(element, kAXTitleAttribute as CFString, &titleRef) + let title = (titleRef as? String) ?? "" + + return [role, title].filter { !$0.isEmpty }.joined(separator: ": ") + } + + // MARK: - Vision OCR + + private func runOCR(on image: CGImage) -> String { + let req = VNRecognizeTextRequest() + req.recognitionLevel = .accurate + req.usesLanguageCorrection = true + let handler = VNImageRequestHandler(cgImage: image, options: [:]) + try? handler.perform([req]) + return (req.results ?? []) + .compactMap { $0.topCandidates(1).first?.string } + .joined(separator: "\n") + .trimmingCharacters(in: .whitespacesAndNewlines) + } + + // MARK: - Image Conversion + + private func jpegFromPNG(_ pngData: Data) -> Data? { + guard let rep = NSBitmapImageRep(data: pngData) else { return nil } + return rep.representation(using: .jpeg, properties: [.compressionFactor: 0.8]) + } + + private func jpegFromCGImage(_ cgImage: CGImage) -> Data? 
{ + let nsImage = NSImage(cgImage: cgImage, size: .zero) + guard let tiff = nsImage.tiffRepresentation, + let rep = NSBitmapImageRep(data: tiff) else { return nil } + return rep.representation(using: .jpeg, properties: [.compressionFactor: 0.8]) + } +} diff --git a/Sources/SettingsManager.swift b/Sources/SettingsManager.swift index ae5b35f..9d2e78c 100644 --- a/Sources/SettingsManager.swift +++ b/Sources/SettingsManager.swift @@ -138,7 +138,8 @@ final class SettingsManager: @unchecked Sendable { private let kFaceTrackingEnabled = "face_tracking_enabled" private let kGestureHoldDuration = "gesture_hold_duration" private let kCameraAnalysisFPS = "camera_analysis_fps" - private let kSelectedCameraDeviceID = "selected_camera_device_id" + private let kSelectedCameraDeviceID = "selected_camera_device_id" + private let kCallStreamWidgetEnabled = "call_stream_widget_enabled" // MARK: - Properties @@ -388,5 +389,14 @@ final class SettingsManager: @unchecked Sendable { set { defaults.set(newValue, forKey: kSelectedCameraDeviceID) } } + // MARK: - Call Stream Widget + + /// When true, the floating Call Stream Widget appears whenever call mode is active. + /// Toggle in Settings → Call Mode. + var callStreamWidgetEnabled: Bool { + get { defaults.object(forKey: kCallStreamWidgetEnabled) as? Bool ?? 
true } + set { defaults.set(newValue, forKey: kCallStreamWidgetEnabled) } + } + private init() {} } diff --git a/Sources/SkillStore.swift b/Sources/SkillStore.swift index 9129538..2db1633 100644 --- a/Sources/SkillStore.swift +++ b/Sources/SkillStore.swift @@ -280,5 +280,64 @@ final class SkillStore: @unchecked Sendable { category: .marketing, isBuiltin: true ), + // Skill for Claude Code / CLI to know how to start and use Call Mode + Skill( + id: "call-mode-init", + name: "Start Call Mode", + description: "Tells Claude Code or Claude CLI how to switch AutoClawd into Call Mode and use the AutoClawd MCP tools for real-time screen and audio access during a call.", + promptTemplate: """ + To start Call Mode in AutoClawd: + + 1. Make sure AutoClawd is running (look for the floating pill widget on your screen). + 2. Click the mode icon on the pill widget and cycle to "Call" (cyan phone icon), OR say "call mode" aloud — the widget will switch automatically. + 3. Once in Call Mode, your voice is forwarded directly to Claude without Llama analysis. + + As Claude Code / Claude CLI, you have access to these AutoClawd MCP tools via the autoclawd MCP server: + - autoclawd_get_screen — full screen or region: Vision OCR text + JPEG screenshot + - autoclawd_get_cursor_context — 600×400 crop around cursor; use when user points at something + - autoclawd_get_selection — currently highlighted text + selection screenshot + - autoclawd_get_audio_transcript — recent spoken audio transcript (last 2000 chars) + + USAGE PATTERN: + - Call autoclawd_get_screen immediately to see what the user is looking at. + - Call autoclawd_get_cursor_context when user says "this", "here", or "that thing". + - Call autoclawd_get_selection when user highlights text before speaking. + - Keep responses short and conversational — this is a live call. + + To end Call Mode: cycle the widget back to any other mode. 
+ + {{prompt}} + """, + workflowID: nil, + category: .development, + isBuiltin: true + ), + // Call Mode skill for the AutoClawd widget itself + Skill( + id: "call-mode", + name: "Call Mode", + description: "Activates when joining a video or voice call. Claude gets real-time access to your screen, cursor position, and selected text via AutoClawd MCP tools — acting like a meeting co-pilot that can see exactly what you see.", + promptTemplate: """ + You are now in Call Mode via AutoClawd. You have real-time sensory access to the user's environment through these MCP tools: + + AVAILABLE TOOLS: + - autoclawd_get_screen: Capture the full screen (or a pixel region) with Vision OCR text + JPEG screenshot. Call this first to orient yourself whenever the topic changes. + - autoclawd_get_cursor_context: Capture a 600×400 screenshot centred on the cursor with OCR. Use when the user says "this", "here", "that thing", or points at something without naming it. + - autoclawd_get_selection: Get the user's currently highlighted/selected text and a screenshot of that region. Use whenever the user selects code, an error, a file path, or any text before speaking. + - autoclawd_get_audio_transcript: Get the recent spoken transcript from the user's microphone. Use to review what was just said without the user repeating themselves. + + CALL MODE BEHAVIOUR: + - Call get_screen at the start of each new topic to understand context. + - Prefer get_cursor_context when spatial references are used ("this panel", "that button"). + - Prefer get_selection when the user has highlighted something — always check before answering questions about specific text. + - Keep responses concise and spoken-word friendly — this is a real-time call. + - Proactively mention what you can see to confirm your understanding. 
+ + {{prompt}} + """, + workflowID: nil, + category: .development, + isBuiltin: true + ), ] } diff --git a/Sources/WidgetCanvasViews.swift b/Sources/WidgetCanvasViews.swift index 0aa3e7a..3bab5dd 100644 --- a/Sources/WidgetCanvasViews.swift +++ b/Sources/WidgetCanvasViews.swift @@ -428,6 +428,123 @@ struct MeetingCanvasView: View { } } +// MARK: - Call Mode Canvas + +/// Canvas for Call Mode — shows the real-time Claude conversation thread. +/// Voice chunks are sent directly to Claude; Claude calls screen/cursor/selection tools inline. +struct CallModeCanvasView: View { + @ObservedObject var session: CallModeSession + + var body: some View { + VStack(spacing: 0) { + // Header + HStack(spacing: 6) { + Image(systemName: "phone.bubble") + .font(.system(size: 9, weight: .semibold)) + .foregroundColor(.cyan) + Text("CALL MODE") + .font(.system(size: 8, weight: .bold, design: .monospaced)) + .foregroundColor(.white.opacity(0.45)) + Spacer() + if session.isProcessing { + HStack(spacing: 3) { + ProgressView() + .scaleEffect(0.5) + .frame(width: 10, height: 10) + Text("thinking") + .font(.system(size: 7)) + .foregroundColor(.cyan.opacity(0.60)) + } + } else { + Text("direct · claude") + .font(.system(size: 7)) + .foregroundColor(.white.opacity(0.20)) + } + } + .padding(.horizontal, 12) + .padding(.top, 10) + .padding(.bottom, 6) + + Divider().opacity(0.12) + + // Message thread + if session.messages.isEmpty { + VStack(spacing: 6) { + Image(systemName: "waveform") + .font(.system(size: 20, weight: .ultraLight)) + .foregroundColor(.cyan.opacity(0.18)) + Text("Speak to start the call") + .font(.system(size: 10)) + .foregroundColor(.white.opacity(0.22)) + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } else { + ScrollViewReader { proxy in + ScrollView(showsIndicators: false) { + LazyVStack(alignment: .leading, spacing: 6) { + ForEach(session.messages) { msg in + CallMessageBubble(message: msg) + .id(msg.id) + } + } + .padding(.horizontal, 10) + .padding(.vertical, 8) 
+ } + .onChange(of: session.messages.count) { _ in + if let last = session.messages.last { + withAnimation { proxy.scrollTo(last.id, anchor: .bottom) } + } + } + } + } + } + .frame(maxWidth: .infinity, maxHeight: .infinity) + } +} + +/// Single message bubble in the call mode thread. +private struct CallMessageBubble: View { + let message: CallMessage + + var body: some View { + HStack(alignment: .top, spacing: 5) { + // Role indicator dot + Circle() + .fill(roleColor) + .frame(width: 5, height: 5) + .padding(.top, 4) + + Text(message.text) + .font(.system(size: 10, design: message.role == .tool ? .monospaced : .default)) + .foregroundColor(textColor) + .fixedSize(horizontal: false, vertical: true) + .frame(maxWidth: .infinity, alignment: .leading) + } + } + + private var roleColor: Color { + switch message.role { + case .user: return .white.opacity(0.45) + case .assistant: return .cyan + case .tool: return .yellow.opacity(0.60) + case .error: return .red + case .external: return .green + case .participant: return .purple + } + } + + private var textColor: Color { + switch message.role { + case .user: return .white.opacity(0.72) + case .assistant: return .white.opacity(0.88) + case .tool: return .white.opacity(0.38) + case .error: return .red.opacity(0.80) + case .external: return .green.opacity(0.85) + case .participant: return .white.opacity(0.85) + } + } +} + // MARK: - Project Picker Canvas (shared by Code + Tasks modes) /// Reusable tappable project list — used as the first canvas state in Code and Tasks modes.