diff --git a/web/src/components/LiveVoiceSession.tsx b/web/src/components/LiveVoiceSession.tsx new file mode 100644 index 00000000..c247d2b1 --- /dev/null +++ b/web/src/components/LiveVoiceSession.tsx @@ -0,0 +1,529 @@ +/* + * Copyright 2025 Rararulab + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +import { useState, useEffect, useRef, useCallback } from "react"; +import { Mic, MicOff, PhoneOff } from "lucide-react"; +import { buildWsUrl } from "@/adapters/rara-stream"; + +// --------------------------------------------------------------------------- +// Web Speech API type declarations +// The Web Speech API is not fully standardized and TypeScript's lib.dom does +// not include SpeechRecognition / SpeechRecognitionEvent. We declare the +// subset we use here to avoid pulling in @types/dom-speech-recognition. +// --------------------------------------------------------------------------- + +interface SpeechRecognitionEvent extends Event { + readonly resultIndex: number; + readonly results: SpeechRecognitionResultList; +} + +interface SpeechRecognitionErrorEvent extends Event { + readonly error: string; + readonly message: string; +} + +interface SpeechRecognitionInstance extends EventTarget { + continuous: boolean; + interimResults: boolean; + lang: string; + start(): void; + stop(): void; + abort(): void; + onresult: ((event: SpeechRecognitionEvent) => void) | null; + onerror: ((event: SpeechRecognitionErrorEvent) => void) | null; + onend: (() => void) | null; +} + +interface SpeechRecognitionConstructor { + new (): SpeechRecognitionInstance; +} + +declare global { + interface Window { + SpeechRecognition?: SpeechRecognitionConstructor; + webkitSpeechRecognition?: SpeechRecognitionConstructor; + } +} + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +type VoiceState = "idle" | "sending" | "speaking"; + +type LiveVoiceSessionProps = { + /** Returns the current session key for WebSocket connections. */ + getSessionKey: () => string | undefined; + /** Called when the backend finishes processing one turn. */ + onTurnComplete: () => void; + /** Called when the user ends the live voice session. */ + onClose: () => void; +}; + +// --------------------------------------------------------------------------- +// Audio Visualizer — ElevenLabs-inspired waveform bars +// --------------------------------------------------------------------------- + +/** Color palette keyed by voice state, adapted for dark theme. */ +const STATE_COLORS: Record = { + idle: "#525252", // zinc-600 — muted, ambient + sending: "#525252", // same gray, pulsing animation signals "thinking" + speaking: "#10b981", // emerald-500 — Rara is speaking +}; + +/** Blue highlight when user is actively speaking (mic input detected). */ +const HEARING_COLOR = "#3b82f6"; // blue-500 + +function AudioVisualizer({ + analyser, + state, +}: { + analyser: AnalyserNode | null; + state: VoiceState; +}) { + const canvasRef = useRef(null); + const animFrameRef = useRef(0); + // Persist per-bar phase offsets for idle breathing animation + const phaseOffsetsRef = useRef(null); + + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + const ctx = canvas.getContext("2d"); + if (!ctx) return; + + const BAR_COUNT = 40; + const BAR_WIDTH = 3; + const BAR_GAP = 2; + const dataArray = analyser + ? new Uint8Array(analyser.frequencyBinCount) + : null; + + // Initialize stable random phase offsets (once) + if (!phaseOffsetsRef.current) { + phaseOffsetsRef.current = Array.from( + { length: BAR_COUNT }, + () => Math.random() * Math.PI * 2, + ); + } + const phaseOffsets = phaseOffsetsRef.current; + + // Sending state: soft pulse via opacity oscillation + let sendingPhase = 0; + + function draw() { + if (!ctx || !canvas) return; + const w = canvas.width; + const h = canvas.height; + ctx.clearRect(0, 0, w, h); + + if (analyser && dataArray) { + analyser.getByteFrequencyData(dataArray); + } + + // Detect whether user is producing audio (hearing state) + let avgLevel = 0; + if (dataArray) { + let sum = 0; + for (let i = 0; i < dataArray.length; i++) sum += dataArray[i]; + avgLevel = sum / dataArray.length; + } + const isHearing = state === "idle" && avgLevel > 12; + + // Pick bar color + const color = isHearing ? HEARING_COLOR : STATE_COLORS[state]; + + // Sending pulse: oscillate global opacity + let globalAlpha = 1; + if (state === "sending") { + sendingPhase += 0.03; + globalAlpha = 0.4 + 0.3 * Math.sin(sendingPhase); + } + + const totalBarsWidth = BAR_COUNT * (BAR_WIDTH + BAR_GAP) - BAR_GAP; + const offsetX = (w - totalBarsWidth) / 2; + const centerY = h / 2; + const now = performance.now() / 1000; // seconds + + ctx.lineCap = "round"; + + for (let i = 0; i < BAR_COUNT; i++) { + // Map frequency bin to bar height + const binIndex = dataArray + ? Math.floor((i / BAR_COUNT) * dataArray.length) + : 0; + const rawValue = dataArray ? dataArray[binIndex] : 0; + + let barHeight: number; + if (state === "idle" && !isHearing) { + // Idle breathing: gentle sinusoidal per-bar undulation + const breath = + Math.sin(now * 1.2 + phaseOffsets[i]) * 0.5 + 0.5; // 0..1 + barHeight = 3 + breath * 6; // 3..9px — subtle + } else if (state === "sending") { + // Thinking: slow wave with moderate height + const wave = + Math.sin(now * 2 + (i / BAR_COUNT) * Math.PI * 2) * 0.5 + 0.5; + barHeight = 4 + wave * 14; + } else { + // Hearing or speaking: driven by audio data + barHeight = Math.max(3, (rawValue / 255) * (h * 0.85)); + } + + const x = offsetX + i * (BAR_WIDTH + BAR_GAP); + + ctx.globalAlpha = globalAlpha; + ctx.fillStyle = color; + ctx.beginPath(); + ctx.roundRect( + x, + centerY - barHeight / 2, + BAR_WIDTH, + barHeight, + BAR_WIDTH / 2, + ); + ctx.fill(); + } + + ctx.globalAlpha = 1; + animFrameRef.current = requestAnimationFrame(draw); + } + + draw(); + + return () => { + cancelAnimationFrame(animFrameRef.current); + }; + }, [analyser, state]); + + return ( + + ); +} + +// --------------------------------------------------------------------------- +// LiveVoiceSession — main component +// --------------------------------------------------------------------------- + +/** + * Bottom voice panel for real-time voice conversation. + * Uses Web Speech API for continuous speech-to-text, sends transcribed text + * through the existing WebSocket chat API, and displays a waveform visualizer. + * + * UI styled after ElevenLabs design language, adapted for dark theme. + */ +export function LiveVoiceSession({ + getSessionKey, + onTurnComplete, + onClose, +}: LiveVoiceSessionProps) { + const [state, setState] = useState("idle"); + const [muted, setMuted] = useState(false); + const [interimText, setInterimText] = useState(""); + const [finalText, setFinalText] = useState(""); + const [error, setError] = useState(null); + const [analyser, setAnalyser] = useState(null); + + // Refs for cleanup-safe access + const recognitionRef = useRef(null); + const wsRef = useRef(null); + const audioCtxRef = useRef(null); + const streamRef = useRef(null); + const liveModeRef = useRef(true); + const mutedRef = useRef(false); + + // Keep mutedRef in sync with muted state + useEffect(() => { + mutedRef.current = muted; + }, [muted]); + + // --------------------------------------------------------------------------- + // Speech recognition management + // --------------------------------------------------------------------------- + + const resumeRecognition = useCallback(() => { + if (!liveModeRef.current || mutedRef.current) return; + try { + recognitionRef.current?.start(); + } catch { + // May already be running + } + }, []); + + // --------------------------------------------------------------------------- + // Send transcribed text to backend via WebSocket + // --------------------------------------------------------------------------- + + const sendText = useCallback( + (text: string) => { + const sessionKey = getSessionKey(); + if (!sessionKey || !text.trim()) return; + + setState("sending"); + setFinalText(text); + setInterimText(""); + + // Pause recognition while waiting for response + try { + recognitionRef.current?.stop(); + } catch { + // May already be stopped + } + + const wsUrl = buildWsUrl(sessionKey); + const ws = new WebSocket(wsUrl); + wsRef.current = ws; + + ws.onopen = () => { + ws.send(text); + }; + + ws.onmessage = (ev: MessageEvent) => { + try { + const event = JSON.parse(ev.data as string); + if (event.type === "done" || event.type === "message") { + ws.close(); + } else if (event.type === "error") { + console.error("Voice WS error:", event.message); + ws.close(); + } + } catch { + // Ignore non-JSON frames + } + }; + + ws.onerror = () => { + console.error("Voice WebSocket connection error"); + setState("idle"); + resumeRecognition(); + }; + + ws.onclose = () => { + wsRef.current = null; + onTurnComplete(); + setState("idle"); + setFinalText(""); + resumeRecognition(); + }; + }, + [getSessionKey, onTurnComplete, resumeRecognition], + ); + + // Initialize speech recognition and microphone audio + useEffect(() => { + const SpeechRecognitionCtor = + window.SpeechRecognition ?? window.webkitSpeechRecognition; + if (!SpeechRecognitionCtor) { + setError("Speech recognition is not supported in this browser."); + return; + } + + const recognition = new SpeechRecognitionCtor(); + recognition.continuous = true; + recognition.interimResults = true; + recognition.lang = "zh-CN"; + recognitionRef.current = recognition; + + recognition.onresult = (event) => { + const result = event.results[event.resultIndex]; + if (result.isFinal) { + const transcript = result[0].transcript.trim(); + if (transcript) { + sendText(transcript); + } + } else { + setInterimText(result[0].transcript); + } + }; + + recognition.onerror = (event) => { + // "no-speech" and "aborted" are expected during normal operation + if (event.error === "no-speech" || event.error === "aborted") return; + console.error("Speech recognition error:", event.error); + if (event.error === "not-allowed") { + setError("Microphone access denied. Please allow microphone access."); + } + }; + + recognition.onend = () => { + // Auto-restart if still in live mode and not muted + if (liveModeRef.current && !mutedRef.current) { + try { + recognition.start(); + } catch { + // May fail if already started + } + } + }; + + // Start listening + try { + recognition.start(); + } catch (err) { + console.error("Failed to start speech recognition:", err); + setError("Failed to start speech recognition."); + } + + // Set up AudioContext for visualizer + navigator.mediaDevices + .getUserMedia({ audio: true }) + .then((stream) => { + streamRef.current = stream; + const audioCtx = new AudioContext(); + audioCtxRef.current = audioCtx; + const source = audioCtx.createMediaStreamSource(stream); + const analyserNode = audioCtx.createAnalyser(); + analyserNode.fftSize = 256; + source.connect(analyserNode); + // Do NOT connect to destination — we don't want to hear our own mic + setAnalyser(analyserNode); + }) + .catch((err) => { + console.error("Microphone access for visualizer failed:", err); + // Non-fatal — visualizer just won't work + }); + + // Cleanup on unmount + return () => { + liveModeRef.current = false; + try { + recognition.stop(); + } catch { + // ignore + } + recognitionRef.current = null; + wsRef.current?.close(); + wsRef.current = null; + streamRef.current?.getTracks().forEach((t) => t.stop()); + audioCtxRef.current?.close(); + }; + }, [sendText]); + + // --------------------------------------------------------------------------- + // Mute / unmute + // --------------------------------------------------------------------------- + + const toggleMute = useCallback(() => { + setMuted((prev) => { + const next = !prev; + if (next) { + // Muting — stop recognition + try { + recognitionRef.current?.stop(); + } catch { + // ignore + } + } else { + // Unmuting — restart recognition + try { + recognitionRef.current?.start(); + } catch { + // ignore + } + } + return next; + }); + }, []); + + // --------------------------------------------------------------------------- + // Close session + // --------------------------------------------------------------------------- + + const handleClose = useCallback(() => { + liveModeRef.current = false; + try { + recognitionRef.current?.stop(); + } catch { + // ignore + } + wsRef.current?.close(); + onClose(); + }, [onClose]); + + // --------------------------------------------------------------------------- + // Status text + // --------------------------------------------------------------------------- + + const statusLabel = + error ?? + (state === "sending" + ? "THINKING" + : muted + ? "MUTED" + : "LISTENING"); + + // --------------------------------------------------------------------------- + // Render — ElevenLabs-inspired dark voice panel + // --------------------------------------------------------------------------- + + return ( +
+ {/* Waveform visualizer — the hero element */} + + + {/* Interim transcription or confirmed text */} +
+ {state === "sending" && finalText ? ( + + {finalText} + + ) : interimText ? ( + + {interimText} + + ) : null} +
+ + {/* Status label — uppercase, tracked, small */} +
+ {statusLabel} +
+ + {/* Control bar — pill buttons */} +
+ {/* Mute button */} + + + {/* End session button */} + +
+
+ ); +} diff --git a/web/src/pages/PiChat.tsx b/web/src/pages/PiChat.tsx index 2220b31a..eb91b5ff 100644 --- a/web/src/pages/PiChat.tsx +++ b/web/src/pages/PiChat.tsx @@ -34,7 +34,7 @@ import { createRaraStreamFn } from "@/adapters/rara-stream"; import { api } from "@/api/client"; import type { ChatSession, ChatMessageData } from "@/api/types"; import { useNavigate } from "react-router"; -import { VoiceRecorder } from "@/components/VoiceRecorder"; +import { LiveVoiceSession } from "@/components/LiveVoiceSession"; /** Strip `...` blocks from assistant text. */ function stripThinkTags(text: string): string { @@ -259,6 +259,7 @@ export default function PiChat() { const agentRef = useRef(null); const chatPanelRef = useRef(null); const [showSessionList, setShowSessionList] = useState(false); + const [liveMode, setLiveMode] = useState(false); const navigate = useNavigate(); /** Switch the agent to a different session, loading its history. */ @@ -405,15 +406,41 @@ export default function PiChat() { - {/* Voice recorder button — fixed top-right */} + {/* Voice toggle — fixed top-right */}
- agentRef.current?.sessionId} - onComplete={reloadMessages} - /> + {liveMode ? ( + + ) : ( + + )}
{/* Chat panel container */}
+ {/* Live voice panel — bottom overlay */} + {liveMode && ( + agentRef.current?.sessionId} + onTurnComplete={reloadMessages} + onClose={() => setLiveMode(false)} + /> + )} {/* Session list slide-over */} {showSessionList && (