From 79113c9906c1b33ede2122c0fac2976e0ac31ec3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 21:52:21 +0800 Subject: [PATCH 1/2] feat(web): add LiveVoiceSession for real-time voice chat (#1345) Add a continuous voice conversation mode using the Web Speech API: - New LiveVoiceSession component with speech recognition, waveform visualizer (AudioContext + AnalyserNode), and control bar - Integrate into PiChat with liveMode toggle replacing VoiceRecorder - Three states: IDLE (listening), SENDING (thinking), SPEAKING (placeholder) - Auto-restart recognition on silence, mute/unmute support Closes #1345 Co-Authored-By: Claude Opus 4.6 (1M context) --- web/src/components/LiveVoiceSession.tsx | 450 ++++++++++++++++++++++++ web/src/pages/PiChat.tsx | 34 +- 2 files changed, 476 insertions(+), 8 deletions(-) create mode 100644 web/src/components/LiveVoiceSession.tsx diff --git a/web/src/components/LiveVoiceSession.tsx b/web/src/components/LiveVoiceSession.tsx new file mode 100644 index 00000000..a08776ea --- /dev/null +++ b/web/src/components/LiveVoiceSession.tsx @@ -0,0 +1,450 @@ +/* + * Copyright 2025 Rararulab + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +import { useState, useEffect, useRef, useCallback } from "react"; +import { Mic, MicOff, PhoneOff } from "lucide-react"; +import { buildWsUrl } from "@/adapters/rara-stream"; + +// --------------------------------------------------------------------------- +// Web Speech API type declarations +// The Web Speech API is not fully standardized and TypeScript's lib.dom does +// not include SpeechRecognition / SpeechRecognitionEvent. We declare the +// subset we use here to avoid pulling in @types/dom-speech-recognition. +// --------------------------------------------------------------------------- + +interface SpeechRecognitionEvent extends Event { + readonly resultIndex: number; + readonly results: SpeechRecognitionResultList; +} + +interface SpeechRecognitionErrorEvent extends Event { + readonly error: string; + readonly message: string; +} + +interface SpeechRecognitionInstance extends EventTarget { + continuous: boolean; + interimResults: boolean; + lang: string; + start(): void; + stop(): void; + abort(): void; + onresult: ((event: SpeechRecognitionEvent) => void) | null; + onerror: ((event: SpeechRecognitionErrorEvent) => void) | null; + onend: (() => void) | null; +} + +interface SpeechRecognitionConstructor { + new (): SpeechRecognitionInstance; +} + +declare global { + interface Window { + SpeechRecognition?: SpeechRecognitionConstructor; + webkitSpeechRecognition?: SpeechRecognitionConstructor; + } +} + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +type VoiceState = "idle" | "sending" | "speaking"; + +type LiveVoiceSessionProps = { + /** Returns the current session key for WebSocket connections. */ + getSessionKey: () => string | undefined; + /** Called when the backend finishes processing one turn. */ + onTurnComplete: () => void; + /** Called when the user ends the live voice session. */ + onClose: () => void; +}; + +// --------------------------------------------------------------------------- +// Audio Visualizer (inline — replaces LiveKit Agents UI dependency) +// --------------------------------------------------------------------------- + +function AudioVisualizer({ + analyser, + state, +}: { + analyser: AnalyserNode | null; + state: VoiceState; +}) { + const canvasRef = useRef(null); + const animFrameRef = useRef(0); + + useEffect(() => { + const canvas = canvasRef.current; + if (!canvas) return; + const ctx = canvas.getContext("2d"); + if (!ctx) return; + + const BAR_COUNT = 32; + const dataArray = analyser ? new Uint8Array(analyser.frequencyBinCount) : null; + + function draw() { + if (!ctx || !canvas) return; + const w = canvas.width; + const h = canvas.height; + ctx.clearRect(0, 0, w, h); + + // Determine color based on state + const color = + state === "sending" + ? "rgba(156, 163, 175, 0.5)" // gray — thinking + : state === "speaking" + ? "rgba(34, 197, 94, 0.7)" // green — speaking + : "rgba(59, 130, 246, 0.6)"; // blue — listening + + if (analyser && dataArray) { + analyser.getByteFrequencyData(dataArray); + } + + const barWidth = w / BAR_COUNT - 2; + const centerY = h / 2; + + for (let i = 0; i < BAR_COUNT; i++) { + // Map frequency bin to bar height + const binIndex = dataArray + ? Math.floor((i / BAR_COUNT) * dataArray.length) + : 0; + const value = dataArray ? dataArray[binIndex] : 0; + // Minimum bar height for idle state + const barHeight = Math.max(2, (value / 255) * (h * 0.8)); + + const x = i * (barWidth + 2) + 1; + ctx.fillStyle = color; + ctx.roundRect(x, centerY - barHeight / 2, barWidth, barHeight, 2); + ctx.fill(); + } + + animFrameRef.current = requestAnimationFrame(draw); + } + + draw(); + + return () => { + cancelAnimationFrame(animFrameRef.current); + }; + }, [analyser, state]); + + return ( + + ); +} + +// --------------------------------------------------------------------------- +// LiveVoiceSession — main component +// --------------------------------------------------------------------------- + +/** + * Bottom voice panel for real-time voice conversation. + * Uses Web Speech API for continuous speech-to-text, sends transcribed text + * through the existing WebSocket chat API, and displays a waveform visualizer. + */ +export function LiveVoiceSession({ + getSessionKey, + onTurnComplete, + onClose, +}: LiveVoiceSessionProps) { + const [state, setState] = useState("idle"); + const [muted, setMuted] = useState(false); + const [interimText, setInterimText] = useState(""); + const [finalText, setFinalText] = useState(""); + const [error, setError] = useState(null); + const [analyser, setAnalyser] = useState(null); + + // Refs for cleanup-safe access + const recognitionRef = useRef(null); + const wsRef = useRef(null); + const audioCtxRef = useRef(null); + const streamRef = useRef(null); + const liveModeRef = useRef(true); + const mutedRef = useRef(false); + + // Keep mutedRef in sync with muted state + useEffect(() => { + mutedRef.current = muted; + }, [muted]); + + // --------------------------------------------------------------------------- + // Speech recognition management + // --------------------------------------------------------------------------- + + const resumeRecognition = useCallback(() => { + if (!liveModeRef.current || mutedRef.current) return; + try { + recognitionRef.current?.start(); + } catch { + // May already be running + } + }, []); + + // --------------------------------------------------------------------------- + // Send transcribed text to backend via WebSocket + // --------------------------------------------------------------------------- + + const sendText = useCallback( + (text: string) => { + const sessionKey = getSessionKey(); + if (!sessionKey || !text.trim()) return; + + setState("sending"); + setFinalText(text); + setInterimText(""); + + // Pause recognition while waiting for response + try { + recognitionRef.current?.stop(); + } catch { + // May already be stopped + } + + const wsUrl = buildWsUrl(sessionKey); + const ws = new WebSocket(wsUrl); + wsRef.current = ws; + + ws.onopen = () => { + ws.send(text); + }; + + ws.onmessage = (ev: MessageEvent) => { + try { + const event = JSON.parse(ev.data as string); + if (event.type === "done" || event.type === "message") { + ws.close(); + } else if (event.type === "error") { + console.error("Voice WS error:", event.message); + ws.close(); + } + } catch { + // Ignore non-JSON frames + } + }; + + ws.onerror = () => { + console.error("Voice WebSocket connection error"); + setState("idle"); + resumeRecognition(); + }; + + ws.onclose = () => { + wsRef.current = null; + onTurnComplete(); + setState("idle"); + setFinalText(""); + resumeRecognition(); + }; + }, + [getSessionKey, onTurnComplete, resumeRecognition], + ); + + // Initialize speech recognition and microphone audio + useEffect(() => { + const SpeechRecognitionCtor = + window.SpeechRecognition ?? window.webkitSpeechRecognition; + if (!SpeechRecognitionCtor) { + setError("Speech recognition is not supported in this browser."); + return; + } + + const recognition = new SpeechRecognitionCtor(); + recognition.continuous = true; + recognition.interimResults = true; + recognition.lang = "zh-CN"; + recognitionRef.current = recognition; + + recognition.onresult = (event) => { + const result = event.results[event.resultIndex]; + if (result.isFinal) { + const transcript = result[0].transcript.trim(); + if (transcript) { + sendText(transcript); + } + } else { + setInterimText(result[0].transcript); + } + }; + + recognition.onerror = (event) => { + // "no-speech" and "aborted" are expected during normal operation + if (event.error === "no-speech" || event.error === "aborted") return; + console.error("Speech recognition error:", event.error); + if (event.error === "not-allowed") { + setError("Microphone access denied. Please allow microphone access."); + } + }; + + recognition.onend = () => { + // Auto-restart if still in live mode and not muted + if (liveModeRef.current && !mutedRef.current) { + try { + recognition.start(); + } catch { + // May fail if already started + } + } + }; + + // Start listening + try { + recognition.start(); + } catch (err) { + console.error("Failed to start speech recognition:", err); + setError("Failed to start speech recognition."); + } + + // Set up AudioContext for visualizer + navigator.mediaDevices + .getUserMedia({ audio: true }) + .then((stream) => { + streamRef.current = stream; + const audioCtx = new AudioContext(); + audioCtxRef.current = audioCtx; + const source = audioCtx.createMediaStreamSource(stream); + const analyserNode = audioCtx.createAnalyser(); + analyserNode.fftSize = 256; + source.connect(analyserNode); + // Do NOT connect to destination — we don't want to hear our own mic + setAnalyser(analyserNode); + }) + .catch((err) => { + console.error("Microphone access for visualizer failed:", err); + // Non-fatal — visualizer just won't work + }); + + // Cleanup on unmount + return () => { + liveModeRef.current = false; + try { + recognition.stop(); + } catch { + // ignore + } + recognitionRef.current = null; + wsRef.current?.close(); + wsRef.current = null; + streamRef.current?.getTracks().forEach((t) => t.stop()); + audioCtxRef.current?.close(); + }; + }, [sendText]); + + // --------------------------------------------------------------------------- + // Mute / unmute + // --------------------------------------------------------------------------- + + const toggleMute = useCallback(() => { + setMuted((prev) => { + const next = !prev; + if (next) { + // Muting — stop recognition + try { + recognitionRef.current?.stop(); + } catch { + // ignore + } + } else { + // Unmuting — restart recognition + try { + recognitionRef.current?.start(); + } catch { + // ignore + } + } + return next; + }); + }, []); + + // --------------------------------------------------------------------------- + // Close session + // --------------------------------------------------------------------------- + + const handleClose = useCallback(() => { + liveModeRef.current = false; + try { + recognitionRef.current?.stop(); + } catch { + // ignore + } + wsRef.current?.close(); + onClose(); + }, [onClose]); + + // --------------------------------------------------------------------------- + // Status text + // --------------------------------------------------------------------------- + + const statusText = + error ?? + (state === "sending" + ? "Thinking..." + : muted + ? "Muted" + : "Listening..."); + + // --------------------------------------------------------------------------- + // Render + // --------------------------------------------------------------------------- + + return ( +
+ {/* Waveform visualizer */} + + + {/* Interim transcription or confirmed text */} +
+ {state === "sending" && finalText ? ( + + {finalText} + + ) : interimText ? ( + + {interimText} + + ) : null} +
+ + {/* Status text */} +
{statusText}
+ + {/* Control bar */} +
+ {/* Mute button */} + + + {/* End session button */} + +
+
+ ); +} diff --git a/web/src/pages/PiChat.tsx b/web/src/pages/PiChat.tsx index 2220b31a..63312284 100644 --- a/web/src/pages/PiChat.tsx +++ b/web/src/pages/PiChat.tsx @@ -34,7 +34,7 @@ import { createRaraStreamFn } from "@/adapters/rara-stream"; import { api } from "@/api/client"; import type { ChatSession, ChatMessageData } from "@/api/types"; import { useNavigate } from "react-router"; -import { VoiceRecorder } from "@/components/VoiceRecorder"; +import { LiveVoiceSession } from "@/components/LiveVoiceSession"; /** Strip `...` blocks from assistant text. */ function stripThinkTags(text: string): string { @@ -259,6 +259,7 @@ export default function PiChat() { const agentRef = useRef(null); const chatPanelRef = useRef(null); const [showSessionList, setShowSessionList] = useState(false); + const [liveMode, setLiveMode] = useState(false); const navigate = useNavigate(); /** Switch the agent to a different session, loading its history. */ @@ -405,15 +406,32 @@ export default function PiChat() { - {/* Voice recorder button — fixed top-right */} -
- agentRef.current?.sessionId} - onComplete={reloadMessages} - /> -
+ {/* Voice button — fixed top-right */} + {!liveMode && ( +
+ +
+ )} {/* Chat panel container */}
+ {/* Live voice panel — bottom overlay */} + {liveMode && ( + agentRef.current?.sessionId} + onTurnComplete={reloadMessages} + onClose={() => setLiveMode(false)} + /> + )} {/* Session list slide-over */} {showSessionList && ( Date: Mon, 13 Apr 2026 22:31:32 +0800 Subject: [PATCH 2/2] style(web): apply ElevenLabs design language to LiveVoiceSession (#1345) Rewrite the voice panel UI with ElevenLabs-inspired dark theme: - AudioVisualizer: larger canvas (400x80), 3px round-cap bars, state-based colors (zinc idle, blue hearing, emerald speaking), idle breathing animation - Voice panel: backdrop-blur-xl, multi-layer dark shadow, h-52 with centered flex layout and generous spacing - Pill-shaped controls: translucent mute (bg-white/5) and red-tinted end button - Status label: uppercase 11px tracked text - Interim text: italic muted with min-height to prevent layout jank - PiChat toggle: active state shows red pulsing dot + "LIVE" pill badge Closes #1345 --- web/src/components/LiveVoiceSession.tsx | 159 ++++++++++++++++++------ web/src/pages/PiChat.tsx | 19 ++- 2 files changed, 133 insertions(+), 45 deletions(-) diff --git a/web/src/components/LiveVoiceSession.tsx b/web/src/components/LiveVoiceSession.tsx index a08776ea..c247d2b1 100644 --- a/web/src/components/LiveVoiceSession.tsx +++ b/web/src/components/LiveVoiceSession.tsx @@ -68,9 +68,19 @@ type LiveVoiceSessionProps = { }; // --------------------------------------------------------------------------- -// Audio Visualizer (inline — replaces LiveKit Agents UI dependency) +// Audio Visualizer — ElevenLabs-inspired waveform bars // --------------------------------------------------------------------------- +/** Color palette keyed by voice state, adapted for dark theme. */ +const STATE_COLORS: Record = { + idle: "#525252", // zinc-600 — muted, ambient + sending: "#525252", // same gray, pulsing animation signals "thinking" + speaking: "#10b981", // emerald-500 — Rara is speaking +}; + +/** Blue highlight when user is actively speaking (mic input detected). */ +const HEARING_COLOR = "#3b82f6"; // blue-500 + function AudioVisualizer({ analyser, state, @@ -80,6 +90,8 @@ function AudioVisualizer({ }) { const canvasRef = useRef(null); const animFrameRef = useRef(0); + // Persist per-bar phase offsets for idle breathing animation + const phaseOffsetsRef = useRef(null); useEffect(() => { const canvas = canvasRef.current; @@ -87,8 +99,24 @@ function AudioVisualizer({ const ctx = canvas.getContext("2d"); if (!ctx) return; - const BAR_COUNT = 32; - const dataArray = analyser ? new Uint8Array(analyser.frequencyBinCount) : null; + const BAR_COUNT = 40; + const BAR_WIDTH = 3; + const BAR_GAP = 2; + const dataArray = analyser + ? new Uint8Array(analyser.frequencyBinCount) + : null; + + // Initialize stable random phase offsets (once) + if (!phaseOffsetsRef.current) { + phaseOffsetsRef.current = Array.from( + { length: BAR_COUNT }, + () => Math.random() * Math.PI * 2, + ); + } + const phaseOffsets = phaseOffsetsRef.current; + + // Sending state: soft pulse via opacity oscillation + let sendingPhase = 0; function draw() { if (!ctx || !canvas) return; @@ -96,36 +124,75 @@ function AudioVisualizer({ const h = canvas.height; ctx.clearRect(0, 0, w, h); - // Determine color based on state - const color = - state === "sending" - ? "rgba(156, 163, 175, 0.5)" // gray — thinking - : state === "speaking" - ? "rgba(34, 197, 94, 0.7)" // green — speaking - : "rgba(59, 130, 246, 0.6)"; // blue — listening - if (analyser && dataArray) { analyser.getByteFrequencyData(dataArray); } - const barWidth = w / BAR_COUNT - 2; + // Detect whether user is producing audio (hearing state) + let avgLevel = 0; + if (dataArray) { + let sum = 0; + for (let i = 0; i < dataArray.length; i++) sum += dataArray[i]; + avgLevel = sum / dataArray.length; + } + const isHearing = state === "idle" && avgLevel > 12; + + // Pick bar color + const color = isHearing ? HEARING_COLOR : STATE_COLORS[state]; + + // Sending pulse: oscillate global opacity + let globalAlpha = 1; + if (state === "sending") { + sendingPhase += 0.03; + globalAlpha = 0.4 + 0.3 * Math.sin(sendingPhase); + } + + const totalBarsWidth = BAR_COUNT * (BAR_WIDTH + BAR_GAP) - BAR_GAP; + const offsetX = (w - totalBarsWidth) / 2; const centerY = h / 2; + const now = performance.now() / 1000; // seconds + + ctx.lineCap = "round"; for (let i = 0; i < BAR_COUNT; i++) { // Map frequency bin to bar height const binIndex = dataArray ? Math.floor((i / BAR_COUNT) * dataArray.length) : 0; - const value = dataArray ? dataArray[binIndex] : 0; - // Minimum bar height for idle state - const barHeight = Math.max(2, (value / 255) * (h * 0.8)); + const rawValue = dataArray ? dataArray[binIndex] : 0; + + let barHeight: number; + if (state === "idle" && !isHearing) { + // Idle breathing: gentle sinusoidal per-bar undulation + const breath = + Math.sin(now * 1.2 + phaseOffsets[i]) * 0.5 + 0.5; // 0..1 + barHeight = 3 + breath * 6; // 3..9px — subtle + } else if (state === "sending") { + // Thinking: slow wave with moderate height + const wave = + Math.sin(now * 2 + (i / BAR_COUNT) * Math.PI * 2) * 0.5 + 0.5; + barHeight = 4 + wave * 14; + } else { + // Hearing or speaking: driven by audio data + barHeight = Math.max(3, (rawValue / 255) * (h * 0.85)); + } + + const x = offsetX + i * (BAR_WIDTH + BAR_GAP); - const x = i * (barWidth + 2) + 1; + ctx.globalAlpha = globalAlpha; ctx.fillStyle = color; - ctx.roundRect(x, centerY - barHeight / 2, barWidth, barHeight, 2); + ctx.beginPath(); + ctx.roundRect( + x, + centerY - barHeight / 2, + BAR_WIDTH, + barHeight, + BAR_WIDTH / 2, + ); ctx.fill(); } + ctx.globalAlpha = 1; animFrameRef.current = requestAnimationFrame(draw); } @@ -139,9 +206,9 @@ function AudioVisualizer({ return ( ); } @@ -154,6 +221,8 @@ function AudioVisualizer({ * Bottom voice panel for real-time voice conversation. * Uses Web Speech API for continuous speech-to-text, sends transcribed text * through the existing WebSocket chat API, and displays a waveform visualizer. + * + * UI styled after ElevenLabs design language, adapted for dark theme. */ export function LiveVoiceSession({ getSessionKey, @@ -387,62 +456,72 @@ export function LiveVoiceSession({ // Status text // --------------------------------------------------------------------------- - const statusText = + const statusLabel = error ?? (state === "sending" - ? "Thinking..." + ? "THINKING" : muted - ? "Muted" - : "Listening..."); + ? "MUTED" + : "LISTENING"); // --------------------------------------------------------------------------- - // Render + // Render — ElevenLabs-inspired dark voice panel // --------------------------------------------------------------------------- return ( -
- {/* Waveform visualizer */} +
+ {/* Waveform visualizer — the hero element */} {/* Interim transcription or confirmed text */} -
+
{state === "sending" && finalText ? ( - + {finalText} ) : interimText ? ( - + {interimText} ) : null}
- {/* Status text */} -
{statusText}
+ {/* Status label — uppercase, tracked, small */} +
+ {statusLabel} +
- {/* Control bar */} -
+ {/* Control bar — pill buttons */} +
{/* Mute button */} {/* End session button */}
diff --git a/web/src/pages/PiChat.tsx b/web/src/pages/PiChat.tsx index 63312284..eb91b5ff 100644 --- a/web/src/pages/PiChat.tsx +++ b/web/src/pages/PiChat.tsx @@ -406,9 +406,18 @@ export default function PiChat() { - {/* Voice button — fixed top-right */} - {!liveMode && ( -
+ {/* Voice toggle — fixed top-right */} +
+ {liveMode ? ( + + ) : ( -
- )} + )} +
{/* Chat panel container */}
{/* Live voice panel — bottom overlay */}