diff --git a/src/serve/demo.html b/src/serve/demo.html index 828412d..d896601 100644 --- a/src/serve/demo.html +++ b/src/serve/demo.html @@ -336,6 +336,8 @@

kotoba — voice chat

/** The SpeechRecognition instance. Recreated per listening session * because some browsers don't cleanly reuse it after errors. */ let recognition = null; +/** Whether recognition auto-restart is currently desired. */ +let recognitionShouldRun = false; /** AbortController for the in-flight LLM fetch, so user interruptions * can cancel it immediately. */ @@ -370,6 +372,13 @@

kotoba — voice chat

let interimEl = null; /** Monotonic turn id used to invalidate stale async turn continuations. */ let activeTurnId = 0; +/** Assistant text visible to the user in the current turn, used for + * echo-guard heuristics during barge-in detection. */ +let currentAssistantText = ''; +/** Timestamp when we entered speaking state for the current turn. */ +let speakingStartedAtMs = 0; +/** Timestamp when assistant playback finished in the previous turn. */ +let lastAssistantFinishedAtMs = 0; // --------------------------------------------------------------------------- // UI helpers @@ -700,6 +709,24 @@

kotoba — voice chat

return text.replace(/<think[^>]*>[\s\S]*?(?:<\/think>|$)/gi, ''); } +// Normalize text for fuzzy echo comparison between what the assistant just +// said and what SpeechRecognition captured while speaking. +function normalizeForEchoCompare(text) { + return text + .toLowerCase() + .replace(/[\s\u3000]/g, '') + .replace(/[,。!?、,.!?:;"'“”‘’()()\-]/g, ''); +} + +// Heuristic: if the recognized fragment is mostly contained in the current +// assistant reply, it's likely speaker echo rather than a real interruption. +function isLikelyAssistantEcho(recognizedText) { + const heard = normalizeForEchoCompare(recognizedText); + const spoken = normalizeForEchoCompare(currentAssistantText); + if (!heard || heard.length < 2 || !spoken) return false; + return spoken.includes(heard); +} + // --------------------------------------------------------------------------- // LLM streaming // --------------------------------------------------------------------------- @@ -793,6 +820,7 @@

kotoba — voice chat

// stream and derive the visible/speakable text incrementally. let rawText = ''; let visibleText = ''; + currentAssistantText = ''; // Start (or reuse) the TTS socket for this turn. resetTtsDrainState(); @@ -812,9 +840,12 @@

kotoba — voice chat

// First visible token = transition to speaking and arm barge-in. if (state === State.THINKING) { setState(State.SPEAKING, 'speaking…'); + speakingStartedAtMs = Date.now(); + recognitionShouldRun = true; startRecognition(); } currentAssistantEl.textContent += visibleDelta; + currentAssistantText += visibleDelta; el.conversation.scrollTop = el.conversation.scrollHeight; textBuffer += visibleDelta; @@ -839,8 +870,11 @@

kotoba — voice chat

showBanner('LLM unreachable, using echo mode'); fullText = `「${userText}」と聞きました。`; currentAssistantEl.textContent = fullText; + currentAssistantText = fullText; textBuffer = fullText; setState(State.SPEAKING, 'speaking…'); + speakingStartedAtMs = Date.now(); + recognitionShouldRun = true; startRecognition(); } @@ -870,9 +904,11 @@

kotoba — voice chat

// has no single "all done" event across multiple buffers. await waitForPlaybackToDrain(); if (turnId !== activeTurnId || !running) return; + lastAssistantFinishedAtMs = Date.now(); if (running) { setState(State.LISTENING, 'listening…'); + recognitionShouldRun = true; startRecognition(); } } @@ -928,11 +964,8 @@

kotoba — voice chat

}; rec.onspeechstart = () => { - // Barge-in: if the assistant is currently talking, cut it off. - if (state === State.SPEAKING) { - interrupt(); - setState(State.LISTENING, 'listening… (interrupted)'); - } + // We do not interrupt immediately on speechstart because speaker echo + // can trigger false positives. Interruption is confirmed on final text. }; rec.onresult = (ev) => { @@ -944,15 +977,25 @@

kotoba — voice chat

else interim += res[0].transcript; } if (interim) showInterim(interim); - if (finalText.trim()) { - // Some engines may deliver final text without firing `onspeechstart`. + const final = finalText.trim(); + if (final) { if (state === State.SPEAKING) { + // Guard against echo loops: assistant audio can be picked up by the + // microphone and re-recognized as user speech. + if (isLikelyAssistantEcho(final)) return; + // Ignore ultra-early short fragments right after speaking starts. + const speakingMs = Date.now() - speakingStartedAtMs; + if (speakingMs < 350 && final.length < 4) return; interrupt(); setState(State.LISTENING, 'listening… (interrupted)'); + } else if (state === State.LISTENING) { + // Ignore echoes of assistant text that leak into recognition. + if (isLikelyAssistantEcho(final)) return; } clearInterim(); + recognitionShouldRun = false; stopRecognition(); - handleUserTurn(finalText.trim()).catch((e) => console.error(e)); + handleUserTurn(final).catch((e) => console.error(e)); } }; @@ -965,7 +1008,11 @@

kotoba — voice chat

rec.onend = () => { // Auto-restart while active so barge-in keeps working during speaking. - if (running && (state === State.LISTENING || state === State.SPEAKING)) { + if ( + running && + recognitionShouldRun && + (state === State.LISTENING || state === State.SPEAKING) + ) { try { rec.start(); } catch (_) { /* start() throws if already started */ } } @@ -1012,11 +1059,13 @@

kotoba — voice chat

el.startBtn.disabled = true; el.stopBtn.disabled = false; setState(State.LISTENING, 'listening…'); + recognitionShouldRun = true; startRecognition(); }); el.stopBtn.addEventListener('click', () => { running = false; + recognitionShouldRun = false; interrupt(); stopRecognition(); recognition = null; diff --git a/src/serve/tests.rs b/src/serve/tests.rs index 3bfcf9d..1a04c0d 100644 --- a/src/serve/tests.rs +++ b/src/serve/tests.rs @@ -243,6 +243,8 @@ async fn demo_returns_html_page() { assert!(!body.contains("socketToClose?.close")); // Regression: strip model `<think>` blocks before rendering/TTS. assert!(body.contains("stripThinkingBlocks")); + // Regression: keep speaking-stage barge-in from echo-looping. + assert!(body.contains("isLikelyAssistantEcho")); } // ---------------------------------------------------------------------------