Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 58 additions & 9 deletions src/serve/demo.html
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,8 @@ <h1>kotoba — voice chat</h1>
/** The SpeechRecognition instance. Recreated per listening session
* because some browsers don't cleanly reuse it after errors. */
let recognition = null;
/** Whether recognition auto-restart is currently desired. */
let recognitionShouldRun = false;

/** AbortController for the in-flight LLM fetch, so user interruptions
* can cancel it immediately. */
Expand Down Expand Up @@ -370,6 +372,13 @@ <h1>kotoba — voice chat</h1>
let interimEl = null;
/** Monotonic turn id used to invalidate stale async turn continuations. */
let activeTurnId = 0;
/** Assistant text visible to the user in the current turn, used for
* echo-guard heuristics during barge-in detection. */
let currentAssistantText = '';
/** Timestamp when we entered speaking state for the current turn. */
let speakingStartedAtMs = 0;
/** Timestamp when assistant playback finished in the previous turn. */
let lastAssistantFinishedAtMs = 0;

// ---------------------------------------------------------------------------
// UI helpers
Expand Down Expand Up @@ -700,6 +709,24 @@ <h1>kotoba — voice chat</h1>
return text.replace(/<think\b[^>]*>[\s\S]*?(?:<\/think>|$)/gi, '');
}

/**
 * Normalize text for fuzzy echo comparison between what the assistant just
 * said and what SpeechRecognition captured while speaking.
 *
 * Steps:
 *  1. NFKC-normalize so full-width / half-width variants of the same
 *     character (e.g. full-width Latin letters, full-width "!" and "?")
 *     compare equal.
 *  2. Lowercase.
 *  3. Drop every whitespace, punctuation and symbol character. Using Unicode
 *     categories instead of a hand-maintained list also covers the corner
 *     brackets 「」 that the echo-mode fallback reply wraps around the user
 *     text, which a fixed ASCII/CJK list easily misses.
 *
 * @param {string} text - Raw transcript or assistant reply text.
 * @returns {string} Comparison key; empty string for null/undefined/empty input.
 */
function normalizeForEchoCompare(text) {
  // Be forgiving about missing input: callers only ever test the result for
  // emptiness or substring containment, so '' is the natural neutral value.
  const source = text == null ? '' : String(text);
  return source
    .normalize('NFKC')
    .toLowerCase()
    // \s already includes the ideographic space U+3000.
    .replace(/[\s\p{P}\p{S}]/gu, '');
}

/**
 * Echo heuristic for barge-in detection.
 *
 * Both the recognized fragment and the assistant text of the current turn
 * are passed through `normalizeForEchoCompare`; the fragment is treated as
 * speaker echo when its normalized form (at least two characters long)
 * occurs verbatim inside the normalized assistant text.
 *
 * @param {string} recognizedText - Transcript delivered by SpeechRecognition.
 * @returns {boolean} `true` when the fragment looks like assistant echo.
 */
function isLikelyAssistantEcho(recognizedText) {
  const fragment = normalizeForEchoCompare(recognizedText);
  const reply = normalizeForEchoCompare(currentAssistantText);

  // Too little material on either side to make a meaningful comparison.
  const comparable = fragment.length >= 2 && reply.length > 0;
  if (!comparable) {
    return false;
  }

  return reply.includes(fragment);
}

// ---------------------------------------------------------------------------
// LLM streaming
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -793,6 +820,7 @@ <h1>kotoba — voice chat</h1>
// stream and derive the visible/speakable text incrementally.
let rawText = '';
let visibleText = '';
currentAssistantText = '';

// Start (or reuse) the TTS socket for this turn.
resetTtsDrainState();
Expand All @@ -812,9 +840,12 @@ <h1>kotoba — voice chat</h1>
// First visible token = transition to speaking and arm barge-in.
if (state === State.THINKING) {
setState(State.SPEAKING, 'speaking…');
speakingStartedAtMs = Date.now();
recognitionShouldRun = true;
startRecognition();
}
currentAssistantEl.textContent += visibleDelta;
currentAssistantText += visibleDelta;
el.conversation.scrollTop = el.conversation.scrollHeight;

textBuffer += visibleDelta;
Expand All @@ -839,8 +870,11 @@ <h1>kotoba — voice chat</h1>
showBanner('LLM unreachable, using echo mode');
fullText = `「${userText}」と聞きました。`;
currentAssistantEl.textContent = fullText;
currentAssistantText = fullText;
textBuffer = fullText;
setState(State.SPEAKING, 'speaking…');
speakingStartedAtMs = Date.now();
recognitionShouldRun = true;
startRecognition();
}

Expand Down Expand Up @@ -870,9 +904,11 @@ <h1>kotoba — voice chat</h1>
// has no single "all done" event across multiple buffers.
await waitForPlaybackToDrain();
if (turnId !== activeTurnId || !running) return;
lastAssistantFinishedAtMs = Date.now();

if (running) {
setState(State.LISTENING, 'listening…');
recognitionShouldRun = true;
startRecognition();
}
}
Expand Down Expand Up @@ -928,11 +964,8 @@ <h1>kotoba — voice chat</h1>
};

rec.onspeechstart = () => {
// Barge-in: if the assistant is currently talking, cut it off.
if (state === State.SPEAKING) {
interrupt();
setState(State.LISTENING, 'listening… (interrupted)');
}
// We do not interrupt immediately on speechstart because speaker echo
// can trigger false positives. Interruption is confirmed on final text.
};

rec.onresult = (ev) => {
Expand All @@ -944,15 +977,25 @@ <h1>kotoba — voice chat</h1>
else interim += res[0].transcript;
}
if (interim) showInterim(interim);
if (finalText.trim()) {
// Some engines may deliver final text without firing `onspeechstart`.
const final = finalText.trim();
if (final) {
if (state === State.SPEAKING) {
// Guard against echo loops: assistant audio can be picked up by the
// microphone and re-recognized as user speech.
if (isLikelyAssistantEcho(final)) return;
// Ignore ultra-early short fragments right after speaking starts.
const speakingMs = Date.now() - speakingStartedAtMs;
if (speakingMs < 350 && final.length < 4) return;
interrupt();
setState(State.LISTENING, 'listening… (interrupted)');
} else if (state === State.LISTENING) {
// Ignore echoes of assistant text that leak into recognition.
if (isLikelyAssistantEcho(final)) return;
}
clearInterim();
recognitionShouldRun = false;
stopRecognition();
handleUserTurn(finalText.trim()).catch((e) => console.error(e));
handleUserTurn(final).catch((e) => console.error(e));
}
};

Expand All @@ -965,7 +1008,11 @@ <h1>kotoba — voice chat</h1>

rec.onend = () => {
// Auto-restart while active so barge-in keeps working during speaking.
if (running && (state === State.LISTENING || state === State.SPEAKING)) {
if (
running &&
recognitionShouldRun &&
(state === State.LISTENING || state === State.SPEAKING)
) {
try { rec.start(); }
catch (_) { /* start() throws if already started */ }
}
Expand Down Expand Up @@ -1012,11 +1059,13 @@ <h1>kotoba — voice chat</h1>
el.startBtn.disabled = true;
el.stopBtn.disabled = false;
setState(State.LISTENING, 'listening…');
recognitionShouldRun = true;
startRecognition();
});

el.stopBtn.addEventListener('click', () => {
running = false;
recognitionShouldRun = false;
interrupt();
stopRecognition();
recognition = null;
Expand Down
2 changes: 2 additions & 0 deletions src/serve/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,8 @@ async fn demo_returns_html_page() {
assert!(!body.contains("socketToClose?.close"));
// Regression: strip model `<think>` blocks before rendering/TTS.
assert!(body.contains("stripThinkingBlocks"));
// Regression: keep speaking-stage barge-in from echo-looping.
assert!(body.contains("isLikelyAssistantEcho"));
}

// ---------------------------------------------------------------------------
Expand Down
Loading