Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 39 additions & 9 deletions src/serve/demo.html
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,8 @@ <h1>kotoba — voice chat</h1>

/** The DOM element that previews interim (not-yet-final) STT text. */
let interimEl = null;
/** Monotonic turn id used to invalidate stale async turn continuations. */
let activeTurnId = 0;

// ---------------------------------------------------------------------------
// UI helpers
Expand Down Expand Up @@ -692,6 +694,12 @@ <h1>kotoba — voice chat</h1>
return { sentences, remainder };
}

// Remove model reasoning wrapped in `<think>...</think>` from text before it
// is shown in the UI or forwarded to TTS.
//
// Three shapes are removed (tag names are matched case-insensitively):
//   1. a complete block:            `<think ...> ... </think>`
//   2. a block that is still open:  `<think ...> ...` up to end of input
//   3. a trailing, still-incomplete opening tag at the very end of the input:
//      `<`, `<t`, ..., `<think`, `<think attr="x"` (no closing `>` yet)
//
// Case 3 matters when this function is re-run on a growing stream buffer:
// without it, a chunk boundary that falls inside the opening tag would
// briefly expose `<thi` as visible text, and the stripped output would then
// shrink once the tag completes. Holding back the ambiguous tail keeps the
// stripped output append-only: each result is a prefix of the next one.
//
// Tags such as `<thinking>` are left untouched because of the `\b` after
// `think`.
//
// @param {string} text Raw model output (possibly a partial stream).
// @returns {string} `text` with reasoning blocks and any dangling partial
//   opening tag removed.
function stripThinkingBlocks(text) {
  // Alternative 1: complete or unclosed block (lazy body, ends at `</think>`
  // or end of input). Alternative 2: any proper prefix of an opening tag that
  // runs into the end of the input. Without the `m` flag `$` only matches at
  // the very end, so a `<` in the middle of the text is never affected.
  const reasoningPattern =
    /<think\b[^>]*>[\s\S]*?(?:<\/think>|$)|<(?:t(?:h(?:i(?:n(?:k(?:\b[^>]*)?)?)?)?)?)?$/gi;
  return text.replace(reasoningPattern, '');
}

// ---------------------------------------------------------------------------
// LLM streaming
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -768,6 +776,7 @@ <h1>kotoba — voice chat</h1>

async function handleUserTurn(userText) {
if (!running) return;
const turnId = ++activeTurnId;
setState(State.THINKING, 'thinking…');

// Push user message to history + UI.
Expand All @@ -780,6 +789,10 @@ <h1>kotoba — voice chat</h1>

// Running buffer used by the sentence splitter.
let textBuffer = '';
// Raw model text can include `<think>...</think>` blocks. We keep a raw
// stream and derive the visible/speakable text incrementally.
let rawText = '';
let visibleText = '';

// Start (or reuse) the TTS socket for this turn.
resetTtsDrainState();
Expand All @@ -789,23 +802,32 @@ <h1>kotoba — voice chat</h1>
ttsWs = openTtsSocket();

const onToken = (delta) => {
if (!running) return;
// First token = transition to speaking as soon as we actually send audio.
if (!running || turnId !== activeTurnId) return;
rawText += delta;
const nextVisible = stripThinkingBlocks(rawText);
const visibleDelta = nextVisible.slice(visibleText.length);
if (!visibleDelta) return;
visibleText = nextVisible;

// First visible token = transition to speaking and arm barge-in.
if (state === State.THINKING) {
setState(State.SPEAKING, 'speaking…');
startRecognition();
}
currentAssistantEl.textContent += delta;
currentAssistantEl.textContent += visibleDelta;
el.conversation.scrollTop = el.conversation.scrollHeight;

textBuffer += delta;
textBuffer += visibleDelta;
const { sentences, remainder } = extractSentences(textBuffer);
textBuffer = remainder;
for (const s of sentences) sendSentenceToTts(s);
};

let fullText;
let fullText = '';
try {
fullText = await streamLlm(userText, onToken);
await streamLlm(userText, onToken);
if (turnId !== activeTurnId || !running) return;
fullText = visibleText;
hideBanner();
} catch (err) {
if (err.name === 'AbortError') {
Expand All @@ -819,6 +841,7 @@ <h1>kotoba — voice chat</h1>
currentAssistantEl.textContent = fullText;
textBuffer = fullText;
setState(State.SPEAKING, 'speaking…');
startRecognition();
}

// Flush any trailing partial sentence.
Expand All @@ -830,6 +853,7 @@ <h1>kotoba — voice chat</h1>
// Wait until all sentence-level TTS requests receive terminal server
// responses before closing the socket.
await waitForTtsDrain();
if (turnId !== activeTurnId || !running) return;
if (ttsWs) {
try { ttsWs.close(); } catch (_) { /* ignore */ }
ttsWs = null;
Expand All @@ -845,6 +869,7 @@ <h1>kotoba — voice chat</h1>
// to listening. We poll the playback queue because AudioBuffer playback
// has no single "all done" event across multiple buffers.
await waitForPlaybackToDrain();
if (turnId !== activeTurnId || !running) return;

if (running) {
setState(State.LISTENING, 'listening…');
Expand All @@ -868,6 +893,7 @@ <h1>kotoba — voice chat</h1>
// ---------------------------------------------------------------------------

function interrupt() {
activeTurnId += 1;
// Stop any in-flight LLM fetch.
if (llmAbort) {
try { llmAbort.abort(); } catch (_) { /* ignore */ }
Expand Down Expand Up @@ -919,6 +945,11 @@ <h1>kotoba — voice chat</h1>
}
if (interim) showInterim(interim);
if (finalText.trim()) {
// Some engines may deliver final text without firing `onspeechstart`.
if (state === State.SPEAKING) {
interrupt();
setState(State.LISTENING, 'listening… (interrupted)');
}
clearInterim();
stopRecognition();
handleUserTurn(finalText.trim()).catch((e) => console.error(e));
Expand All @@ -933,9 +964,8 @@ <h1>kotoba — voice chat</h1>
};

rec.onend = () => {
// Auto-restart while we're still in listening mode so the user
// can keep talking without pressing anything.
if (running && state === State.LISTENING) {
// Auto-restart while active so barge-in keeps working during speaking.
if (running && (state === State.LISTENING || state === State.SPEAKING)) {
try { rec.start(); }
catch (_) { /* start() throws if already started */ }
}
Expand Down
2 changes: 2 additions & 0 deletions src/serve/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,8 @@ async fn demo_returns_html_page() {
// Regression: do not close the TTS socket on a fixed timer.
assert!(body.contains("waitForTtsDrain"));
assert!(!body.contains("socketToClose?.close"));
// Regression: strip model `<think>` blocks before rendering/TTS.
assert!(body.contains("stripThinkingBlocks"));
}

// ---------------------------------------------------------------------------
Expand Down
Loading