Hello @met4citizen, hope you're doing well. I wanted to ask: I'm receiving a streaming response from my backend via SSE (so the response time is much faster than waiting for the full batch LLM response) and I'm sending it to the Azure TTS service sentence by sentence. Does TalkingHead support this kind of streaming with Azure TTS? And how would the viseme logic be handled in this case?
This is the current logic:
async function azureSpeak(ssml) {
  if (!microsoftSynthesizer) {
    // Retrieve an authorization token and region from the backend
    const resp = await fetch(`${TOKEN_ENDPOINT}/speech/ms-token`);
    if (!resp.ok) throw new Error("Token fetch failed");
    const tokenData = await resp.json();
    const config = SpeechSDK.SpeechConfig.fromEndpoint(
      new URL(
        `wss://${tokenData.region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1`
      )
    );
    config.authorizationToken = tokenData.token;
    config.speechSynthesisVoiceName = voiceName;
    config.speechSynthesisOutputFormat =
      window.SpeechSDK.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm;
    // null AudioConfig: audio is handed to TalkingHead via streamAudio instead of being played by the SDK
    microsoftSynthesizer = new window.SpeechSDK.SpeechSynthesizer(config, null);
    // Handle the synthesis results
    microsoftSynthesizer.synthesizing = (s, e) => {
      switch (lipsyncType) {
        case "blendshapes":
          head.streamAudio({
            audio: e.result.audioData,
            anims: azureBlendShapes?.sbuffer.splice(
              0,
              azureBlendShapes?.sbuffer.length
            ),
          });
          break;
        case "visemes":
          head.streamAudio({
            audio: e.result.audioData,
            visemes: visemesbuffer.visemes.splice(0, visemesbuffer.visemes.length),
            vtimes: visemesbuffer.vtimes.splice(0, visemesbuffer.vtimes.length),
            vdurations: visemesbuffer.vdurations.splice(
              0,
              visemesbuffer.vdurations.length
            ),
          });
          break;
        case "words":
          head.streamAudio({
            audio: e.result.audioData,
            words: wordsbuffer.words.splice(0, wordsbuffer.words.length),
            wtimes: wordsbuffer.wtimes.splice(0, wordsbuffer.wtimes.length),
            wdurations: wordsbuffer.wdurations.splice(
              0,
              wordsbuffer.wdurations.length
            ),
          });
          break;
        default:
          console.error(`Unknown animation mode: ${lipsyncType}`);
      }
    };
    // Viseme handling (audioOffset is in 100-ns ticks, so /10000 converts to ms)
    microsoftSynthesizer.visemeReceived = (s, e) => {
      if (lipsyncType === "visemes") {
        const vtime = e.audioOffset / 10000;
        const viseme = visemeMap[e.visemeId];
        if (!head.isStreaming) return;
        if (prevViseme) {
          let vduration = vtime - prevViseme.vtime;
          if (vduration < 40) vduration = 40;
          visemesbuffer.visemes.push(prevViseme.viseme);
          visemesbuffer.vtimes.push(prevViseme.vtime);
          visemesbuffer.vdurations.push(vduration);
        }
        prevViseme = { viseme, vtime };
      } else if (lipsyncType === "blendshapes") {
        let animation = null;
        if (e?.animation && e.animation.trim() !== "") {
          try {
            animation = JSON.parse(e.animation);
          } catch (error) {
            console.error("Error parsing animation blendshapes:", error);
            return;
          }
        }
        if (!animation) return;
        const vs = {};
        AzureBlendshapeMap.forEach((mtName, i) => {
          vs[mtName] = animation.BlendShapes.map((frame) => frame[i]);
        });
        azureBlendShapes.sbuffer.push({
          name: "blendshapes",
          delay: (animation.FrameIndex * 1000) / 60,
          dt: Array.from({ length: animation.BlendShapes.length }, () => 1000 / 60),
          vs: vs,
        });
      }
    };
    // Process word boundaries and punctuation
    microsoftSynthesizer.wordBoundary = function (s, e) {
      const word = e.text;
      const time = e.audioOffset / 10000;
      const duration = e.duration / 10000;
      if (e.boundaryType === "PunctuationBoundary" && wordsbuffer.words.length) {
        wordsbuffer.words[wordsbuffer.words.length - 1] += word;
        wordsbuffer.wdurations[wordsbuffer.wdurations.length - 1] += duration;
      } else if (
        e.boundaryType === "WordBoundary" ||
        e.boundaryType === "PunctuationBoundary"
      ) {
        wordsbuffer.words.push(word);
        wordsbuffer.wtimes.push(time);
        wordsbuffer.wdurations.push(duration);
      }
    };
  }
  // Start stream speaking
  head.streamStart(
    { sampleRate: 48000, mood: "happy", gain: 0.5, lipsyncType: lipsyncType },
    () => {
      console.log("Audio playback started.");
      const subtitlesElement = document.getElementById("subtitles");
      subtitlesElement.textContent = "";
      subtitlesElement.style.display = "none";
      subtitlesElement.setAttribute("data-lines", 0);
      document.getElementById("btn-txt").textContent = "Playing...";
    },
    () => {
      console.log("Audio playback ended.");
      const subtitlesElement = document.getElementById("subtitles");
      const displayDuration = Math.max(2000, subtitlesElement.textContent.length * 50);
      setTimeout(() => {
        subtitlesElement.textContent = "";
        subtitlesElement.style.display = "none";
        // Reset all states here
        isProcessing = false;
        nodeSpeak.disabled = false;
        document.getElementById("btn-txt").textContent = "Ask";
        document.getElementById("speak").disabled = false;
        document.getElementById("text").value = "";
        currentAnswerForEmojis = null;
        emojiTextMap = null;
      }, displayDuration);
    },
    (subtitleText) => {
      console.log("subtitleText: ", subtitleText);
      // NEW: Check for emojis in the original text and play them when corresponding words are spoken
      // if (currentAnswerForEmojis) {
      checkAndPlayEmojisForSubtitle(subtitleText, currentAnswerForEmojis);
      // }
      const subtitlesElement = document.getElementById("subtitles");
      const currentText = subtitlesElement.textContent;
      const words = subtitleText.split(" ");
      const MAX_LINES = 2;
      let currentLines = parseInt(subtitlesElement.getAttribute("data-lines") || "0");
      subtitlesElement.style.display = "block";
      subtitlesElement.textContent += subtitleText;
      const styles = window.getComputedStyle(subtitlesElement);
      const lineHeight = parseInt(styles.lineHeight);
      const height = subtitlesElement.offsetHeight;
      const actualLines = Math.ceil(height / lineHeight);
      if (actualLines > MAX_LINES) {
        const allWords = subtitlesElement.textContent.split(" ");
        const removeCount = Math.ceil(allWords.length / 3);
        subtitlesElement.textContent = "... " + allWords.slice(removeCount).join(" ");
      }
      subtitlesElement.setAttribute("data-lines", actualLines.toString());
    }
  );
  // Perform TTS
  microsoftSynthesizer.speakSsmlAsync(
    ssml,
    (result) => {
      console.log("=== CALLBACK FIRED ===");
      console.log("Result object:", result);
      console.log("Result reason:", result.reason);
      console.log(
        "Result reason name:",
        window.SpeechSDK.ResultReason[result.reason]
      );
      if (result.reason === window.SpeechSDK.ResultReason.SynthesizingAudioCompleted) {
        if (lipsyncType === "visemes" && prevViseme) {
          const finalDuration = 100;
          visemesbuffer.visemes.push(prevViseme.viseme);
          visemesbuffer.vtimes.push(prevViseme.vtime);
          visemesbuffer.vdurations.push(finalDuration);
          prevViseme = null;
        }
        let speak = {};
        if (lipsyncType === "visemes" && visemesbuffer.visemes.length) {
          speak.visemes = visemesbuffer.visemes.splice(0, visemesbuffer.visemes.length);
          speak.vtimes = visemesbuffer.vtimes.splice(0, visemesbuffer.vtimes.length);
          speak.vdurations = visemesbuffer.vdurations.splice(
            0,
            visemesbuffer.vdurations.length
          );
        }
        if (lipsyncType === "blendshapes") {
          speak.anims = azureBlendShapes?.sbuffer.splice(
            0,
            azureBlendShapes?.sbuffer.length
          );
        }
        speak.words = wordsbuffer.words.splice(0, wordsbuffer.words.length);
        speak.wtimes = wordsbuffer.wtimes.splice(0, wordsbuffer.wtimes.length);
        speak.wdurations = wordsbuffer.wdurations.splice(0, wordsbuffer.wdurations.length);
        if (speak.visemes || speak.words || speak.anims) {
          speak.audio = new ArrayBuffer(0);
          head.streamAudio(speak);
        }
        head.streamNotifyEnd();
        resetLipsyncBuffers();
        console.log("Speech synthesis completed.");
      }
    },
    (error) => {
      console.error("Azure speech synthesis error:", error);
      resetLipsyncBuffers();
      currentAnswerForEmojis = null;
      emojiTextMap = null;
    }
  );
}
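
For the sentence-by-sentence part, this is a rough sketch of how I plan to drive azureSpeak() from the SSE stream. It assumes azureSpeak() is adapted to return a Promise that resolves inside the speakSsmlAsync success callback, and that the backend streams plain-text chunks; the /chat/stream endpoint and the naive sentence splitter are just placeholders.

// Queue sentences coming from the SSE stream and synthesize them one at a time,
// so each sentence's visemes/word boundaries stay aligned with its own audio.
const sentenceQueue = [];
let speaking = false;

async function drainQueue() {
  if (speaking) return;
  speaking = true;
  while (sentenceQueue.length) {
    const sentence = sentenceQueue.shift();
    // Note: the raw text should be XML-escaped before being embedded in SSML.
    const ssml = `<speak version="1.0" xml:lang="en-US"><voice name="${voiceName}">${sentence}</voice></speak>`;
    try {
      await azureSpeak(ssml); // assumes azureSpeak resolves when synthesis completes
    } catch (err) {
      console.error("Sentence synthesis failed:", err);
    }
  }
  speaking = false;
}

// SSE wiring (placeholder endpoint): buffer incoming text until a sentence boundary, then enqueue.
const es = new EventSource(`${TOKEN_ENDPOINT}/chat/stream`);
let textBuffer = "";
es.onmessage = (evt) => {
  textBuffer += evt.data;
  const parts = textBuffer.split(/(?<=[.!?])\s+/);
  textBuffer = parts.pop(); // keep the trailing, possibly incomplete sentence
  sentenceQueue.push(...parts.filter((s) => s.trim()));
  drainQueue();
};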