
LLM stream response #153

@3bdop

Description

Hello @met4citizen, hope you're doing well. I wanted to ask: I'm receiving a streaming response from my backend via SSE (so the response time is much faster than waiting for the full batch LLM response) and sending it sentence by sentence to the Azure TTS service. Does TalkingHead support this kind of streaming with Azure TTS, and how would the viseme logic be handled in this case?
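
On the client side the idea is roughly the following (a simplified sketch, not my exact code; the endpoint, the sentence-splitting regex and the SSML wrapper are placeholders). Each complete sentence triggers one call to azureSpeak() below; how to serialize those calls is part of my question (see the sketch after the code).

// Simplified sketch of the SSE consumer: buffer the streamed tokens, cut them
// into sentences, and hand each complete sentence to azureSpeak() below.
const SSE_ENDPOINT = "/api/chat/stream"; // placeholder URL
let pending = "";

const evtSource = new EventSource(SSE_ENDPOINT);
evtSource.onmessage = (event) => {
  pending += event.data;
  // Naive split on ., ! or ? followed by whitespace; the trailing fragment
  // stays buffered until the next chunk completes it.
  const parts = pending.split(/(?<=[.!?])\s+/);
  pending = parts.pop() ?? "";
  for (const sentence of parts) {
    const ssml = `<speak version="1.0" xml:lang="en-US"><voice name="${voiceName}">${sentence}</voice></speak>`;
    azureSpeak(ssml); // one TTS request per sentence
  }
};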

This is the current logic:

async function azureSpeak(ssml) {
    if (!microsoftSynthesizer) {
      // Retrieve an authorization token from the backend
      const resp = await fetch(`${TOKEN_ENDPOINT}/speech/ms-token`);
      if (!resp.ok) throw new Error("Token fetch failed");
      const tokenData = await resp.json();

      const config = SpeechSDK.SpeechConfig.fromEndpoint(
        new URL(
          `wss://${tokenData.region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1`
        )
      );
      config.authorizationToken = tokenData.token;
      config.speechSynthesisVoiceName = voiceName;
      config.speechSynthesisOutputFormat =
        window.SpeechSDK.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm;

      // Pass null as the audio config so the SDK does not play the audio
      // itself; TalkingHead handles playback via streamAudio()
      microsoftSynthesizer = new window.SpeechSDK.SpeechSynthesizer(
        config,
        null
      );

      // Handle the synthesis results
      microsoftSynthesizer.synthesizing = (s, e) => {
        switch (lipsyncType) {
          case "blendshapes":
            head.streamAudio({
              audio: e.result.audioData,
              anims: azureBlendShapes?.sbuffer.splice(
                0,
                azureBlendShapes?.sbuffer.length
              ),
            });
            break;
          case "visemes":
            head.streamAudio({
              audio: e.result.audioData,
              visemes: visemesbuffer.visemes.splice(
                0,
                visemesbuffer.visemes.length
              ),
              vtimes: visemesbuffer.vtimes.splice(
                0,
                visemesbuffer.vtimes.length
              ),
              vdurations: visemesbuffer.vdurations.splice(
                0,
                visemesbuffer.vdurations.length
              ),
            });
            break;
          case "words":
            head.streamAudio({
              audio: e.result.audioData,
              words: wordsbuffer.words.splice(0, wordsbuffer.words.length),
              wtimes: wordsbuffer.wtimes.splice(0, wordsbuffer.wtimes.length),
              wdurations: wordsbuffer.wdurations.splice(
                0,
                wordsbuffer.wdurations.length
              ),
            });
            break;
          default:
            console.error(`Unknown animation mode: ${lipsyncType}`);
        }
      };

      // Viseme handling
      microsoftSynthesizer.visemeReceived = (s, e) => {
        if (lipsyncType === "visemes") {
          const vtime = e.audioOffset / 10000;
          const viseme = visemeMap[e.visemeId];
          if (!head.isStreaming) return;
          if (prevViseme) {
            let vduration = vtime - prevViseme.vtime;
            if (vduration < 40) vduration = 40;
            visemesbuffer.visemes.push(prevViseme.viseme);
            visemesbuffer.vtimes.push(prevViseme.vtime);
            visemesbuffer.vdurations.push(vduration);
          }
          prevViseme = { viseme, vtime };
        } else if (lipsyncType === "blendshapes") {
          let animation = null;
          if (e?.animation && e.animation.trim() !== "") {
            try {
              animation = JSON.parse(e.animation);
            } catch (error) {
              console.error("Error parsing animation blendshapes:", error);
              return;
            }
          }
          if (!animation) return;
          const vs = {};
          AzureBlendshapeMap.forEach((mtName, i) => {
            vs[mtName] = animation.BlendShapes.map((frame) => frame[i]);
          });

          azureBlendShapes.sbuffer.push({
            name: "blendshapes",
            delay: (animation.FrameIndex * 1000) / 60,
            dt: Array.from(
              { length: animation.BlendShapes.length },
              () => 1000 / 60
            ),
            vs: vs,
          });
        }
      };

      // Process word boundaries and punctuations
      microsoftSynthesizer.wordBoundary = function (s, e) {
        const word = e.text;
        const time = e.audioOffset / 10000;
        const duration = e.duration / 10000;

        if (
          e.boundaryType === "PunctuationBoundary" &&
          wordsbuffer.words.length
        ) {
          wordsbuffer.words[wordsbuffer.words.length - 1] += word;
          wordsbuffer.wdurations[wordsbuffer.wdurations.length - 1] += duration;
        } else if (
          e.boundaryType === "WordBoundary" ||
          e.boundaryType === "PunctuationBoundary"
        ) {
          wordsbuffer.words.push(word);
          wordsbuffer.wtimes.push(time);
          wordsbuffer.wdurations.push(duration);
        }
      };
    }

    // Start stream speaking
    head.streamStart(
      { sampleRate: 48000, mood: "happy", gain: 0.5, lipsyncType: lipsyncType },
      () => {
        console.log("Audio playback started.");
        const subtitlesElement = document.getElementById("subtitles");
        subtitlesElement.textContent = "";
        subtitlesElement.style.display = "none";
        subtitlesElement.setAttribute("data-lines", 0);
        document.getElementById("btn-txt").textContent = "Playing...";
      },
      () => {
        console.log("Audio playback ended.");
        const subtitlesElement = document.getElementById("subtitles");
        const displayDuration = Math.max(
          2000,
          subtitlesElement.textContent.length * 50
        );
        setTimeout(() => {
          subtitlesElement.textContent = "";
          subtitlesElement.style.display = "none";

          // Reset all states here
          isProcessing = false;
          nodeSpeak.disabled = false;
          document.getElementById("btn-txt").textContent = "Ask";
          document.getElementById("speak").disabled = false;
          document.getElementById("text").value = "";
          currentAnswerForEmojis = null;
          emojiTextMap = null;
        }, displayDuration);
      },
      (subtitleText) => {
        console.log("subtitleText: ", subtitleText);

        // Check for emojis in the original text and play them when the
        // corresponding words are spoken
        checkAndPlayEmojisForSubtitle(subtitleText, currentAnswerForEmojis);

        const subtitlesElement = document.getElementById("subtitles");
        const currentText = subtitlesElement.textContent;
        const words = subtitleText.split(" ");
        const MAX_LINES = 2;

        let currentLines = parseInt(
          subtitlesElement.getAttribute("data-lines") || "0"
        );

        subtitlesElement.style.display = "block";
        subtitlesElement.textContent += subtitleText;

        const styles = window.getComputedStyle(subtitlesElement);
        const lineHeight = parseInt(styles.lineHeight);
        const height = subtitlesElement.offsetHeight;
        const actualLines = Math.ceil(height / lineHeight);

        if (actualLines > MAX_LINES) {
          const allWords = subtitlesElement.textContent.split(" ");
          const removeCount = Math.ceil(allWords.length / 3);
          subtitlesElement.textContent =
            "... " + allWords.slice(removeCount).join(" ");
        }

        subtitlesElement.setAttribute("data-lines", actualLines.toString());
      }
    );

    // Perform TTS
    microsoftSynthesizer.speakSsmlAsync(
      ssml,
      (result) => {
        console.log("=== CALLBACK FIRED ===");
        console.log("Result object:", result);
        console.log("Result reason:", result.reason);
        console.log(
          "Result reason name:",
          window.SpeechSDK.ResultReason[result.reason]
        );
        if (
          result.reason ===
          window.SpeechSDK.ResultReason.SynthesizingAudioCompleted
        ) {
          if (lipsyncType === "visemes" && prevViseme) {
            const finalDuration = 100;
            visemesbuffer.visemes.push(prevViseme.viseme);
            visemesbuffer.vtimes.push(prevViseme.vtime);
            visemesbuffer.vdurations.push(finalDuration);
            prevViseme = null;
          }

          let speak = {};

          if (lipsyncType === "visemes" && visemesbuffer.visemes.length) {
            speak.visemes = visemesbuffer.visemes.splice(
              0,
              visemesbuffer.visemes.length
            );
            speak.vtimes = visemesbuffer.vtimes.splice(
              0,
              visemesbuffer.vtimes.length
            );
            speak.vdurations = visemesbuffer.vdurations.splice(
              0,
              visemesbuffer.vdurations.length
            );
          }
          if (lipsyncType === "blendshapes") {
            speak.anims = azureBlendShapes?.sbuffer.splice(
              0,
              azureBlendShapes?.sbuffer.length
            );
          }

          speak.words = wordsbuffer.words.splice(0, wordsbuffer.words.length);
          speak.wtimes = wordsbuffer.wtimes.splice(
            0,
            wordsbuffer.wtimes.length
          );
          speak.wdurations = wordsbuffer.wdurations.splice(
            0,
            wordsbuffer.wdurations.length
          );

          if (speak.visemes || speak.words || speak.anims) {
            speak.audio = new ArrayBuffer(0);
            head.streamAudio(speak);
          }

          head.streamNotifyEnd();
          resetLipsyncBuffers();

          console.log("Speech synthesis completed.");
        }
      },
      (error) => {
        console.error("Azure speech synthesis error:", error);
        resetLipsyncBuffers();
        currentAnswerForEmojis = null;
        emojiTextMap = null;
      }
    );
  }
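
To avoid the visemeReceived / wordBoundary callbacks of different sentences interleaving in the shared buffers, my current idea is to serialize the per-sentence synthesis calls, roughly like the sketch below (names are placeholders, and the success callback would contain the completion logic from azureSpeak above). I also notice head.streamStart() currently runs on every azureSpeak() call; I assume that for sentence-by-sentence streaming it should run once per answer, with streamNotifyEnd() once at the very end, but I'm not sure.

// Sketch: wrap speakSsmlAsync() in a promise and synthesize sentences one at
// a time, so the shared viseme/word buffers only ever hold data for the
// sentence currently being synthesized.
const sentenceQueue = [];
let ttsBusy = false;

function speakSentence(ssml) {
  return new Promise((resolve, reject) => {
    microsoftSynthesizer.speakSsmlAsync(
      ssml,
      (result) => resolve(result), // completion logic from above would go here
      (error) => reject(error)
    );
  });
}

async function enqueueSentence(ssml) {
  sentenceQueue.push(ssml);
  if (ttsBusy) return; // a drain loop is already running
  ttsBusy = true;
  while (sentenceQueue.length) {
    try {
      await speakSentence(sentenceQueue.shift());
    } catch (error) {
      console.error("Azure speech synthesis error:", error);
      resetLipsyncBuffers();
    }
  }
  ttsBusy = false;
}

Would that be the intended way to handle the viseme logic with streamed sentences, or does TalkingHead expect something different here?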
