
LLM stream response #153

@3bdop

Description

Hello @met4citizen, hope you're doing well. I wanted to ask: I'm receiving a streaming response from my backend via SSE (so the response time is much faster than waiting for the full batch LLM response) and sending it sentence by sentence to the Azure TTS service. Does TalkingHead support this kind of streaming with Azure TTS, and how would the viseme logic be handled in this case?
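
On the client side the idea is roughly the following (a simplified sketch, not my exact code; the endpoint, the sentence-splitting regex and the SSML wrapper are placeholders). Each complete sentence triggers one call to azureSpeak() below; how to serialize those calls is part of my question (see the sketch after the code).

// Simplified sketch of the SSE consumer: buffer the streamed tokens, cut them
// into sentences, and hand each complete sentence to azureSpeak() below.
const SSE_ENDPOINT = "/api/chat/stream"; // placeholder URL
let pending = "";

const evtSource = new EventSource(SSE_ENDPOINT);
evtSource.onmessage = (event) => {
  pending += event.data;
  // Naive split on ., ! or ? followed by whitespace; the trailing fragment
  // stays buffered until the next chunk completes it.
  const parts = pending.split(/(?<=[.!?])\s+/);
  pending = parts.pop() ?? "";
  for (const sentence of parts) {
    const ssml = `<speak version="1.0" xml:lang="en-US"><voice name="${voiceName}">${sentence}</voice></speak>`;
    azureSpeak(ssml); // one TTS request per sentence
  }
};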

This is the current logic:

async function azureSpeak(ssml) {
    if (!microsoftSynthesizer) {
      // Retrieve an authorization token from the backend
      const resp = await fetch(`${TOKEN_ENDPOINT}/speech/ms-token`);
      if (!resp.ok) throw new Error("Token fetch failed");
      const tokenData = await resp.json();

      const config = SpeechSDK.SpeechConfig.fromEndpoint(
        new URL(
          `wss://${tokenData.region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1`
        )
      );
      config.authorizationToken = tokenData.token;
      config.speechSynthesisVoiceName = voiceName;
      config.speechSynthesisOutputFormat =
        window.SpeechSDK.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm;

      // Pass null as the audio config so the SDK does not play the audio
      // itself; TalkingHead handles playback via streamAudio()
      microsoftSynthesizer = new window.SpeechSDK.SpeechSynthesizer(
        config,
        null
      );

      // Handle the synthesis results
      microsoftSynthesizer.synthesizing = (s, e) => {
        switch (lipsyncType) {
          case "blendshapes":
            head.streamAudio({
              audio: e.result.audioData,
              anims: azureBlendShapes?.sbuffer.splice(
                0,
                azureBlendShapes?.sbuffer.length
              ),
            });
            break;
          case "visemes":
            head.streamAudio({
              audio: e.result.audioData,
              visemes: visemesbuffer.visemes.splice(
                0,
                visemesbuffer.visemes.length
              ),
              vtimes: visemesbuffer.vtimes.splice(
                0,
                visemesbuffer.vtimes.length
              ),
              vdurations: visemesbuffer.vdurations.splice(
                0,
                visemesbuffer.vdurations.length
              ),
            });
            break;
          case "words":
            head.streamAudio({
              audio: e.result.audioData,
              words: wordsbuffer.words.splice(0, wordsbuffer.words.length),
              wtimes: wordsbuffer.wtimes.splice(0, wordsbuffer.wtimes.length),
              wdurations: wordsbuffer.wdurations.splice(
                0,
                wordsbuffer.wdurations.length
              ),
            });
            break;
          default:
            console.error(`Unknown animation mode: ${lipsyncType}`);
        }
      };

      // Viseme handling
      microsoftSynthesizer.visemeReceived = (s, e) => {
        if (lipsyncType === "visemes") {
          const vtime = e.audioOffset / 10000;
          const viseme = visemeMap[e.visemeId];
          if (!head.isStreaming) return;
          if (prevViseme) {
            let vduration = vtime - prevViseme.vtime;
            if (vduration < 40) vduration = 40;
            visemesbuffer.visemes.push(prevViseme.viseme);
            visemesbuffer.vtimes.push(prevViseme.vtime);
            visemesbuffer.vdurations.push(vduration);
          }
          prevViseme = { viseme, vtime };
        } else if (lipsyncType === "blendshapes") {
          let animation = null;
          if (e?.animation && e.animation.trim() !== "") {
            try {
              animation = JSON.parse(e.animation);
            } catch (error) {
              console.error("Error parsing animation blendshapes:", error);
              return;
            }
          }
          if (!animation) return;
          const vs = {};
          AzureBlendshapeMap.forEach((mtName, i) => {
            vs[mtName] = animation.BlendShapes.map((frame) => frame[i]);
          });

          azureBlendShapes.sbuffer.push({
            name: "blendshapes",
            delay: (animation.FrameIndex * 1000) / 60,
            dt: Array.from(
              { length: animation.BlendShapes.length },
              () => 1000 / 60
            ),
            vs: vs,
          });
        }
      };

      // Process word boundaries and punctuations
      microsoftSynthesizer.wordBoundary = function (s, e) {
        const word = e.text;
        const time = e.audioOffset / 10000;
        const duration = e.duration / 10000;

        if (
          e.boundaryType === "PunctuationBoundary" &&
          wordsbuffer.words.length
        ) {
          wordsbuffer.words[wordsbuffer.words.length - 1] += word;
          wordsbuffer.wdurations[wordsbuffer.wdurations.length - 1] += duration;
        } else if (
          e.boundaryType === "WordBoundary" ||
          e.boundaryType === "PunctuationBoundary"
        ) {
          wordsbuffer.words.push(word);
          wordsbuffer.wtimes.push(time);
          wordsbuffer.wdurations.push(duration);
        }
      };
    }

    // Start stream speaking
    head.streamStart(
      { sampleRate: 48000, mood: "happy", gain: 0.5, lipsyncType: lipsyncType },
      () => {
        console.log("Audio playback started.");
        const subtitlesElement = document.getElementById("subtitles");
        subtitlesElement.textContent = "";
        subtitlesElement.style.display = "none";
        subtitlesElement.setAttribute("data-lines", 0);
        document.getElementById("btn-txt").textContent = "Playing...";
      },
      () => {
        console.log("Audio playback ended.");
        const subtitlesElement = document.getElementById("subtitles");
        const displayDuration = Math.max(
          2000,
          subtitlesElement.textContent.length * 50
        );
        setTimeout(() => {
          subtitlesElement.textContent = "";
          subtitlesElement.style.display = "none";

          // Reset all states here
          isProcessing = false;
          nodeSpeak.disabled = false;
          document.getElementById("btn-txt").textContent = "Ask";
          document.getElementById("speak").disabled = false;
          document.getElementById("text").value = "";
          currentAnswerForEmojis = null;
          emojiTextMap = null;
        }, displayDuration);
      },
      (subtitleText) => {
        console.log("subtitleText: ", subtitleText);

        // Check for emojis in the original text and play them when the
        // corresponding words are spoken
        checkAndPlayEmojisForSubtitle(subtitleText, currentAnswerForEmojis);

        const subtitlesElement = document.getElementById("subtitles");
        const currentText = subtitlesElement.textContent;
        const words = subtitleText.split(" ");
        const MAX_LINES = 2;

        let currentLines = parseInt(
          subtitlesElement.getAttribute("data-lines") || "0"
        );

        subtitlesElement.style.display = "block";
        subtitlesElement.textContent += subtitleText;

        const styles = window.getComputedStyle(subtitlesElement);
        const lineHeight = parseInt(styles.lineHeight);
        const height = subtitlesElement.offsetHeight;
        const actualLines = Math.ceil(height / lineHeight);

        if (actualLines > MAX_LINES) {
          const allWords = subtitlesElement.textContent.split(" ");
          const removeCount = Math.ceil(allWords.length / 3);
          subtitlesElement.textContent =
            "... " + allWords.slice(removeCount).join(" ");
        }

        subtitlesElement.setAttribute("data-lines", actualLines.toString());
      }
    );

    // Perform TTS
    microsoftSynthesizer.speakSsmlAsync(
      ssml,
      (result) => {
        console.log("=== CALLBACK FIRED ===");
        console.log("Result object:", result);
        console.log("Result reason:", result.reason);
        console.log(
          "Result reason name:",
          window.SpeechSDK.ResultReason[result.reason]
        );
        if (
          result.reason ===
          window.SpeechSDK.ResultReason.SynthesizingAudioCompleted
        ) {
          if (lipsyncType === "visemes" && prevViseme) {
            const finalDuration = 100;
            visemesbuffer.visemes.push(prevViseme.viseme);
            visemesbuffer.vtimes.push(prevViseme.vtime);
            visemesbuffer.vdurations.push(finalDuration);
            prevViseme = null;
          }

          let speak = {};

          if (lipsyncType === "visemes" && visemesbuffer.visemes.length) {
            speak.visemes = visemesbuffer.visemes.splice(
              0,
              visemesbuffer.visemes.length
            );
            speak.vtimes = visemesbuffer.vtimes.splice(
              0,
              visemesbuffer.vtimes.length
            );
            speak.vdurations = visemesbuffer.vdurations.splice(
              0,
              visemesbuffer.vdurations.length
            );
          }
          if (lipsyncType === "blendshapes") {
            speak.anims = azureBlendShapes?.sbuffer.splice(
              0,
              azureBlendShapes?.sbuffer.length
            );
          }

          speak.words = wordsbuffer.words.splice(0, wordsbuffer.words.length);
          speak.wtimes = wordsbuffer.wtimes.splice(
            0,
            wordsbuffer.wtimes.length
          );
          speak.wdurations = wordsbuffer.wdurations.splice(
            0,
            wordsbuffer.wdurations.length
          );

          if (speak.visemes || speak.words || speak.anims) {
            speak.audio = new ArrayBuffer(0);
            head.streamAudio(speak);
          }

          head.streamNotifyEnd();
          resetLipsyncBuffers();

          console.log("Speech synthesis completed.");
        }
      },
      (error) => {
        console.error("Azure speech synthesis error:", error);
        resetLipsyncBuffers();
        currentAnswerForEmojis = null;
        emojiTextMap = null;
      }
    );
  }
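
To avoid the visemeReceived / wordBoundary callbacks of different sentences interleaving in the shared buffers, my current idea is to serialize the per-sentence synthesis calls, roughly like the sketch below (names are placeholders, and the success callback would contain the completion logic from azureSpeak above). I also notice head.streamStart() currently runs on every azureSpeak() call; I assume that for sentence-by-sentence streaming it should run once per answer, with streamNotifyEnd() once at the very end, but I'm not sure.

// Sketch: wrap speakSsmlAsync() in a promise and synthesize sentences one at
// a time, so the shared viseme/word buffers only ever hold data for the
// sentence currently being synthesized.
const sentenceQueue = [];
let ttsBusy = false;

function speakSentence(ssml) {
  return new Promise((resolve, reject) => {
    microsoftSynthesizer.speakSsmlAsync(
      ssml,
      (result) => resolve(result), // completion logic from above would go here
      (error) => reject(error)
    );
  });
}

async function enqueueSentence(ssml) {
  sentenceQueue.push(ssml);
  if (ttsBusy) return; // a drain loop is already running
  ttsBusy = true;
  while (sentenceQueue.length) {
    try {
      await speakSentence(sentenceQueue.shift());
    } catch (error) {
      console.error("Azure speech synthesis error:", error);
      resetLipsyncBuffers();
    }
  }
  ttsBusy = false;
}

Would that be the intended way to handle the viseme logic with streamed sentences, or does TalkingHead expect something different here?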
