vladmaraev · vladmaraev · Jul 1, 2025 · Jun 16, 2025 · Jun 24, 2025
diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "speechstate",
   "license": "GPL-3.0",
-  "version": "2.13.1",
+  "version": "2.14.0",
   "homepage": "http://localhost/speechstate",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",

diff --git a/src/speechstate.ts b/src/speechstate.ts
@@ -56,6 +56,8 @@ const speechstate = setup({
           input: {
             azureAuthorizationToken: context.azureAuthorizationToken,
             ttsDefaultVoice: context.settings.ttsDefaultVoice,
+            ttsDefaultFillerDelay: context.settings.ttsDefaultFillerDelay,
+            ttsDefaultFiller: context.settings.ttsDefaultFiller,
             ttsLexicon: context.settings.ttsLexicon,
             audioContext: context.audioContext,
             azureRegion: context.settings.azureRegion,

diff --git a/src/tts.ts b/src/tts.ts
@@ -40,7 +40,8 @@ export const ttsMachine = setup({
       return {
         buffer:
           context.buffer.substring(0, spaceIndex) +
-          " um," +
+          " " +
+          context.ttsDefaultFiller +
           context.buffer.substring(spaceIndex),
       };
     }),
@@ -300,6 +301,8 @@ export const ttsMachine = setup({
   context: ({ input }) => ({
     azureAuthorizationToken: input.azureAuthorizationToken,
     ttsDefaultVoice: input.ttsDefaultVoice || "en-US-DavisNeural",
+    ttsDefaultFillerDelay: input.ttsDefaultFillerDelay || 500,
+    ttsDefaultFiller: input.ttsDefaultFiller || "um,",
     ttsLexicon: input.ttsLexicon,
     audioContext: input.audioContext,
     azureRegion: input.azureRegion,
@@ -333,10 +336,13 @@ export const ttsMachine = setup({
                     target: "BufferedSpeaker",
                     guard: ({ event }) => !!event.value.stream,
                     actions: assign({
-                      agenda: ({ event }) =>
+                      agenda: ({ context, event }) =>
                         event.value.fillerDelay
                           ? event.value
-                          : { ...event.value, fillerDelay: 500 },
+                          : {
+                              ...event.value,
+                              fillerDelay: context.ttsDefaultFillerDelay,
+                            },
                     }),
                   },
                   {

diff --git a/src/types.ts b/src/types.ts
@@ -26,6 +26,8 @@ export interface Settings {
   speechRecognitionEndpointId?: string;
   ttsDefaultVoice?: string;
   ttsLexicon?: string;
+  ttsDefaultFillerDelay?: number;
+  ttsDefaultFiller?: string;
   newTokenInterval?: number;
 }
 
@@ -140,6 +142,8 @@ export interface TTSInit {
   audioContext: AudioContext;
   azureRegion: string;
   ttsDefaultVoice: string;
+  ttsDefaultFillerDelay?: number;
+  ttsDefaultFiller?: string;
   ttsLexicon?: string;
   locale: string;
 }

diff --git a/test/server.mjs b/test/server.mjs
@@ -60,7 +60,7 @@ async function run() {
       } else {
         clearInterval(interval);
       }
-    }, 300);
+    }, 500);
 
     res.on("close", () => {
       clearInterval(interval);

diff --git a/test/tts.test.ts b/test/tts.test.ts
@@ -11,6 +11,8 @@ describe("Synthesis test", async () => {
       return {
         ssRef: spawn(speechstate, {
           input: {
+            ttsDefaultFiller: "um the filler,",
+            ttsDefaultFillerDelay: 100,
             noPonyfill: false,
             azureRegion: "swedencentral",
             azureCredentials: {
@@ -98,6 +100,16 @@ describe("Synthesis test", async () => {
     expect(snapshot).toBeTruthy();
   });
 
+  // Per-utterance fillerDelay is set, so it takes precedence over ttsDefaultFillerDelay.
+  test("synthesise from stream; different filler and timeout", async () => {
+    actor.getSnapshot().context.ssRef.send({
+      type: "SPEAK",
+      value: { utterance: "", stream: "http://localhost:3000/sse/1", fillerDelay: 100_000 },
+    });
+    const snapshot = await waitForView(actor, "speaking", 1000);
+    expect(snapshot).toBeTruthy();
+  });
+
   test("synthesise from stream; stop and restart on CONTROL", async () => {
     actor.getSnapshot().context.ssRef.send({
       type: "SPEAK",
@@ -152,7 +164,7 @@ describe("Synthesis test", async () => {
     expect(snapshot).toBeTruthy();
   });
 
-  test.only("synthesise from stream, use cache", async () => {
+  test("synthesise from stream, use cache", async () => {
     actor.getSnapshot().context.ssRef.send({
       type: "SPEAK",
       value: {