From e75d138db3fd711294473856b5657f018554bef6 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 14:52:33 +0000 Subject: [PATCH 01/12] Remove quickstart - it's an older v of overview page --- docs/voice-agents/quickstart.mdx | 110 ------------------------------- 1 file changed, 110 deletions(-) delete mode 100644 docs/voice-agents/quickstart.mdx diff --git a/docs/voice-agents/quickstart.mdx b/docs/voice-agents/quickstart.mdx deleted file mode 100644 index 5b3dad73..00000000 --- a/docs/voice-agents/quickstart.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -description: Learn how to build voice-enabled applications with the Speechmatics voice SDK ---- -import Admonition from '@theme/Admonition'; -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -import pythonVoiceQuickstart from "./assets/basic-quickstart.py?raw" -import pythonVoicePresets from "./assets/presets.py?raw" -import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" - -# Voice agent overview - -The voice SDK builds on our real-time API to provide features optimized for conversational AI: - -- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. -- **Turn detection**: automatically detects when speakers finish talking. -- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. -- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. - -### When to use the voice SDK vs real-time SDK - -Use the voice SDK when: - -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios - -Use the realtime SDK when: - -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing batch files or custom workflows - -## Getting started - -### 1. Get your API key - -[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the voice SDK. Store your key securely as a managed secret. - -### 2. Install dependencies - -```bash -# Standard installation -pip install speechmatics-voice - -# With SMART_TURN (ML-based turn detection) -pip install speechmatics-voice[smart] -``` - -### 3. Quickstart - -Replace `YOUR_API_KEY` with your actual API key from the portal: - - - - - {pythonVoiceQuickstart} - - - - - {pythonVoicePresets} - - - - - {pythonVoiceCustomConfig} - - - - -## FAQ - -### Implementation and deployment - -
-Can I deploy this in my own environment? - -Yes! The voice agent SDK can be consumed via our managed service or deployed in your own environment. To learn more about on-premises deployment options, [speak to sales](https://www.speechmatics.com/speak-to-sales). -
- -### Support - -
-Where can I provide feedback or get help? - -You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions). -
- -## Next steps - -For more information, see the [voice agent Python SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. - -To learn more, check out [the Speechmatics academy](https://github.com/speechmatics/speechmatics-academy) - -Ready to build something amazing with our voice agent SDK? We'd love to hear about your project and help you succeed. - -**Get in touch with us:** -- Share your feedback and feature requests -- Ask questions about implementation -- Discuss enterprise pricing and custom voices -- Report any issues or bugs you encounter - -[Contact our team](https://support.speechmatics.com) or join our developer community (https://www.reddit.com/r/Speechmatics) to connect with other builders using text to speech. - From 579d4696b5d820eff33946d2bfe5cb0a34cd30e8 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 16:22:32 +0000 Subject: [PATCH 02/12] Restructure Voice Agents Flow sidebar as nested category under Voice Agents --- docs/voice-agents/sidebar.ts | 2 + .../voice-agents/voice-agents-flow/sidebar.ts | 61 +++++++++++++++++++ sidebars.ts | 2 - 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 docs/voice-agents/voice-agents-flow/sidebar.ts diff --git a/docs/voice-agents/sidebar.ts b/docs/voice-agents/sidebar.ts index 412e29bc..f93c8697 100644 --- a/docs/voice-agents/sidebar.ts +++ b/docs/voice-agents/sidebar.ts @@ -1,3 +1,4 @@ +import voiceAgentsFlowSidebar from "./voice-agents-flow/sidebar"; export default { type: "category", label: "Voice agents", @@ -14,5 +15,6 @@ export default { id: "voice-agents/features", label: "Features", }, + voiceAgentsFlowSidebar, ], } as const; \ No newline at end of file diff --git a/docs/voice-agents/voice-agents-flow/sidebar.ts b/docs/voice-agents/voice-agents-flow/sidebar.ts new file mode 100644 index 00000000..d4a4a2ec --- /dev/null +++ b/docs/voice-agents/voice-agents-flow/sidebar.ts @@ -0,0 +1,61 @@ +export default { + type: "category", + label: "Voice agents – Flow", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + label: "Overview", + id: "voice-agents/voice-agents-flow/index", + }, + { + type: "category", + label:"Features", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "voice-agents/voice-agents-flow/features/application-inputs", + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/features/function-calling", + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/features/webrtc-livekit", + }, + ], + }, + { + type: "category", + label:"Guides", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "voice-agents/voice-agents-flow/guides/nextjs-guide", + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/guides/react-native", + }, + ], + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/setup", + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/supported-formats-and-limits", + }, + { + type: "doc", + id: "voice-agents/voice-agents-flow/supported-languages", + }, + ], +} as const; \ No newline at end of file diff --git a/sidebars.ts b/sidebars.ts index 9a81597c..88dcea82 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -3,7 +3,6 @@ import deploymentsSidebar from "./docs/deployments/sidebar"; import gettingStartedSidebar from "./docs/get-started/sidebar"; import speechToTextSidebar from "./docs/speech-to-text/sidebar"; import textToSpeechSidebar from "./docs/text-to-speech/sidebar"; -import voiceAgentsFlowSidebar from 
"./docs/voice-agents-flow/sidebar"; import integrationsAndSDKSidebar from "./docs/integrations-and-sdks/sidebar"; import voiceAgentsSidebar from "./docs/voice-agents/sidebar"; @@ -14,7 +13,6 @@ export default { voiceAgentsSidebar, textToSpeechSidebar, integrationsAndSDKSidebar, - voiceAgentsFlowSidebar, deploymentsSidebar, { type: "category", From 5bb3f8828c33c60930eab6eb2053c2a35b76f0c1 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 16:24:08 +0000 Subject: [PATCH 03/12] Index repurposed as flow Overview --- docs/voice-agents/voice-agents-flow/index.md | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 docs/voice-agents/voice-agents-flow/index.md diff --git a/docs/voice-agents/voice-agents-flow/index.md b/docs/voice-agents/voice-agents-flow/index.md new file mode 100644 index 00000000..aae9d1fb --- /dev/null +++ b/docs/voice-agents/voice-agents-flow/index.md @@ -0,0 +1,62 @@ +--- +title: Overview +description: Build conversational AI agents with the Flow API +--- + +# Overview + +:::info +Try Flow free for up to 50 hours per month. +::: + +[**Flow**](https://speechmatics.com/flow) is our Voice Agent API that allows you to add responsive, real-time speech-to-speech interactions to any product. + +Flow is engineered to engage in natural and fluid conversations by automatically handling interruptions, responding to multiple speakers, and understanding different dialects and accents. + +## How Flow works + +Built on top of Speechmatics' [industry-leading ASR](/speech-to-text/realtime/quickstart), the latest LLMs and text to speech, Flow is engineered to engage in natural and fluid conversations. + +Simply stream in audio, and Flow will provide the TTS response as well as other useful information. + +### Component models + +The three base components of the Flow Engine are speech to text, large language model, and text to speech. + +#### Speech to text (ASR) + +Flow is built on the foundations of Speechmatics' market-leading Realtime ASR. The client passes streaming audio to the Flow service through the WebSocket. The service then processes multiple speech & non-speech signals such as the spoken words, tonality, & audio events before passing the context to the LLM to formulate a response. + +Flow natively supports multiple speaker detection (Speaker Diarization). Flow can be configured to ignore, acknowledge or engage with non-primary speakers when setting up Agents. + +This transcribed text is also streamed back to the client as soon as it is generated to support any client-driven recording, monitoring & analytics workflows. + +To improve accuracy on product-specific terminology we recommend using a Custom Dictionary when setting up Agents in the [Portal](https://portal.speechmatics.com/). + +#### Large language model (LLM) + +Flow’s conversational understanding & knowledge is powered by LLMs. The transcribed text from the ASR is then passed with Flow configurations to the LLM to formulate a natural-sounding response. + +The response-generation can be influenced through defining a persona, style, and context when setting up Templates. + +#### Text to speech (TTS) + +Output generated by the LLM, when ready to be spoken, will be converted to audio through the chosen TTS engine. These engines were selected to provide the most natural-sounding responses while not trading off on latency. This audio is then streamed back to the client, who must then play this back to the user. 
+ +## Flow engine + +### Understanding disfluencies & pacing + +Everyone has a different style of speaking. Natural speech is colored with filler sounds and the pace of speech can vary from speaker to speaker. A one-size-fits-all voice agent can add a lot of friction to the experience if it keeps interrupting you. We’ve designed Flow to adapt to your speaking style and not be over-eager to interrupt, helping to make users feel comfortable. + +### Handling interruptions + +Flow has been modelled on real-world human conversations. Whether it is to stop Flow from going off-track or to correct wrong assumptions, you can interrupt it. We’ve built our own interruption engine that intelligently ignores unintentional interruptions and gracefully handles the ones that it needs to. To avoid sounding abrupt and unnatural when interrupted, Flow will finish the current word that’s being spoken and gradually fade out the next one. + +### End-of-turn detection + +Based on your voice & what you’ve been saying, Flow uses a [small language model (SLM) architecture](https://blog.speechmatics.com/semantic-turn-detection) to smartly detect when you’re done speaking before it responds for a natural and responsive experience. Flow is built to be human-centric and, while we could achieve much lower latencies, it’s rude to interrupt mid-thought. + +### Help and support + +For any additional issues, please reach out to the Flow Support team at [flow-help@speechmatics.com](mailto:flow-help@speechmatics.com). From 6330c25cd3fdf479033620315137be19f92fa572 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 16:25:45 +0000 Subject: [PATCH 04/12] Move Voice Agents Flow docs into nested voice-agents dir --- .../features/application-inputs.mdx | 0 .../features/assets/function-calling.py | 133 ++++++++++++++++++ .../features/function-calling.mdx | 0 .../features/webrtc-livekit.mdx | 0 .../voice-agents-flow/guides/nextjs-guide.mdx | 0 .../voice-agents-flow/guides/react-native.mdx | 0 .../voice-agents-flow/setup.mdx | 0 .../supported-formats-and-limits.mdx | 0 .../voice-agents-flow/supported-languages.mdx | 0 9 files changed, 133 insertions(+) rename docs/{ => voice-agents}/voice-agents-flow/features/application-inputs.mdx (100%) create mode 100644 docs/voice-agents/voice-agents-flow/features/assets/function-calling.py rename docs/{ => voice-agents}/voice-agents-flow/features/function-calling.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/features/webrtc-livekit.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/guides/nextjs-guide.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/guides/react-native.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/setup.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/supported-formats-and-limits.mdx (100%) rename docs/{ => voice-agents}/voice-agents-flow/supported-languages.mdx (100%) diff --git a/docs/voice-agents-flow/features/application-inputs.mdx b/docs/voice-agents/voice-agents-flow/features/application-inputs.mdx similarity index 100% rename from docs/voice-agents-flow/features/application-inputs.mdx rename to docs/voice-agents/voice-agents-flow/features/application-inputs.mdx diff --git a/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py b/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py new file mode 100644 index 00000000..ec8e740f --- /dev/null +++ b/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py @@ -0,0 +1,133 @@ +import asyncio +import io 
+import sys +import json + +import pyaudio + +from speechmatics_flow.client import WebsocketClient +from speechmatics_flow.models import ( + ConnectionSettings, + Interaction, + AudioSettings, + ConversationConfig, + ServerMessageType, + ClientMessageType, +) +from speechmatics_flow.tool_function_param import ToolFunctionParam + +AUTH_TOKEN = "Place your auth token here" + +# Example configuration which could add a reminder to a calendar. +reminder_config = ToolFunctionParam( + type="function", + function={ + "name": "add_reminder", + "description": "Use this to schedule reminders. Needs a confirmation.", + "parameters": { + "type": "object", + "properties": { + "date": { + "type": "string", + "description": "The date for the reminder in dd/mm/yyyy format", + }, + "time": { + "type": "string", + "description": "The time for the reminder in 24 hour hh:mm format", + }, + "title": { + "type": "string", + "description": "The title for the reminder", + }, + "project": { + "type": "string", + "description": "Which project the reminder is related to. If not provided, leave blank.", + }, + }, + "required": ["project"], + }, + }, +) + + +# Callback for handling reminder ToolInvoke in your system. +async def reminder_handler(msg: dict): + print("Attempting to add reminder") + print(msg) + response_message = { + "message": ClientMessageType.ToolResult, + "id": msg["id"], + "status": "ok", # Used to inform user the status of the function call. Could be "failed" or "rejected". + "content": "Added reminder successfully to calendar", # LLM response helper message + } + + await client.websocket.send(json.dumps(response_message)) + + +# Create a websocket client +client = WebsocketClient( + ConnectionSettings( + url="wss://flow.api.speechmatics.com/v1/flow", + auth_token=AUTH_TOKEN, + ) +) + +# Create a buffer to store binary messages sent from the server +audio_buffer = io.BytesIO() + + +# Create callback function which adds binary messages to audio buffer +def binary_msg_handler(msg: bytes): + if isinstance(msg, (bytes, bytearray)): + audio_buffer.write(msg) + + +# Register the callback which will be called +# when the client receives an audio message from the server +client.add_event_handler(ServerMessageType.AddAudio, binary_msg_handler) + +# Handling ToolInvoke message +client.add_event_handler(ServerMessageType.ToolInvoke, reminder_handler) + + +async def audio_playback(buffer): + """Read from buffer and play audio back to the user""" + p = pyaudio.PyAudio() + stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True) + try: + while True: + # Get the current value from the buffer + audio_to_play = buffer.getvalue() + # Only proceed if there is audio data to play + if audio_to_play: + # Write the audio to the stream + stream.write(audio_to_play) + buffer.seek(0) + buffer.truncate(0) + # Pause briefly before checking the buffer again + await asyncio.sleep(0.05) + finally: + stream.close() + stream.stop_stream() + p.terminate() + + +async def main(): + print("Starting...") + tasks = [ + # Use the websocket to connect to Flow Service and start a conversation + asyncio.create_task( + client.run( + interactions=[Interaction(sys.stdin.buffer)], + audio_settings=AudioSettings(), + conversation_config=ConversationConfig(), + tools=[reminder_config], + ) + ), + # Run audio playback handler which streams audio from audio buffer + asyncio.create_task(audio_playback(audio_buffer)), + ] + await asyncio.gather(*tasks) + + +asyncio.run(main()) diff --git 
a/docs/voice-agents-flow/features/function-calling.mdx b/docs/voice-agents/voice-agents-flow/features/function-calling.mdx similarity index 100% rename from docs/voice-agents-flow/features/function-calling.mdx rename to docs/voice-agents/voice-agents-flow/features/function-calling.mdx diff --git a/docs/voice-agents-flow/features/webrtc-livekit.mdx b/docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx similarity index 100% rename from docs/voice-agents-flow/features/webrtc-livekit.mdx rename to docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx diff --git a/docs/voice-agents-flow/guides/nextjs-guide.mdx b/docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx similarity index 100% rename from docs/voice-agents-flow/guides/nextjs-guide.mdx rename to docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx diff --git a/docs/voice-agents-flow/guides/react-native.mdx b/docs/voice-agents/voice-agents-flow/guides/react-native.mdx similarity index 100% rename from docs/voice-agents-flow/guides/react-native.mdx rename to docs/voice-agents/voice-agents-flow/guides/react-native.mdx diff --git a/docs/voice-agents-flow/setup.mdx b/docs/voice-agents/voice-agents-flow/setup.mdx similarity index 100% rename from docs/voice-agents-flow/setup.mdx rename to docs/voice-agents/voice-agents-flow/setup.mdx diff --git a/docs/voice-agents-flow/supported-formats-and-limits.mdx b/docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx similarity index 100% rename from docs/voice-agents-flow/supported-formats-and-limits.mdx rename to docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx diff --git a/docs/voice-agents-flow/supported-languages.mdx b/docs/voice-agents/voice-agents-flow/supported-languages.mdx similarity index 100% rename from docs/voice-agents-flow/supported-languages.mdx rename to docs/voice-agents/voice-agents-flow/supported-languages.mdx From d3da47d39350bda7905101debdab7f47329b3940 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 16:26:04 +0000 Subject: [PATCH 05/12] Add minimal LiveKit Flow client HTML demo under nested cat --- .../features/assets/livekit-poc.html | 241 ++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html diff --git a/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html b/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html new file mode 100644 index 00000000..1678c3c0 --- /dev/null +++ b/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html @@ -0,0 +1,241 @@ + + + + + + Minimal LiveKit Flow Client + + + + +

Minimal LiveKit Flow Client

+ +
+ +
+
Status: disconnected
+ +
+ + + + From 6dc4b348538db1815f910908eac58125e27ee3d3 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 16:27:04 +0000 Subject: [PATCH 06/12] Remove duplicate files --- .../features/assets/function-calling.py | 133 ---------- .../features/assets/livekit-poc.html | 241 ------------------ docs/voice-agents-flow/index.md | 62 ----- docs/voice-agents-flow/sidebar.ts | 49 ---- 4 files changed, 485 deletions(-) delete mode 100644 docs/voice-agents-flow/features/assets/function-calling.py delete mode 100644 docs/voice-agents-flow/features/assets/livekit-poc.html delete mode 100644 docs/voice-agents-flow/index.md delete mode 100644 docs/voice-agents-flow/sidebar.ts diff --git a/docs/voice-agents-flow/features/assets/function-calling.py b/docs/voice-agents-flow/features/assets/function-calling.py deleted file mode 100644 index ec8e740f..00000000 --- a/docs/voice-agents-flow/features/assets/function-calling.py +++ /dev/null @@ -1,133 +0,0 @@ -import asyncio -import io -import sys -import json - -import pyaudio - -from speechmatics_flow.client import WebsocketClient -from speechmatics_flow.models import ( - ConnectionSettings, - Interaction, - AudioSettings, - ConversationConfig, - ServerMessageType, - ClientMessageType, -) -from speechmatics_flow.tool_function_param import ToolFunctionParam - -AUTH_TOKEN = "Place your auth token here" - -# Example configuration which could add a reminder to a calendar. -reminder_config = ToolFunctionParam( - type="function", - function={ - "name": "add_reminder", - "description": "Use this to schedule reminders. Needs a confirmation.", - "parameters": { - "type": "object", - "properties": { - "date": { - "type": "string", - "description": "The date for the reminder in dd/mm/yyyy format", - }, - "time": { - "type": "string", - "description": "The time for the reminder in 24 hour hh:mm format", - }, - "title": { - "type": "string", - "description": "The title for the reminder", - }, - "project": { - "type": "string", - "description": "Which project the reminder is related to. If not provided, leave blank.", - }, - }, - "required": ["project"], - }, - }, -) - - -# Callback for handling reminder ToolInvoke in your system. -async def reminder_handler(msg: dict): - print("Attempting to add reminder") - print(msg) - response_message = { - "message": ClientMessageType.ToolResult, - "id": msg["id"], - "status": "ok", # Used to inform user the status of the function call. Could be "failed" or "rejected". 
- "content": "Added reminder successfully to calendar", # LLM response helper message - } - - await client.websocket.send(json.dumps(response_message)) - - -# Create a websocket client -client = WebsocketClient( - ConnectionSettings( - url="wss://flow.api.speechmatics.com/v1/flow", - auth_token=AUTH_TOKEN, - ) -) - -# Create a buffer to store binary messages sent from the server -audio_buffer = io.BytesIO() - - -# Create callback function which adds binary messages to audio buffer -def binary_msg_handler(msg: bytes): - if isinstance(msg, (bytes, bytearray)): - audio_buffer.write(msg) - - -# Register the callback which will be called -# when the client receives an audio message from the server -client.add_event_handler(ServerMessageType.AddAudio, binary_msg_handler) - -# Handling ToolInvoke message -client.add_event_handler(ServerMessageType.ToolInvoke, reminder_handler) - - -async def audio_playback(buffer): - """Read from buffer and play audio back to the user""" - p = pyaudio.PyAudio() - stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True) - try: - while True: - # Get the current value from the buffer - audio_to_play = buffer.getvalue() - # Only proceed if there is audio data to play - if audio_to_play: - # Write the audio to the stream - stream.write(audio_to_play) - buffer.seek(0) - buffer.truncate(0) - # Pause briefly before checking the buffer again - await asyncio.sleep(0.05) - finally: - stream.close() - stream.stop_stream() - p.terminate() - - -async def main(): - print("Starting...") - tasks = [ - # Use the websocket to connect to Flow Service and start a conversation - asyncio.create_task( - client.run( - interactions=[Interaction(sys.stdin.buffer)], - audio_settings=AudioSettings(), - conversation_config=ConversationConfig(), - tools=[reminder_config], - ) - ), - # Run audio playback handler which streams audio from audio buffer - asyncio.create_task(audio_playback(audio_buffer)), - ] - await asyncio.gather(*tasks) - - -asyncio.run(main()) diff --git a/docs/voice-agents-flow/features/assets/livekit-poc.html b/docs/voice-agents-flow/features/assets/livekit-poc.html deleted file mode 100644 index 1678c3c0..00000000 --- a/docs/voice-agents-flow/features/assets/livekit-poc.html +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - Minimal LiveKit Flow Client - - - - -

Minimal LiveKit Flow Client

- -
- -
-
Status: disconnected
- -
- - - - diff --git a/docs/voice-agents-flow/index.md b/docs/voice-agents-flow/index.md deleted file mode 100644 index aae9d1fb..00000000 --- a/docs/voice-agents-flow/index.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: Overview -description: Build conversational AI agents with the Flow API ---- - -# Overview - -:::info -Try Flow free for up to 50 hours per month. -::: - -[**Flow**](https://speechmatics.com/flow) is our Voice Agent API that allows you to add responsive, real-time speech-to-speech interactions to any product. - -Flow is engineered to engage in natural and fluid conversations by automatically handling interruptions, responding to multiple speakers, and understanding different dialects and accents. - -## How Flow works - -Built on top of Speechmatics' [industry-leading ASR](/speech-to-text/realtime/quickstart), the latest LLMs and text to speech, Flow is engineered to engage in natural and fluid conversations. - -Simply stream in audio, and Flow will provide the TTS response as well as other useful information. - -### Component models - -The three base components of the Flow Engine are speech to text, large language model, and text to speech. - -#### Speech to text (ASR) - -Flow is built on the foundations of Speechmatics' market-leading Realtime ASR. The client passes streaming audio to the Flow service through the WebSocket. The service then processes multiple speech & non-speech signals such as the spoken words, tonality, & audio events before passing the context to the LLM to formulate a response. - -Flow natively supports multiple speaker detection (Speaker Diarization). Flow can be configured to ignore, acknowledge or engage with non-primary speakers when setting up Agents. - -This transcribed text is also streamed back to the client as soon as it is generated to support any client-driven recording, monitoring & analytics workflows. - -To improve accuracy on product-specific terminology we recommend using a Custom Dictionary when setting up Agents in the [Portal](https://portal.speechmatics.com/). - -#### Large language model (LLM) - -Flow’s conversational understanding & knowledge is powered by LLMs. The transcribed text from the ASR is then passed with Flow configurations to the LLM to formulate a natural-sounding response. - -The response-generation can be influenced through defining a persona, style, and context when setting up Templates. - -#### Text to speech (TTS) - -Output generated by the LLM, when ready to be spoken, will be converted to audio through the chosen TTS engine. These engines were selected to provide the most natural-sounding responses while not trading off on latency. This audio is then streamed back to the client, who must then play this back to the user. - -## Flow engine - -### Understanding disfluencies & pacing - -Everyone has a different style of speaking. Natural speech is colored with filler sounds and the pace of speech can vary from speaker to speaker. A one-size-fits-all voice agent can add a lot of friction to the experience if it keeps interrupting you. We’ve designed Flow to adapt to your speaking style and not be over-eager to interrupt, helping to make users feel comfortable. - -### Handling interruptions - -Flow has been modelled on real-world human conversations. Whether it is to stop Flow from going off-track or to correct wrong assumptions, you can interrupt it. We’ve built our own interruption engine that intelligently ignores unintentional interruptions and gracefully handles the ones that it needs to. 
To avoid sounding abrupt and unnatural when interrupted, Flow will finish the current word that’s being spoken and gradually fade out the next one. - -### End-of-turn detection - -Based on your voice & what you’ve been saying, Flow uses a [small language model (SLM) architecture](https://blog.speechmatics.com/semantic-turn-detection) to smartly detect when you’re done speaking before it responds for a natural and responsive experience. Flow is built to be human-centric and, while we could achieve much lower latencies, it’s rude to interrupt mid-thought. - -### Help and support - -For any additional issues, please reach out to the Flow Support team at [flow-help@speechmatics.com](mailto:flow-help@speechmatics.com). diff --git a/docs/voice-agents-flow/sidebar.ts b/docs/voice-agents-flow/sidebar.ts deleted file mode 100644 index 418ef968..00000000 --- a/docs/voice-agents-flow/sidebar.ts +++ /dev/null @@ -1,49 +0,0 @@ -export default { - type: "category", - label: "Voice agents – Flow", - collapsible: false, - collapsed: false, - items: [ - { - type: "doc", - id: "voice-agents-flow/index", - }, - { - type: "category", - label: "Features", - items: [ - { - type: "autogenerated", - dirName: "voice-agents-flow/features", - }, - ], - }, - { - type: "category", - label: "Guides", - items: [ - { - type: "autogenerated", - dirName: "voice-agents-flow/guides", - }, - { - type: "doc", - id: "guides/projects", - }, - ], - }, - { - type: "doc", - id: "voice-agents-flow/setup", - }, - { - type: "doc", - id: "voice-agents-flow/supported-formats-and-limits", - }, - - { - type: "doc", - id: "voice-agents-flow/supported-languages", - }, - ], -} as const; From ea5bd2e2e7a22561a2ef2adee941d318e7108c08 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 17:06:02 +0000 Subject: [PATCH 07/12] Update Flow redirect paths to match new nested structure under voice-agents --- vercel.json | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vercel.json b/vercel.json index 9786e042..87c0a3ec 100644 --- a/vercel.json +++ b/vercel.json @@ -163,42 +163,42 @@ }, { "source": "/flow/application-inputs", - "destination": "/voice-agents-flow/features/application-inputs", + "destination": "/voice-agents/flow/features/application-inputs", "permanent": true }, { "source": "/flow/config", - "destination": "/voice-agents-flow/setup", + "destination": "/voice-agents/flow/setup", "permanent": true }, { "source": "/flow/function-calling", - "destination": "/voice-agents-flow/features/function-calling", + "destination": "/voice-agents/flow/features/function-calling", "permanent": true }, { "source": "/flow/introduction", - "destination": "/voice-agents-flow", + "destination": "/voice-agents/flow", "permanent": true }, { "source": "/flow/languages-supported", - "destination": "/voice-agents-flow/supported-languages", + "destination": "/voice-agents/flow/supported-languages", "permanent": true }, { "source": "/flow/livekit-webrtc", - "destination": "/voice-agents-flow/features/webrtc-livekit", + "destination": "/voice-agents/flow/features/webrtc-livekit", "permanent": true }, { "source": "/flow/nextjs-guide", - "destination": "/voice-agents-flow/guides/nextjs-guide", + "destination": "/voice-agents/flow/guides/nextjs-guide", "permanent": true }, { "source": "/flow/react-native-guide", - "destination": "/voice-agents-flow/guides/react-native", + "destination": "/voice-agents/flow/guides/react-native", "permanent": true }, { @@ -863,12 +863,12 @@ }, { "source": "/flow", - "destination": 
"/voice-agents-flow", + "destination": "/voice-agents/flow", "permanent": true }, { "source": "/flow/getting-started", - "destination": "/voice-agents-flow", + "destination": "/voice-agents/flow", "permanent": true }, { From a900b56cf1aa6af96faabe1f3ed082f2712c6d18 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 17:07:36 +0000 Subject: [PATCH 08/12] Add documentation for Flow application inputs, function calling, and LiveKit client demo --- .../flow/features/application-inputs.mdx | 30 ++ .../flow/features/assets/function-calling.py | 133 +++++++++ .../flow/features/assets/livekit-poc.html | 241 +++++++++++++++ .../flow/features/function-calling.mdx | 83 ++++++ .../flow/features/webrtc-livekit.mdx | 55 ++++ .../voice-agents/flow/guides/nextjs-guide.mdx | 274 ++++++++++++++++++ .../voice-agents/flow/guides/react-native.mdx | 254 ++++++++++++++++ docs/voice-agents/flow/index.md | 62 ++++ docs/voice-agents/flow/setup.mdx | 44 +++ docs/voice-agents/flow/sidebar.ts | 61 ++++ .../flow/supported-formats-and-limits.mdx | 39 +++ .../voice-agents/flow/supported-languages.mdx | 53 ++++ docs/voice-agents/sidebar.ts | 3 +- 13 files changed, 1331 insertions(+), 1 deletion(-) create mode 100644 docs/voice-agents/flow/features/application-inputs.mdx create mode 100644 docs/voice-agents/flow/features/assets/function-calling.py create mode 100644 docs/voice-agents/flow/features/assets/livekit-poc.html create mode 100644 docs/voice-agents/flow/features/function-calling.mdx create mode 100644 docs/voice-agents/flow/features/webrtc-livekit.mdx create mode 100644 docs/voice-agents/flow/guides/nextjs-guide.mdx create mode 100644 docs/voice-agents/flow/guides/react-native.mdx create mode 100644 docs/voice-agents/flow/index.md create mode 100644 docs/voice-agents/flow/setup.mdx create mode 100644 docs/voice-agents/flow/sidebar.ts create mode 100644 docs/voice-agents/flow/supported-formats-and-limits.mdx create mode 100644 docs/voice-agents/flow/supported-languages.mdx diff --git a/docs/voice-agents/flow/features/application-inputs.mdx b/docs/voice-agents/flow/features/application-inputs.mdx new file mode 100644 index 00000000..c3299c50 --- /dev/null +++ b/docs/voice-agents/flow/features/application-inputs.mdx @@ -0,0 +1,30 @@ +--- +description: 'Learn about the application inputs for Flow' +--- + +import flowSpec from "!openapi-schema-loader!@site/spec/flow-api.yaml"; +import SchemaNode from "@theme/Schema"; +import { omit } from "lodash"; + +# Application Inputs + +Voice-enabled applications often need to react to events beyond user speech. For instance: +- A user pauses for too long, requiring a prompt to keep the conversation going +- An external notification or system event which must be conveyed during a voice conversation + +To handle these scenarios, the application can send an AddInput message to the Flow system. This allows non-verbal or system-driven inputs to be injected into the conversation dynamically. + +```json +{ + "message": "AddInput", + "input": "", // Required: The input text to be incorporated + "interrupt_response": "", // Optional: If true, interrupts the current response (default: false) + "immediate": "" // Optional: If true, processes the input immediately after the current turn ends (default: false) +} +``` + +## Application Input variables + +See [API reference](/api-ref/flow-voice-ai-websocket#addinput) for complete schema. 
+ + diff --git a/docs/voice-agents/flow/features/assets/function-calling.py b/docs/voice-agents/flow/features/assets/function-calling.py new file mode 100644 index 00000000..ec8e740f --- /dev/null +++ b/docs/voice-agents/flow/features/assets/function-calling.py @@ -0,0 +1,133 @@ +import asyncio +import io +import sys +import json + +import pyaudio + +from speechmatics_flow.client import WebsocketClient +from speechmatics_flow.models import ( + ConnectionSettings, + Interaction, + AudioSettings, + ConversationConfig, + ServerMessageType, + ClientMessageType, +) +from speechmatics_flow.tool_function_param import ToolFunctionParam + +AUTH_TOKEN = "Place your auth token here" + +# Example configuration which could add a reminder to a calendar. +reminder_config = ToolFunctionParam( + type="function", + function={ + "name": "add_reminder", + "description": "Use this to schedule reminders. Needs a confirmation.", + "parameters": { + "type": "object", + "properties": { + "date": { + "type": "string", + "description": "The date for the reminder in dd/mm/yyyy format", + }, + "time": { + "type": "string", + "description": "The time for the reminder in 24 hour hh:mm format", + }, + "title": { + "type": "string", + "description": "The title for the reminder", + }, + "project": { + "type": "string", + "description": "Which project the reminder is related to. If not provided, leave blank.", + }, + }, + "required": ["project"], + }, + }, +) + + +# Callback for handling reminder ToolInvoke in your system. +async def reminder_handler(msg: dict): + print("Attempting to add reminder") + print(msg) + response_message = { + "message": ClientMessageType.ToolResult, + "id": msg["id"], + "status": "ok", # Used to inform user the status of the function call. Could be "failed" or "rejected". 
+ "content": "Added reminder successfully to calendar", # LLM response helper message + } + + await client.websocket.send(json.dumps(response_message)) + + +# Create a websocket client +client = WebsocketClient( + ConnectionSettings( + url="wss://flow.api.speechmatics.com/v1/flow", + auth_token=AUTH_TOKEN, + ) +) + +# Create a buffer to store binary messages sent from the server +audio_buffer = io.BytesIO() + + +# Create callback function which adds binary messages to audio buffer +def binary_msg_handler(msg: bytes): + if isinstance(msg, (bytes, bytearray)): + audio_buffer.write(msg) + + +# Register the callback which will be called +# when the client receives an audio message from the server +client.add_event_handler(ServerMessageType.AddAudio, binary_msg_handler) + +# Handling ToolInvoke message +client.add_event_handler(ServerMessageType.ToolInvoke, reminder_handler) + + +async def audio_playback(buffer): + """Read from buffer and play audio back to the user""" + p = pyaudio.PyAudio() + stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True) + try: + while True: + # Get the current value from the buffer + audio_to_play = buffer.getvalue() + # Only proceed if there is audio data to play + if audio_to_play: + # Write the audio to the stream + stream.write(audio_to_play) + buffer.seek(0) + buffer.truncate(0) + # Pause briefly before checking the buffer again + await asyncio.sleep(0.05) + finally: + stream.close() + stream.stop_stream() + p.terminate() + + +async def main(): + print("Starting...") + tasks = [ + # Use the websocket to connect to Flow Service and start a conversation + asyncio.create_task( + client.run( + interactions=[Interaction(sys.stdin.buffer)], + audio_settings=AudioSettings(), + conversation_config=ConversationConfig(), + tools=[reminder_config], + ) + ), + # Run audio playback handler which streams audio from audio buffer + asyncio.create_task(audio_playback(audio_buffer)), + ] + await asyncio.gather(*tasks) + + +asyncio.run(main()) diff --git a/docs/voice-agents/flow/features/assets/livekit-poc.html b/docs/voice-agents/flow/features/assets/livekit-poc.html new file mode 100644 index 00000000..1678c3c0 --- /dev/null +++ b/docs/voice-agents/flow/features/assets/livekit-poc.html @@ -0,0 +1,241 @@ + + + + + + Minimal LiveKit Flow Client + + + + +

Minimal LiveKit Flow Client

+ +
+ +
+
Status: disconnected
+ +
+ + + + diff --git a/docs/voice-agents/flow/features/function-calling.mdx b/docs/voice-agents/flow/features/function-calling.mdx new file mode 100644 index 00000000..e7b6d13e --- /dev/null +++ b/docs/voice-agents/flow/features/function-calling.mdx @@ -0,0 +1,83 @@ +--- +description: 'Learn about the function calling feature for Flow' +--- + +import CodeBlock from '@theme/CodeBlock'; +import pythonFunctionCallingExample from './assets/function-calling.py'; +import SchemaNode from '@theme/Schema'; +import flowSpec from '!openapi-schema-loader!@site/spec/flow-api.yaml'; +import { Callout, Flex, Text } from '@radix-ui/themes'; +import { ConstructionIcon } from 'lucide-react'; + +# Function Calling + +Function Calling allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real-world and better serve the needs of your users. + +This could involve needing real-time information such as opening/closing times or validation services for authentication or action APIs that control a fast food system while placing a drive-thru order. + +Based on what the user says in the conversation, Flow will recognise the user's intentions and extract out the key information that your system needs to complete the function call. + +For example, you may want Flow to add reminders in a user's calendar: + +```json +{ + "name": "add_reminder", + "description": "Use this to schedule reminders. Needs a confirmation.", + "parameters": { + "type": "object", + "properties": { + "date" : { + "type" : "string", + "description" : "The date for the reminder in dd/mm/yyyy format" + }, + "time" : { + "type": "string", + "description" : "The time for the reminder in 24 hour hh:mm format" + }, + "title" : { + "type": "string", + "description" : "The title for the reminder" + }, + "project": { + "type": "string", + "description": "Which project the reminder is related to. If not provided, leave blank." + } + }, + "required": ["project"] + } +} +``` + +## Configuring Function Calling + +An agent can be configured to use function calling in two ways: + +1. **In code**: when starting a session with the [`StartConversation` message](/api-ref/flow-voice-ai-websocket#startconversation) +2. (_coming soon_) **In the portal**: when configuring an agent + +### In the portal + +[Create an agent in the portal](https://portal.speechmatics.com/create-agent) and enable function calling in the agent settings. + +### In `StartConversation` + +Functions must be declared within a list of tools when your client sends the [StartConversation message](/api-ref/flow-voice-ai-websocket#startconversation). Each function must be defined with the following: + + + + +## Example + + +{pythonFunctionCallingExample} + + +## Considerations + +- Function `status` - The client must inform the service of whether the function call succeeded or not. This allows the service to inform the user of the result. There is no automatic timeout on the Flow API. +- Asynchronous - Function calling is fully asynchronous. Once the client is informed of the function call, the conversation will continue to progress until a function call status update is received from the client. This is to continue providing a natural conversational experience to the customer. +- Completion Message - Flow can play a message on completion of the function call. The Client can switch this off by passing `` in the content field of the ToolResult message. 
+ +:::note +Since LLMs are semantically instructed, complete, narrow and unambiguous function calls with simple descriptions can create a reliable customer experience. Complex business logic should be handled within your client. +::: diff --git a/docs/voice-agents/flow/features/webrtc-livekit.mdx b/docs/voice-agents/flow/features/webrtc-livekit.mdx new file mode 100644 index 00000000..c73dcc63 --- /dev/null +++ b/docs/voice-agents/flow/features/webrtc-livekit.mdx @@ -0,0 +1,55 @@ +--- +description: 'Learn how to use the Flow API over WebRTC with LiveKit' +--- + +import CodeBlock from "@theme/CodeBlock"; +import LivekitPoc from "./assets/livekit-poc.html?raw"; + +# WebRTC over LiveKit + +A client may want to use Flow in unsure network conditions or through mobile devices with fluctuating networks. In such scenarios, we offer WebRTC protocol as a way to connect to Flow. More information about the protocol can be found on official [webrtc website](https://webrtc.org/). +Flow uses the WebRTC setup provided by LiveKit to enable support for the protocol. + +## API + +Client makes a HTTP POST request to `/v1/flow/livekit` endpoint, with body containing StartConversation message as described in the [Flow API reference](/api-ref/flow-voice-ai-websocket#startconversation). "audio_format" field must not be used in this scenario as LiveKit WebRTC takes control of the audio format. + +```json +{ + "message": "StartConversation", + "conversation_config": { + "template_id": "flow-service-assistant-one", + "template_variables": { + "timezone": "Europe/London" + } + } +} +``` + +## Response + +In response, a LiveKit room is created. The returned URL & token are used to connect to the LiveKit server. + +```json +{ + "url": "wss://test-app-d3kro1gz.livekit.cloud", + "token": "", + "id": "" +} +``` + +## Connecting to LiveKit + +Provided JWT token has short TTL and should be used immediately after receiving it. + +The LiveKit SDK for a given platform should be used to connect to the LiveKit server. The SDK handles the connection and audio streaming, including bitrate management. +Text messages, or control messages are exchanged using LocalParticipant object. +Protocol for messages with the Flow Agent is the same as in case of normal WebSocket connection to Flow API. 
+ +LiveKit documentation can be found [here](https://docs.livekit.io/home/client/connect/) + +## Example client in JavaScript + + + {LivekitPoc} + \ No newline at end of file diff --git a/docs/voice-agents/flow/guides/nextjs-guide.mdx b/docs/voice-agents/flow/guides/nextjs-guide.mdx new file mode 100644 index 00000000..3fe4b451 --- /dev/null +++ b/docs/voice-agents/flow/guides/nextjs-guide.mdx @@ -0,0 +1,274 @@ +--- +sidebar_label: NextJS +title: Build a conversational AI web app with Next.js and Flow +description: 'Learn how to build a conversational AI web app with Next.js and Flow' +--- + +import { Box, Card, Flex } from "@radix-ui/themes"; +import CodeBlock from "@theme/CodeBlock"; +import { GithubIcon } from "lucide-react"; + +{/* -------------- Step 1-------------- */} + +import postcssConfigExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-1/postcss.config.mjs"; +import globalsCssExampleStepOne from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-1/app/globals.css"; + +{/* -------------- Step 2 -------------- */} +import providersStepTwo from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/app/providers.tsx"; +import useAudioContextsExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/hooks/useAudioContexts.ts"; +import pageStepTwoExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/app/page.tsx"; +import nextjsConfigExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/next.config.ts"; + +{/* -------------- Step 3 -------------- */} +import globalsCssExampleStepThree from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/globals.css"; +import controlsExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/Controls.tsx"; +import serverActionExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/actions.ts"; +import microphoneSelectExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/MicrophoneSelect.tsx"; +import statusExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/Status.tsx"; +import transcriptViewExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/TranscriptView.tsx"; +import pageStepThreeExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/page.tsx"; + + +# Build a conversational AI web app with Next.js and Flow + +In this guide, we will walk you through the process of building a conversational AI web application using Next.js and Flow. +You will learn how to set up your development environment, create a Next.js project, integrate Flow and implement a simple conversational AI feature. + +You can find the complete code on [GitHub ](https://github.com/speechmatics/nextjs-flow-guide). + +## Prerequisites + +Before getting started, ensure you have: + +- [Node.js 20](https://nodejs.org/en) or later +- A Speechmatics account and API key + +## Step 1: Setup project, dependencies and API key + +We will be using NextJS 15 with App Router and Typescript. 
We will also use TailwindCSS for styling, but feel free to use any styling solution you prefer. + +### Create a new Next.js project + +```sh +npx create-next-app@latest nextjs-flow-guide --typescript --eslint --app +``` + +### Install Speechmatics packages + +```sh +# Official Flow API client for React +npm install @speechmatics/flow-client-react + +# Used for requesting JWTs for API authentication +npm install @speechmatics/auth + +# These let us capture and play raw audio in the browser easily +npm install @speechmatics/browser-audio-input +npm install @speechmatics/browser-audio-input-react +npm install @speechmatics/web-pcm-player-react + +# Utility package for rendering the transcript of the conversation +npm install @speechmatics/use-flow-transcript + +``` + +### Install TailwindCSS + +These steps are from the[Tailwind docs here](https://tailwindcss.com/docs/installation/framework-guides/nextjs). + +1. Install TailwindCSS +```sh +npm install @tailwindcss/postcss +``` + +2. Create a `postcss.config.mjs` file in the root of the project with the following content: + + + {postcssConfigExample} + + +Finally, remove all styles from the `globals.css` file, and replace it with the following content: + + + {globalsCssExampleStepOne} + + + +### Add your API key to `.env` + +Create a `.env` file in the root of the project, and add your API key: + + + {"API_KEY=\"your-api-key\""} + + +## Step 2: Configuration and Context providers + +### Configure Webpack to serve AudioWorklet script + +To interface with the Flow API, we need to record and send raw audio. The `@speechmatics/browser-audio-input` package is designed to do this. It achieves this by providing a script which can be loaded by an [AudioWorklet](https://developer.mozilla.org/en-US/docs/Web/API/AudioWorklet), but how this script is consumed depends on the bundler being used. + +In order to use this package with NextJS, we need to configure Webpack to serve the provided script from a URL, rather than bundling it with the rest. We can leverage [Asset Modules](https://webpack.js.org/guides/asset-modules/) to achieve this. + + + {nextjsConfigExample} + + + +### Audio and Context providers + +#### Context providers + +We will be using 3 context providers in the app: + +1. **FlowProvider** - Provides the Flow client to the app. +2. **PCMAudioRecorderProvider** - Given an `AudioContext`, provides the browser audio input to the app. +3. **PCMAudioPlayerProvider** - Given an `AudioContext`, provides the web PCM player to the app. + +We'll start by creating a `providers.tsx` file in the `app` directory, and adding the following content: + +Here we add the 3 context providers to the app, passing the `AudioContext` instances to both the audio providers, and the `workletScriptURL` to `PCMAudioRecorderProvider`. + + {providersStepTwo} + + +:::info +#### A note about `AudioContext` and sample rates in Firefox + +[AudioContext](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext) is the WebAPI for handling audio recording and playback. Under normal circumstances, you should aim to have [one reusable instance](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext#:~:text=It%27s%20recommended%20to%20create%20one%20AudioContext%20and%20reuse%20it%20instead%20of%20initializing%20a%20new%20one%20each%20time) of an `AudioContext` in your app. 
In most browsers, an `AudioContext` can freely record and play back audio at different sample rates, but this is not the case in Firefox (see outstanding [bug here](https://bugzilla.mozilla.org/show_bug.cgi?id=1725336)). + + +To handle this, we can create a utility hook to expose separate `AudioContext` instances for recording and playback in Firefox, while sharing a single instance for other browsers (see below). + +::: + + + {useAudioContextsExample} + + +Now place the following code in the `app/page.tsx` file: + + + {pageStepTwoExample} + + +:::tip +If you get an error about `AudioWorkletProcessor` not being defined, make sure you [configured Webpack to serve the script URL](#configure-webpack-to-serve-audioworklet-script). +::: + +## Step 3: Implementing the UI + + +### Wireframe and styles + +The UI will follow this wireframe: + + + + + + + +

**Controls**

+

Where we can select the input device and persona, and start/stop the session.

+
+
+ + +

**Status**

+

Displays the current status of the connection.

+
+
+
+ + +

**TranscriptView**

+

Displays the transcript of the conversation.

+
+
+
+
+
+ +We'll start by adding some basic styles to the `app/globals.css` file: + + + {globalsCssExampleStepThree} + + + +### `Controls` component + +This component will contain: + +- A dropdown to select the input device +- A dropdown to select the persona +- A button to start/stop the session +- A button to mute the microphone when the session is active + +To connect to the API, we will also need to setup a [Server Action](https://react.dev/reference/rsc/server-functions) to request a JWT from the backend. We can then call this server action in our component. + +Here we define the controls component: + - `form` contains the dropdowns and buttons + - When the form is submitted, we call the `getJWT` server action, then pass the JWT to the `startConversation` function, along with the config from the FormData. + + + {controlsExample} + + +We also create a utility component to render the microphone select dropdown. It also handles prompting the user for permission to use the microphone. + + {microphoneSelectExample} + + +Finally we define the server action to request a JWT from the backend. + + {serverActionExample} + + + +### `Status` component + +This component will display: + - The status of the Websocket connection + - The Session ID of the current conversation + - Whether the microphone is recording + + + {statusExample} + + + +### `TranscriptView` component + +This component will use the `useFlowTranscript` hook to display the transcript of the conversation. + +:::tip +The `useFlowTranscript` hook is provided for convenience. If you want more fine-grained control over the transcript you should use the `useFlowEventListener` hook to listen for incoming events, and handle them as you see fit. +::: + + + {transcriptViewExample} + + +### Putting it all together + +Now we can update the `app/page.tsx` file to use the new components: + +:::note +Since the component in `page.tsx` is a [React Server Component](https://react.dev/reference/rsc/server-components), we can use it to fetch the list of personas from the backend, and pass it to the `Controls` component. +::: + + + {pageStepThreeExample} + + +## Running the app + +To run the app, use the following command: + +```sh +npm run dev +``` + +You should now be able to access the app at [`http://localhost:3000`](http://localhost:3000). \ No newline at end of file diff --git a/docs/voice-agents/flow/guides/react-native.mdx b/docs/voice-agents/flow/guides/react-native.mdx new file mode 100644 index 00000000..2416b069 --- /dev/null +++ b/docs/voice-agents/flow/guides/react-native.mdx @@ -0,0 +1,254 @@ +--- +title: Build a conversational AI app with React Native and Flow +description: 'Learn how to create a mobile application that integrates Speechmatics Flow service using React Native.' 
+sidebar_label: React Native +--- + +import AppScreenshotUrl from "@site/static/img/flow-react-native.png"; +import CodeBlock from "@theme/CodeBlock"; +import { Kbd } from "@radix-ui/themes"; +import { GithubIcon } from "lucide-react"; + +import indexStepOne from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-1/app/index.tsx"; +import indexStepTwo from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-2/app/index.tsx"; +import indexStepThree from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-3/app/index.tsx"; +import volumeDisplay from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-4/app/volume-display.tsx"; +import indexStepFour from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-4/app/index.tsx"; + +# Build a conversational AI app with React Native and Flow + +This guide demonstrates how to build the app using the Expo framework, implementing real-time audio communication with Flow's servers. + +You can find the complete code on [GitHub ](https://github.com/speechmatics/flow-react-native-guide). + + + +## Prerequisites + +Before getting started, ensure you have: + +- [Node.js (LTS)](https://nodejs.org/en/download) installed on your system +- Development environment configured for development builds: + - [Android Development](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=android&device=simulated) + - [iOS Development](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=ios&device=simulated) + +## Project Setup + +Start by creating a fresh Expo project: + +```sh +npx create-expo-app@latest +``` + +To remove the example code and start with a clean slate: + +```sh +npm run reset-project +``` + +This command preserves the example files by moving them to an 'app-example' directory while creating a new clean app directory. +You can safely remove the 'app-example' directory if you don't need it for reference. + +## Essential Dependencies + +Install the following packages to enable Flow integration and audio handling: + +```sh +# React version of Flow client +npm i @speechmatics/flow-client-react + +# Polyfill for the EventTarget class +npm i event-target-polyfill + +# Expo native module to handle audio +npm i @speechmatics/expo-two-way-audio + +# Just for the purpose of this example. See comment in the code above `createSpeechmaticsJWT` +npm i @speechmatics/auth +``` + +:::info +The Flow client uses [EventTarget](https://developer.mozilla.org/en-US/docs/Web/API/EventTarget) which is typically available in browsers but not in react native. +For that reason we've installed the polyfill: [`event-target-polyfill`](https://www.npmjs.com/package/event-target-polyfill). +::: + +## Building the User Interface + +Let's create a minimal user interface. +Start by clearing the `app/` directory and creating a new `index.tsx` file with a basic UI structure: + + + {indexStepOne} + + +The view above will just render a Connect/Disconnect button that won't do anything yet. + +Let's run it on the simulator to see how it looks: + +```sh +# For iOS simulator +npx expo run ios + +# For Android emulator +npx expo run android +``` + +This will launch the Metro Bundler and show up some options. 
+ +:::warning +#### Expo Go is not supported +If it shows the following: `Using Expo Go`, **we need to switch to a development build by pressing** s. Then press r to reload the app. +Some features that we are going to include, like the native module for handling audio, don't work properly in Expo Go. +::: + +## Implementing Flow Connection + +It's time to add some functionality to this example. +We'll start by implementing the connect and disconnect logic. +For that we are going to use the `@speechmatics/flow-client-react` package. + +Our `/app/index.tsx` file should now look as follows: + + + {indexStepTwo} + + +In the code above, we are injecting the API key from an environment variable. +To make the environment variable available, let's create a `.env` file in the root directory of the project with the following content: + + + {"EXPO_PUBLIC_SPEECHMATICS_API_KEY='YOUR_API_KEY_GOES_HERE"} + + +API keys can be obtained from the [Speechmatics Portal](https://portal.speechmatics.com/api-keys) + +:::danger +This is just an example app. +In a real app you should obtain the JWT from your server. +`createSpeechmaticsJWT` could be used on a server running JS. +Otherwise, you will expose your API key to the client. +::: + +## Audio Integration + +The final step is implementing two-way audio communication. This involves three crucial tasks: + +1. Microphone input capture in PCM format +2. Speaker output routing for Flow responses +3. Acoustic Echo Cancellation (AEC) to prevent audio feedback + +We'll use the Speechmatics Expo Two Way Audio module to handle these requirements efficiently. + +In order to allow microphone access we need to add some configuration to the `app.json` file in the root of our project. +For iOS we add an `infoPlist` entry and for Android a `permissions` entry. + + +{` +{ + "expo": { + // ... + "ios": { + "infoPlist": { + "NSMicrophoneUsageDescription": "Allow Speechmatics to access your microphone" + }, + // ... + }, + "android": { + "permissions": ["RECORD_AUDIO", "MODIFY_AUDIO_SETTINGS"], + // ... + } + } + // ... +} +`} + + +Now we will update the code to handle these microphone adjustments. Our `/app/index.tsx` file should look as follows: + + + {indexStepThree} + + +Since we've introduced some app configuration changes in app.json, let's build the app from scratch (cleaning the `ios` or `android` folders): + +```sh +# iOS +npx expo prebuild --clean -p ios +npx expo run:ios + +# Android +npx expo prebuild --clean -p android +npx expo run:android +``` + +Our app can now unmute the microphone to send audio samples to the Flow server. Audio messages received from the Flow server will be played back through the speaker. + +:::info +While simulators are great for initial testing, features like Acoustic Echo Cancellation require physical devices for proper functionality. +If you find that the simulator audio support is not working or is not good enough we strongly recommend [testing on physical devices](#testing-on-physical-devices). +::: + +## Volume Indicators + +To enhance our UI, we'll add volume indicators for both the microphone and speaker, and organize our buttons into a "bottom bar." + +**Design Overview** + +- Volume Indicators: These will consist of two concentric circles: + - Outer Circle: Represents the speaker volume. + - Inner Circle: Represents the microphone volume. +- Animation: The circles will animate to grow or shrink based on the current volume level. 
+ +**Implementation Details** + +We'll use the `react-native-reanimated` library to handle the animations. +This library is often included by default in Expo apps, but if it's not, you can follow the installation instructions [here](https://docs.swmansion.com/react-native-reanimated/docs/fundamentals/get-started/#installation). + +To keep things organised, let’s create a new file inside the app folder to house our custom volume indicator component. + + + {volumeDisplay} + + +Next, we'll integrate our volume indicator component into the app. + +Our `/app/index.tsx` file should look as follows: + + + {indexStepFour} + + +We have successfully completed our conversational AI application! You can connect to Flow services and unmute the microphone to start a conversation. + +## Testing on Physical Devices + +To deploy to real devices, first configure the development environment for using local builds with physical devices: + +- [iOS](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=ios&device=physical) +- [Android](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local) + +Then run the following command: + +```sh +# For iOS devices +npx expo run:ios --device --configuration Release + +# For Android devices +npx expo run:android --device --variant release +``` + +## Additional resources + +Dive deeper into the tools used in this guide: + +### Speechmatics JS SDK + +- [Repository](https://github.com/speechmatics/speechmatics-js-sdk) + +### Expo Two Way Audio + +- [Repository](https://github.com/speechmatics/expo-two-way-audio) +- [Examples](https://github.com/speechmatics/expo-two-way-audio/tree/main/examples) diff --git a/docs/voice-agents/flow/index.md b/docs/voice-agents/flow/index.md new file mode 100644 index 00000000..aae9d1fb --- /dev/null +++ b/docs/voice-agents/flow/index.md @@ -0,0 +1,62 @@ +--- +title: Overview +description: Build conversational AI agents with the Flow API +--- + +# Overview + +:::info +Try Flow free for up to 50 hours per month. +::: + +[**Flow**](https://speechmatics.com/flow) is our Voice Agent API that allows you to add responsive, real-time speech-to-speech interactions to any product. + +Flow is engineered to engage in natural and fluid conversations by automatically handling interruptions, responding to multiple speakers, and understanding different dialects and accents. + +## How Flow works + +Built on top of Speechmatics' [industry-leading ASR](/speech-to-text/realtime/quickstart), the latest LLMs and text to speech, Flow is engineered to engage in natural and fluid conversations. + +Simply stream in audio, and Flow will provide the TTS response as well as other useful information. + +### Component models + +The three base components of the Flow Engine are speech to text, large language model, and text to speech. + +#### Speech to text (ASR) + +Flow is built on the foundations of Speechmatics' market-leading Realtime ASR. The client passes streaming audio to the Flow service through the WebSocket. The service then processes multiple speech & non-speech signals such as the spoken words, tonality, & audio events before passing the context to the LLM to formulate a response. + +Flow natively supports multiple speaker detection (Speaker Diarization). Flow can be configured to ignore, acknowledge or engage with non-primary speakers when setting up Agents. 
+ +This transcribed text is also streamed back to the client as soon as it is generated to support any client-driven recording, monitoring & analytics workflows. + +To improve accuracy on product-specific terminology we recommend using a Custom Dictionary when setting up Agents in the [Portal](https://portal.speechmatics.com/). + +#### Large language model (LLM) + +Flow’s conversational understanding & knowledge is powered by LLMs. The transcribed text from the ASR is then passed with Flow configurations to the LLM to formulate a natural-sounding response. + +The response-generation can be influenced through defining a persona, style, and context when setting up Templates. + +#### Text to speech (TTS) + +Output generated by the LLM, when ready to be spoken, will be converted to audio through the chosen TTS engine. These engines were selected to provide the most natural-sounding responses while not trading off on latency. This audio is then streamed back to the client, who must then play this back to the user. + +## Flow engine + +### Understanding disfluencies & pacing + +Everyone has a different style of speaking. Natural speech is colored with filler sounds and the pace of speech can vary from speaker to speaker. A one-size-fits-all voice agent can add a lot of friction to the experience if it keeps interrupting you. We’ve designed Flow to adapt to your speaking style and not be over-eager to interrupt, helping to make users feel comfortable. + +### Handling interruptions + +Flow has been modelled on real-world human conversations. Whether it is to stop Flow from going off-track or to correct wrong assumptions, you can interrupt it. We’ve built our own interruption engine that intelligently ignores unintentional interruptions and gracefully handles the ones that it needs to. To avoid sounding abrupt and unnatural when interrupted, Flow will finish the current word that’s being spoken and gradually fade out the next one. + +### End-of-turn detection + +Based on your voice & what you’ve been saying, Flow uses a [small language model (SLM) architecture](https://blog.speechmatics.com/semantic-turn-detection) to smartly detect when you’re done speaking before it responds for a natural and responsive experience. Flow is built to be human-centric and, while we could achieve much lower latencies, it’s rude to interrupt mid-thought. + +### Help and support + +For any additional issues, please reach out to the Flow Support team at [flow-help@speechmatics.com](mailto:flow-help@speechmatics.com). diff --git a/docs/voice-agents/flow/setup.mdx b/docs/voice-agents/flow/setup.mdx new file mode 100644 index 00000000..9d07e642 --- /dev/null +++ b/docs/voice-agents/flow/setup.mdx @@ -0,0 +1,44 @@ +--- +sidebar_label: Setup +description: 'Learn about the setup for Flow' +--- + +import SchemaNode from "@theme/Schema"; +import flowSchema from "!openapi-schema-loader!@site/spec/flow-api.yaml"; + +# Flow setup + +A voice agent template covers multiple elements that typically need to be configured in concert to power a specific class of conversations in a human-facing application. + + +Flow can be configured using the following parameters: + + + +
+
For more details, refer to the [StartConversation API reference](/api-ref/flow-voice-ai-websocket#startconversation).

### Function calling

[Function Calling](/voice-agents-flow/features/function-calling) allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real world and better serve the needs of your users.

This could involve fetching real-time information such as opening and closing times, calling validation services for authentication, or invoking action APIs that control a fast-food system while a drive-thru order is placed.

### Moderating and controlling conversations

You might want to control an ongoing conversation based on what the user says or what the LLM outputs. This could involve situations where the agent is asked to do things that are out of scope, or where the conversation is heading in an unintended direction. We enable this by sharing the real-time speech transcript (AddPartialTranscript/AddTranscript) and the entire LLM response just before it is spoken (ResponseStarted). We recommend building monitoring on top of these streams and using either AudioEnded to end the session, or closing the WebSocket directly if the final transcript is unimportant. A minimal sketch of this pattern is shown at the end of this page.

#### Steering the conversation

[Application Inputs](/voice-agents-flow/features/application-inputs) allow you to steer the conversation by adding helpful updates & information to Flow asynchronously.

### Managing call recordings and transcripts

Clients are responsible for maintaining their own recordings & conversation logs. This is enabled by routing all audio through the client, and by providing conversation transcripts in real time through AddPartialTranscript, AddTranscript, ResponseStarted, ResponseCompleted and ResponseInterrupted.

### Internet search

Internet Search allows your agent to look up information such as the weather and the news by accessing the internet.

Internet Search is currently only available when using the official [iPhone](https://apps.apple.com/us/app/speechmatics-flow/id6673918783) or [Android](https://play.google.com/store/apps/details?id=com.speechmatics.flowapp) applications, or for Enterprise customers. 
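
### Example: monitoring a conversation

To make the monitoring recommendation above concrete, here is a minimal sketch using the `speechmatics-flow` Python client from the function-calling example. Treat it as a starting point rather than a drop-in implementation: the `ServerMessageType.AddTranscript` and `ServerMessageType.ResponseStarted` handler registrations, the payload fields read from those messages, closing `client.websocket` directly, and the `BLOCKED_PHRASES` list are all assumptions to verify against the [API reference](/api-ref/flow-voice-ai-websocket) and your own moderation rules. Audio playback is omitted for brevity.

```python
import asyncio
import sys

from speechmatics_flow.client import WebsocketClient
from speechmatics_flow.models import (
    AudioSettings,
    ConnectionSettings,
    ConversationConfig,
    Interaction,
    ServerMessageType,
)

# Illustrative only - replace with your own moderation rules.
BLOCKED_PHRASES = ("refund", "cancel my account")

client = WebsocketClient(
    ConnectionSettings(
        url="wss://flow.api.speechmatics.com/v1/flow",
        auth_token="YOUR_API_KEY",
    )
)

# Client-side conversation log (see "Managing call recordings and transcripts").
conversation_log = []


async def on_transcript(msg: dict):
    """Record the final transcript and end the session if it goes out of scope."""
    # Assumes AddTranscript carries the spoken sentence under metadata.transcript,
    # as in the Speechmatics real-time API.
    transcript = msg.get("metadata", {}).get("transcript", "")
    conversation_log.append(("user", transcript))
    if any(phrase in transcript.lower() for phrase in BLOCKED_PHRASES):
        # The docs recommend sending AudioEnded or closing the WebSocket directly.
        # Here we close the connection; production code should also handle the
        # connection-closed error this raises inside client.run().
        await client.websocket.close()


def on_response_started(msg: dict):
    """Log the full LLM response just before it is spoken."""
    # Assumes ResponseStarted exposes the response text in a "content" field.
    conversation_log.append(("agent", msg.get("content", "")))


client.add_event_handler(ServerMessageType.AddTranscript, on_transcript)
client.add_event_handler(ServerMessageType.ResponseStarted, on_response_started)


async def main():
    # Stream audio from stdin, as in the function-calling example.
    await client.run(
        interactions=[Interaction(sys.stdin.buffer)],
        audio_settings=AudioSettings(),
        conversation_config=ConversationConfig(),
    )


asyncio.run(main())
```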
diff --git a/docs/voice-agents/flow/sidebar.ts b/docs/voice-agents/flow/sidebar.ts new file mode 100644 index 00000000..b6f69577 --- /dev/null +++ b/docs/voice-agents/flow/sidebar.ts @@ -0,0 +1,61 @@ +export default { + type: "category", + label: "Flow", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + label: "Overview", + id: "voice-agents/flow/index", + }, + { + type: "category", + label:"Features", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "voice-agents/flow/features/application-inputs", + }, + { + type: "doc", + id: "voice-agents/flow/features/function-calling", + }, + { + type: "doc", + id: "voice-agents/flow/features/webrtc-livekit", + }, + ], + }, + { + type: "category", + label:"Guides", + collapsible: true, + collapsed: true, + items: [ + { + type: "doc", + id: "voice-agents/flow/guides/nextjs-guide", + }, + { + type: "doc", + id: "voice-agents/flow/guides/react-native", + }, + ], + }, + { + type: "doc", + id: "voice-agents/flow/setup", + }, + { + type: "doc", + id: "voice-agents/flow/supported-formats-and-limits", + }, + { + type: "doc", + id: "voice-agents/flow/supported-languages", + }, + ], +} as const; \ No newline at end of file diff --git a/docs/voice-agents/flow/supported-formats-and-limits.mdx b/docs/voice-agents/flow/supported-formats-and-limits.mdx new file mode 100644 index 00000000..6021e150 --- /dev/null +++ b/docs/voice-agents/flow/supported-formats-and-limits.mdx @@ -0,0 +1,39 @@ +--- +keywords: [speechmatics, api, limits, flow, voice agents] +toc_max_heading_level: 2 +description: 'Learn about the supported input and output audio formats for the Flow API' +--- + +# Supported Formats and Limits + +## Input audio + +All input audio (i.e. the user's voice) sent to the Flow API must be raw PCM audio in one of the following formats: + +- PCM F32 LE raw audio stream (32-bit float) +- PCM S16 LE raw audio stream (16-bit signed int) + +Other audio encodings are not supported. Sample rates are not restricted, but we recommend using **16kHz**. + +## Output audio + +The Flow API will always return PCM audio in **PCM S16 LE** format, regardless of the input format. + +The output audio sample rate is always **16kHz**. + +## Usage Limits + +The Flow API limits the number of hours of audio users can process each month to help manage load on our servers. All users are limited to 50 hours per month and three concurrent sessions. + +Please reach out to [Support](https://support.speechmatics.com) if you need to increase the above limits. + +### Guidance for users + +Clients can disconnect a session before it is automatically terminated and immediately reconnect a new session. Note that new sessions will typically start in less than a second. +If seamless transition is required, the new session can be connected a few seconds before disconnecting the old session. + +Since unpredictable network issues can cause WebSocket connections to be dropped, we recommend graceful handling of session termination for long-running sessions. + +## Data Retention + +Conversation audio and transcriptions by the Flow API are not stored. 
\ No newline at end of file diff --git a/docs/voice-agents/flow/supported-languages.mdx b/docs/voice-agents/flow/supported-languages.mdx new file mode 100644 index 00000000..5c5eaf0c --- /dev/null +++ b/docs/voice-agents/flow/supported-languages.mdx @@ -0,0 +1,53 @@ +--- +description: 'Learn about the languages supported in Flow' +--- + +# Languages Supported + +Speechmatics offers over 30 languages in Flow with more on the way. The following is a list of supported languages & voices. These can be configured when creating an agent in the [portal](https://portal.speechmatics.com/) for use in the API. + +| Language | Voices | +| ---------------------------- | ------------------------------- | +| Arabic | Arabic Female | +| Bulgarian | Bulgarian Male | +| Croatian | Croatian Male | +| Czech | Czech Female | +| Danish | Danish Female | +| Dutch | Dutch Female | +| English | English (British) Female, English (British) Male, English (American) Male, English (American) Female | +| Finnish | Finnish Female | +| French | French Male | +| German | German Male | +| Greek | Greek Male | +| Hindi | Hindi Female | +| Hungarian | Hungarian Female | +| Indonesian | Indonesian Female | +| Italian | Italian Female | +| Japanese | Japanese Female | +| Korean | Korean Female | +| Malay | English/Malay Female | +| Mandarin | English/Mandarin Male | +| Norwegian | Norwegian Male | +| Polish | Polish Male | +| Portuguese | Portuguese Male | +| Romanian | Romanian Female | +| Russian | Russian Male | +| Slovakian | Slovak Male | +| Spanish | English/Spanish Male, English/Spanish (Colombian) Male, English/Spanish (Colombian) Female | +| Swedish | Swedish Female | +| Tamil | English/Tamil Female | +| Thai | Thai Female | +| Turkish | Turkish Female | +| Ukrainian | Ukrainian Female | +| Vietnamese | Vietnamese Female | + +## Bilingual Support + +For the following languages & voices, the agents created are able to communicate in the primary language of the language pack as well as English without compromising the accuracy of either. 
+ +| Bilingual Pack | Voices | +| ---------------------------- | ------------------------------- | +| Spanish / English | English/Spanish Male, English/Spanish (Colombian) Male, English/Spanish (Colombian) Female | +| Mandarin / English | Mandarin/English Male | +| Tamil / English | Tamil/English Female | +| Malay / English | Malay/English Female | \ No newline at end of file diff --git a/docs/voice-agents/sidebar.ts b/docs/voice-agents/sidebar.ts index f93c8697..f14bba42 100644 --- a/docs/voice-agents/sidebar.ts +++ b/docs/voice-agents/sidebar.ts @@ -1,4 +1,5 @@ -import voiceAgentsFlowSidebar from "./voice-agents-flow/sidebar"; +import voiceAgentsFlowSidebar from "./flow/sidebar"; + export default { type: "category", label: "Voice agents", From e1cfe2f93f2c126b3700a83ad07e813638f267f1 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 17:08:01 +0000 Subject: [PATCH 09/12] Remove duplicate Flow feature docs --- .../features/application-inputs.mdx | 30 -- .../features/assets/function-calling.py | 133 --------- .../features/assets/livekit-poc.html | 241 --------------- .../features/function-calling.mdx | 83 ------ .../features/webrtc-livekit.mdx | 55 ---- .../voice-agents-flow/guides/nextjs-guide.mdx | 274 ------------------ .../voice-agents-flow/guides/react-native.mdx | 254 ---------------- docs/voice-agents/voice-agents-flow/index.md | 62 ---- docs/voice-agents/voice-agents-flow/setup.mdx | 44 --- .../voice-agents/voice-agents-flow/sidebar.ts | 61 ---- .../supported-formats-and-limits.mdx | 39 --- .../voice-agents-flow/supported-languages.mdx | 53 ---- 12 files changed, 1329 deletions(-) delete mode 100644 docs/voice-agents/voice-agents-flow/features/application-inputs.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/features/assets/function-calling.py delete mode 100644 docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html delete mode 100644 docs/voice-agents/voice-agents-flow/features/function-calling.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/guides/react-native.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/index.md delete mode 100644 docs/voice-agents/voice-agents-flow/setup.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/sidebar.ts delete mode 100644 docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx delete mode 100644 docs/voice-agents/voice-agents-flow/supported-languages.mdx diff --git a/docs/voice-agents/voice-agents-flow/features/application-inputs.mdx b/docs/voice-agents/voice-agents-flow/features/application-inputs.mdx deleted file mode 100644 index c3299c50..00000000 --- a/docs/voice-agents/voice-agents-flow/features/application-inputs.mdx +++ /dev/null @@ -1,30 +0,0 @@ ---- -description: 'Learn about the application inputs for Flow' ---- - -import flowSpec from "!openapi-schema-loader!@site/spec/flow-api.yaml"; -import SchemaNode from "@theme/Schema"; -import { omit } from "lodash"; - -# Application Inputs - -Voice-enabled applications often need to react to events beyond user speech. For instance: -- A user pauses for too long, requiring a prompt to keep the conversation going -- An external notification or system event which must be conveyed during a voice conversation - -To handle these scenarios, the application can send an AddInput message to the Flow system. 
This allows non-verbal or system-driven inputs to be injected into the conversation dynamically. - -```json -{ - "message": "AddInput", - "input": "", // Required: The input text to be incorporated - "interrupt_response": "", // Optional: If true, interrupts the current response (default: false) - "immediate": "" // Optional: If true, processes the input immediately after the current turn ends (default: false) -} -``` - -## Application Input variables - -See [API reference](/api-ref/flow-voice-ai-websocket#addinput) for complete schema. - - diff --git a/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py b/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py deleted file mode 100644 index ec8e740f..00000000 --- a/docs/voice-agents/voice-agents-flow/features/assets/function-calling.py +++ /dev/null @@ -1,133 +0,0 @@ -import asyncio -import io -import sys -import json - -import pyaudio - -from speechmatics_flow.client import WebsocketClient -from speechmatics_flow.models import ( - ConnectionSettings, - Interaction, - AudioSettings, - ConversationConfig, - ServerMessageType, - ClientMessageType, -) -from speechmatics_flow.tool_function_param import ToolFunctionParam - -AUTH_TOKEN = "Place your auth token here" - -# Example configuration which could add a reminder to a calendar. -reminder_config = ToolFunctionParam( - type="function", - function={ - "name": "add_reminder", - "description": "Use this to schedule reminders. Needs a confirmation.", - "parameters": { - "type": "object", - "properties": { - "date": { - "type": "string", - "description": "The date for the reminder in dd/mm/yyyy format", - }, - "time": { - "type": "string", - "description": "The time for the reminder in 24 hour hh:mm format", - }, - "title": { - "type": "string", - "description": "The title for the reminder", - }, - "project": { - "type": "string", - "description": "Which project the reminder is related to. If not provided, leave blank.", - }, - }, - "required": ["project"], - }, - }, -) - - -# Callback for handling reminder ToolInvoke in your system. -async def reminder_handler(msg: dict): - print("Attempting to add reminder") - print(msg) - response_message = { - "message": ClientMessageType.ToolResult, - "id": msg["id"], - "status": "ok", # Used to inform user the status of the function call. Could be "failed" or "rejected". 
- "content": "Added reminder successfully to calendar", # LLM response helper message - } - - await client.websocket.send(json.dumps(response_message)) - - -# Create a websocket client -client = WebsocketClient( - ConnectionSettings( - url="wss://flow.api.speechmatics.com/v1/flow", - auth_token=AUTH_TOKEN, - ) -) - -# Create a buffer to store binary messages sent from the server -audio_buffer = io.BytesIO() - - -# Create callback function which adds binary messages to audio buffer -def binary_msg_handler(msg: bytes): - if isinstance(msg, (bytes, bytearray)): - audio_buffer.write(msg) - - -# Register the callback which will be called -# when the client receives an audio message from the server -client.add_event_handler(ServerMessageType.AddAudio, binary_msg_handler) - -# Handling ToolInvoke message -client.add_event_handler(ServerMessageType.ToolInvoke, reminder_handler) - - -async def audio_playback(buffer): - """Read from buffer and play audio back to the user""" - p = pyaudio.PyAudio() - stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True) - try: - while True: - # Get the current value from the buffer - audio_to_play = buffer.getvalue() - # Only proceed if there is audio data to play - if audio_to_play: - # Write the audio to the stream - stream.write(audio_to_play) - buffer.seek(0) - buffer.truncate(0) - # Pause briefly before checking the buffer again - await asyncio.sleep(0.05) - finally: - stream.close() - stream.stop_stream() - p.terminate() - - -async def main(): - print("Starting...") - tasks = [ - # Use the websocket to connect to Flow Service and start a conversation - asyncio.create_task( - client.run( - interactions=[Interaction(sys.stdin.buffer)], - audio_settings=AudioSettings(), - conversation_config=ConversationConfig(), - tools=[reminder_config], - ) - ), - # Run audio playback handler which streams audio from audio buffer - asyncio.create_task(audio_playback(audio_buffer)), - ] - await asyncio.gather(*tasks) - - -asyncio.run(main()) diff --git a/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html b/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html deleted file mode 100644 index 1678c3c0..00000000 --- a/docs/voice-agents/voice-agents-flow/features/assets/livekit-poc.html +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - Minimal LiveKit Flow Client - - - - -

Minimal LiveKit Flow Client

- -
- -
-
Status: disconnected
- -
- - - - diff --git a/docs/voice-agents/voice-agents-flow/features/function-calling.mdx b/docs/voice-agents/voice-agents-flow/features/function-calling.mdx deleted file mode 100644 index e7b6d13e..00000000 --- a/docs/voice-agents/voice-agents-flow/features/function-calling.mdx +++ /dev/null @@ -1,83 +0,0 @@ ---- -description: 'Learn about the function calling feature for Flow' ---- - -import CodeBlock from '@theme/CodeBlock'; -import pythonFunctionCallingExample from './assets/function-calling.py'; -import SchemaNode from '@theme/Schema'; -import flowSpec from '!openapi-schema-loader!@site/spec/flow-api.yaml'; -import { Callout, Flex, Text } from '@radix-ui/themes'; -import { ConstructionIcon } from 'lucide-react'; - -# Function Calling - -Function Calling allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real-world and better serve the needs of your users. - -This could involve needing real-time information such as opening/closing times or validation services for authentication or action APIs that control a fast food system while placing a drive-thru order. - -Based on what the user says in the conversation, Flow will recognise the user's intentions and extract out the key information that your system needs to complete the function call. - -For example, you may want Flow to add reminders in a user's calendar: - -```json -{ - "name": "add_reminder", - "description": "Use this to schedule reminders. Needs a confirmation.", - "parameters": { - "type": "object", - "properties": { - "date" : { - "type" : "string", - "description" : "The date for the reminder in dd/mm/yyyy format" - }, - "time" : { - "type": "string", - "description" : "The time for the reminder in 24 hour hh:mm format" - }, - "title" : { - "type": "string", - "description" : "The title for the reminder" - }, - "project": { - "type": "string", - "description": "Which project the reminder is related to. If not provided, leave blank." - } - }, - "required": ["project"] - } -} -``` - -## Configuring Function Calling - -An agent can be configured to use function calling in two ways: - -1. **In code**: when starting a session with the [`StartConversation` message](/api-ref/flow-voice-ai-websocket#startconversation) -2. (_coming soon_) **In the portal**: when configuring an agent - -### In the portal - -[Create an agent in the portal](https://portal.speechmatics.com/create-agent) and enable function calling in the agent settings. - -### In `StartConversation` - -Functions must be declared within a list of tools when your client sends the [StartConversation message](/api-ref/flow-voice-ai-websocket#startconversation). Each function must be defined with the following: - - - - -## Example - - -{pythonFunctionCallingExample} - - -## Considerations - -- Function `status` - The client must inform the service of whether the function call succeeded or not. This allows the service to inform the user of the result. There is no automatic timeout on the Flow API. -- Asynchronous - Function calling is fully asynchronous. Once the client is informed of the function call, the conversation will continue to progress until a function call status update is received from the client. This is to continue providing a natural conversational experience to the customer. -- Completion Message - Flow can play a message on completion of the function call. The Client can switch this off by passing `` in the content field of the ToolResult message. 
- -:::note -Since LLMs are semantically instructed, complete, narrow and unambiguous function calls with simple descriptions can create a reliable customer experience. Complex business logic should be handled within your client. -::: diff --git a/docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx b/docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx deleted file mode 100644 index c73dcc63..00000000 --- a/docs/voice-agents/voice-agents-flow/features/webrtc-livekit.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -description: 'Learn how to use the Flow API over WebRTC with LiveKit' ---- - -import CodeBlock from "@theme/CodeBlock"; -import LivekitPoc from "./assets/livekit-poc.html?raw"; - -# WebRTC over LiveKit - -A client may want to use Flow in unsure network conditions or through mobile devices with fluctuating networks. In such scenarios, we offer WebRTC protocol as a way to connect to Flow. More information about the protocol can be found on official [webrtc website](https://webrtc.org/). -Flow uses the WebRTC setup provided by LiveKit to enable support for the protocol. - -## API - -Client makes a HTTP POST request to `/v1/flow/livekit` endpoint, with body containing StartConversation message as described in the [Flow API reference](/api-ref/flow-voice-ai-websocket#startconversation). "audio_format" field must not be used in this scenario as LiveKit WebRTC takes control of the audio format. - -```json -{ - "message": "StartConversation", - "conversation_config": { - "template_id": "flow-service-assistant-one", - "template_variables": { - "timezone": "Europe/London" - } - } -} -``` - -## Response - -In response, a LiveKit room is created. The returned URL & token are used to connect to the LiveKit server. - -```json -{ - "url": "wss://test-app-d3kro1gz.livekit.cloud", - "token": "", - "id": "
" -} -``` - -## Connecting to LiveKit - -Provided JWT token has short TTL and should be used immediately after receiving it. - -The LiveKit SDK for a given platform should be used to connect to the LiveKit server. The SDK handles the connection and audio streaming, including bitrate management. -Text messages, or control messages are exchanged using LocalParticipant object. -Protocol for messages with the Flow Agent is the same as in case of normal WebSocket connection to Flow API. - -LiveKit documentation can be found [here](https://docs.livekit.io/home/client/connect/) - -## Example client in JavaScript - - - {LivekitPoc} - \ No newline at end of file diff --git a/docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx b/docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx deleted file mode 100644 index 3fe4b451..00000000 --- a/docs/voice-agents/voice-agents-flow/guides/nextjs-guide.mdx +++ /dev/null @@ -1,274 +0,0 @@ ---- -sidebar_label: NextJS -title: Build a conversational AI web app with Next.js and Flow -description: 'Learn how to build a conversational AI web app with Next.js and Flow' ---- - -import { Box, Card, Flex } from "@radix-ui/themes"; -import CodeBlock from "@theme/CodeBlock"; -import { GithubIcon } from "lucide-react"; - -{/* -------------- Step 1-------------- */} - -import postcssConfigExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-1/postcss.config.mjs"; -import globalsCssExampleStepOne from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-1/app/globals.css"; - -{/* -------------- Step 2 -------------- */} -import providersStepTwo from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/app/providers.tsx"; -import useAudioContextsExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/hooks/useAudioContexts.ts"; -import pageStepTwoExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/app/page.tsx"; -import nextjsConfigExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-2/next.config.ts"; - -{/* -------------- Step 3 -------------- */} -import globalsCssExampleStepThree from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/globals.css"; -import controlsExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/Controls.tsx"; -import serverActionExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/actions.ts"; -import microphoneSelectExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/MicrophoneSelect.tsx"; -import statusExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/Status.tsx"; -import transcriptViewExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/components/TranscriptView.tsx"; -import pageStepThreeExample from "?url=https://raw.githubusercontent.com/speechmatics/nextjs-flow-guide/refs/heads/step-3/app/page.tsx"; - - -# Build a conversational AI web app with Next.js and Flow - -In this guide, we will walk you through the process of building a conversational AI web application using Next.js and Flow. 
-You will learn how to set up your development environment, create a Next.js project, integrate Flow and implement a simple conversational AI feature. - -You can find the complete code on [GitHub ](https://github.com/speechmatics/nextjs-flow-guide). - -## Prerequisites - -Before getting started, ensure you have: - -- [Node.js 20](https://nodejs.org/en) or later -- A Speechmatics account and API key - -## Step 1: Setup project, dependencies and API key - -We will be using NextJS 15 with App Router and Typescript. We will also use TailwindCSS for styling, but feel free to use any styling solution you prefer. - -### Create a new Next.js project - -```sh -npx create-next-app@latest nextjs-flow-guide --typescript --eslint --app -``` - -### Install Speechmatics packages - -```sh -# Official Flow API client for React -npm install @speechmatics/flow-client-react - -# Used for requesting JWTs for API authentication -npm install @speechmatics/auth - -# These let us capture and play raw audio in the browser easily -npm install @speechmatics/browser-audio-input -npm install @speechmatics/browser-audio-input-react -npm install @speechmatics/web-pcm-player-react - -# Utility package for rendering the transcript of the conversation -npm install @speechmatics/use-flow-transcript - -``` - -### Install TailwindCSS - -These steps are from the[Tailwind docs here](https://tailwindcss.com/docs/installation/framework-guides/nextjs). - -1. Install TailwindCSS -```sh -npm install @tailwindcss/postcss -``` - -2. Create a `postcss.config.mjs` file in the root of the project with the following content: - - - {postcssConfigExample} - - -Finally, remove all styles from the `globals.css` file, and replace it with the following content: - - - {globalsCssExampleStepOne} - - - -### Add your API key to `.env` - -Create a `.env` file in the root of the project, and add your API key: - - - {"API_KEY=\"your-api-key\""} - - -## Step 2: Configuration and Context providers - -### Configure Webpack to serve AudioWorklet script - -To interface with the Flow API, we need to record and send raw audio. The `@speechmatics/browser-audio-input` package is designed to do this. It achieves this by providing a script which can be loaded by an [AudioWorklet](https://developer.mozilla.org/en-US/docs/Web/API/AudioWorklet), but how this script is consumed depends on the bundler being used. - -In order to use this package with NextJS, we need to configure Webpack to serve the provided script from a URL, rather than bundling it with the rest. We can leverage [Asset Modules](https://webpack.js.org/guides/asset-modules/) to achieve this. - - - {nextjsConfigExample} - - - -### Audio and Context providers - -#### Context providers - -We will be using 3 context providers in the app: - -1. **FlowProvider** - Provides the Flow client to the app. -2. **PCMAudioRecorderProvider** - Given an `AudioContext`, provides the browser audio input to the app. -3. **PCMAudioPlayerProvider** - Given an `AudioContext`, provides the web PCM player to the app. - -We'll start by creating a `providers.tsx` file in the `app` directory, and adding the following content: - -Here we add the 3 context providers to the app, passing the `AudioContext` instances to both the audio providers, and the `workletScriptURL` to `PCMAudioRecorderProvider`. - - {providersStepTwo} - - -:::info -#### A note about `AudioContext` and sample rates in Firefox - -[AudioContext](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext) is the WebAPI for handling audio recording and playback. 
Under normal circumstances, you should aim to have [one reusable instance](https://developer.mozilla.org/en-US/docs/Web/API/AudioContext#:~:text=It%27s%20recommended%20to%20create%20one%20AudioContext%20and%20reuse%20it%20instead%20of%20initializing%20a%20new%20one%20each%20time) of an `AudioContext` in your app. In most browsers, an `AudioContext` can freely record and play back audio at different sample rates, but this is not the case in Firefox (see outstanding [bug here](https://bugzilla.mozilla.org/show_bug.cgi?id=1725336)). - - -To handle this, we can create a utility hook to expose separate `AudioContext` instances for recording and playback in Firefox, while sharing a single instance for other browsers (see below). - -::: - - - {useAudioContextsExample} - - -Now place the following code in the `app/page.tsx` file: - - - {pageStepTwoExample} - - -:::tip -If you get an error about `AudioWorkletProcessor` not being defined, make sure you [configured Webpack to serve the script URL](#configure-webpack-to-serve-audioworklet-script). -::: - -## Step 3: Implementing the UI - - -### Wireframe and styles - -The UI will follow this wireframe: - - - - - - - -

**Controls**

-

Where we can select the input device and persona, and start/stop the session.

-
-
- - -

**Status**

-

Displays the current status of the connection.

-
-
-
- - -

**TranscriptView**

-

Displays the transcript of the conversation.

-
-
-
-
-
- -We'll start by adding some basic styles to the `app/globals.css` file: - - - {globalsCssExampleStepThree} - - - -### `Controls` component - -This component will contain: - -- A dropdown to select the input device -- A dropdown to select the persona -- A button to start/stop the session -- A button to mute the microphone when the session is active - -To connect to the API, we will also need to setup a [Server Action](https://react.dev/reference/rsc/server-functions) to request a JWT from the backend. We can then call this server action in our component. - -Here we define the controls component: - - `form` contains the dropdowns and buttons - - When the form is submitted, we call the `getJWT` server action, then pass the JWT to the `startConversation` function, along with the config from the FormData. - - - {controlsExample} - - -We also create a utility component to render the microphone select dropdown. It also handles prompting the user for permission to use the microphone. - - {microphoneSelectExample} - - -Finally we define the server action to request a JWT from the backend. - - {serverActionExample} - - - -### `Status` component - -This component will display: - - The status of the Websocket connection - - The Session ID of the current conversation - - Whether the microphone is recording - - - {statusExample} - - - -### `TranscriptView` component - -This component will use the `useFlowTranscript` hook to display the transcript of the conversation. - -:::tip -The `useFlowTranscript` hook is provided for convenience. If you want more fine-grained control over the transcript you should use the `useFlowEventListener` hook to listen for incoming events, and handle them as you see fit. -::: - - - {transcriptViewExample} - - -### Putting it all together - -Now we can update the `app/page.tsx` file to use the new components: - -:::note -Since the component in `page.tsx` is a [React Server Component](https://react.dev/reference/rsc/server-components), we can use it to fetch the list of personas from the backend, and pass it to the `Controls` component. -::: - - - {pageStepThreeExample} - - -## Running the app - -To run the app, use the following command: - -```sh -npm run dev -``` - -You should now be able to access the app at [`http://localhost:3000`](http://localhost:3000). \ No newline at end of file diff --git a/docs/voice-agents/voice-agents-flow/guides/react-native.mdx b/docs/voice-agents/voice-agents-flow/guides/react-native.mdx deleted file mode 100644 index 2416b069..00000000 --- a/docs/voice-agents/voice-agents-flow/guides/react-native.mdx +++ /dev/null @@ -1,254 +0,0 @@ ---- -title: Build a conversational AI app with React Native and Flow -description: 'Learn how to create a mobile application that integrates Speechmatics Flow service using React Native.' 
-sidebar_label: React Native ---- - -import AppScreenshotUrl from "@site/static/img/flow-react-native.png"; -import CodeBlock from "@theme/CodeBlock"; -import { Kbd } from "@radix-ui/themes"; -import { GithubIcon } from "lucide-react"; - -import indexStepOne from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-1/app/index.tsx"; -import indexStepTwo from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-2/app/index.tsx"; -import indexStepThree from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-3/app/index.tsx"; -import volumeDisplay from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-4/app/volume-display.tsx"; -import indexStepFour from "?url=https://raw.githubusercontent.com/speechmatics/flow-react-native-guide/refs/heads/step-4/app/index.tsx"; - -# Build a conversational AI app with React Native and Flow - -This guide demonstrates how to build the app using the Expo framework, implementing real-time audio communication with Flow's servers. - -You can find the complete code on [GitHub ](https://github.com/speechmatics/flow-react-native-guide). - - - -## Prerequisites - -Before getting started, ensure you have: - -- [Node.js (LTS)](https://nodejs.org/en/download) installed on your system -- Development environment configured for development builds: - - [Android Development](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=android&device=simulated) - - [iOS Development](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=ios&device=simulated) - -## Project Setup - -Start by creating a fresh Expo project: - -```sh -npx create-expo-app@latest -``` - -To remove the example code and start with a clean slate: - -```sh -npm run reset-project -``` - -This command preserves the example files by moving them to an 'app-example' directory while creating a new clean app directory. -You can safely remove the 'app-example' directory if you don't need it for reference. - -## Essential Dependencies - -Install the following packages to enable Flow integration and audio handling: - -```sh -# React version of Flow client -npm i @speechmatics/flow-client-react - -# Polyfill for the EventTarget class -npm i event-target-polyfill - -# Expo native module to handle audio -npm i @speechmatics/expo-two-way-audio - -# Just for the purpose of this example. See comment in the code above `createSpeechmaticsJWT` -npm i @speechmatics/auth -``` - -:::info -The Flow client uses [EventTarget](https://developer.mozilla.org/en-US/docs/Web/API/EventTarget) which is typically available in browsers but not in react native. -For that reason we've installed the polyfill: [`event-target-polyfill`](https://www.npmjs.com/package/event-target-polyfill). -::: - -## Building the User Interface - -Let's create a minimal user interface. -Start by clearing the `app/` directory and creating a new `index.tsx` file with a basic UI structure: - - - {indexStepOne} - - -The view above will just render a Connect/Disconnect button that won't do anything yet. - -Let's run it on the simulator to see how it looks: - -```sh -# For iOS simulator -npx expo run ios - -# For Android emulator -npx expo run android -``` - -This will launch the Metro Bundler and show up some options. 
- -:::warning -#### Expo Go is not supported -If it shows the following: `Using Expo Go`, **we need to switch to a development build by pressing** s. Then press r to reload the app. -Some features that we are going to include, like the native module for handling audio, don't work properly in Expo Go. -::: - -## Implementing Flow Connection - -It's time to add some functionality to this example. -We'll start by implementing the connect and disconnect logic. -For that we are going to use the `@speechmatics/flow-client-react` package. - -Our `/app/index.tsx` file should now look as follows: - - - {indexStepTwo} - - -In the code above, we are injecting the API key from an environment variable. -To make the environment variable available, let's create a `.env` file in the root directory of the project with the following content: - - - {"EXPO_PUBLIC_SPEECHMATICS_API_KEY='YOUR_API_KEY_GOES_HERE"} - - -API keys can be obtained from the [Speechmatics Portal](https://portal.speechmatics.com/api-keys) - -:::danger -This is just an example app. -In a real app you should obtain the JWT from your server. -`createSpeechmaticsJWT` could be used on a server running JS. -Otherwise, you will expose your API key to the client. -::: - -## Audio Integration - -The final step is implementing two-way audio communication. This involves three crucial tasks: - -1. Microphone input capture in PCM format -2. Speaker output routing for Flow responses -3. Acoustic Echo Cancellation (AEC) to prevent audio feedback - -We'll use the Speechmatics Expo Two Way Audio module to handle these requirements efficiently. - -In order to allow microphone access we need to add some configuration to the `app.json` file in the root of our project. -For iOS we add an `infoPlist` entry and for Android a `permissions` entry. - - -{` -{ - "expo": { - // ... - "ios": { - "infoPlist": { - "NSMicrophoneUsageDescription": "Allow Speechmatics to access your microphone" - }, - // ... - }, - "android": { - "permissions": ["RECORD_AUDIO", "MODIFY_AUDIO_SETTINGS"], - // ... - } - } - // ... -} -`} - - -Now we will update the code to handle these microphone adjustments. Our `/app/index.tsx` file should look as follows: - - - {indexStepThree} - - -Since we've introduced some app configuration changes in app.json, let's build the app from scratch (cleaning the `ios` or `android` folders): - -```sh -# iOS -npx expo prebuild --clean -p ios -npx expo run:ios - -# Android -npx expo prebuild --clean -p android -npx expo run:android -``` - -Our app can now unmute the microphone to send audio samples to the Flow server. Audio messages received from the Flow server will be played back through the speaker. - -:::info -While simulators are great for initial testing, features like Acoustic Echo Cancellation require physical devices for proper functionality. -If you find that the simulator audio support is not working or is not good enough we strongly recommend [testing on physical devices](#testing-on-physical-devices). -::: - -## Volume Indicators - -To enhance our UI, we'll add volume indicators for both the microphone and speaker, and organize our buttons into a "bottom bar." - -**Design Overview** - -- Volume Indicators: These will consist of two concentric circles: - - Outer Circle: Represents the speaker volume. - - Inner Circle: Represents the microphone volume. -- Animation: The circles will animate to grow or shrink based on the current volume level. 
- -**Implementation Details** - -We'll use the `react-native-reanimated` library to handle the animations. -This library is often included by default in Expo apps, but if it's not, you can follow the installation instructions [here](https://docs.swmansion.com/react-native-reanimated/docs/fundamentals/get-started/#installation). - -To keep things organised, let’s create a new file inside the app folder to house our custom volume indicator component. - - - {volumeDisplay} - - -Next, we'll integrate our volume indicator component into the app. - -Our `/app/index.tsx` file should look as follows: - - - {indexStepFour} - - -We have successfully completed our conversational AI application! You can connect to Flow services and unmute the microphone to start a conversation. - -## Testing on Physical Devices - -To deploy to real devices, first configure the development environment for using local builds with physical devices: - -- [iOS](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local&platform=ios&device=physical) -- [Android](https://docs.expo.dev/get-started/set-up-your-environment/?mode=development-build&buildEnv=local) - -Then run the following command: - -```sh -# For iOS devices -npx expo run:ios --device --configuration Release - -# For Android devices -npx expo run:android --device --variant release -``` - -## Additional resources - -Dive deeper into the tools used in this guide: - -### Speechmatics JS SDK - -- [Repository](https://github.com/speechmatics/speechmatics-js-sdk) - -### Expo Two Way Audio - -- [Repository](https://github.com/speechmatics/expo-two-way-audio) -- [Examples](https://github.com/speechmatics/expo-two-way-audio/tree/main/examples) diff --git a/docs/voice-agents/voice-agents-flow/index.md b/docs/voice-agents/voice-agents-flow/index.md deleted file mode 100644 index aae9d1fb..00000000 --- a/docs/voice-agents/voice-agents-flow/index.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: Overview -description: Build conversational AI agents with the Flow API ---- - -# Overview - -:::info -Try Flow free for up to 50 hours per month. -::: - -[**Flow**](https://speechmatics.com/flow) is our Voice Agent API that allows you to add responsive, real-time speech-to-speech interactions to any product. - -Flow is engineered to engage in natural and fluid conversations by automatically handling interruptions, responding to multiple speakers, and understanding different dialects and accents. - -## How Flow works - -Built on top of Speechmatics' [industry-leading ASR](/speech-to-text/realtime/quickstart), the latest LLMs and text to speech, Flow is engineered to engage in natural and fluid conversations. - -Simply stream in audio, and Flow will provide the TTS response as well as other useful information. - -### Component models - -The three base components of the Flow Engine are speech to text, large language model, and text to speech. - -#### Speech to text (ASR) - -Flow is built on the foundations of Speechmatics' market-leading Realtime ASR. The client passes streaming audio to the Flow service through the WebSocket. The service then processes multiple speech & non-speech signals such as the spoken words, tonality, & audio events before passing the context to the LLM to formulate a response. - -Flow natively supports multiple speaker detection (Speaker Diarization). Flow can be configured to ignore, acknowledge or engage with non-primary speakers when setting up Agents. 
- -This transcribed text is also streamed back to the client as soon as it is generated to support any client-driven recording, monitoring & analytics workflows. - -To improve accuracy on product-specific terminology we recommend using a Custom Dictionary when setting up Agents in the [Portal](https://portal.speechmatics.com/). - -#### Large language model (LLM) - -Flow’s conversational understanding & knowledge is powered by LLMs. The transcribed text from the ASR is then passed with Flow configurations to the LLM to formulate a natural-sounding response. - -The response-generation can be influenced through defining a persona, style, and context when setting up Templates. - -#### Text to speech (TTS) - -Output generated by the LLM, when ready to be spoken, will be converted to audio through the chosen TTS engine. These engines were selected to provide the most natural-sounding responses while not trading off on latency. This audio is then streamed back to the client, who must then play this back to the user. - -## Flow engine - -### Understanding disfluencies & pacing - -Everyone has a different style of speaking. Natural speech is colored with filler sounds and the pace of speech can vary from speaker to speaker. A one-size-fits-all voice agent can add a lot of friction to the experience if it keeps interrupting you. We’ve designed Flow to adapt to your speaking style and not be over-eager to interrupt, helping to make users feel comfortable. - -### Handling interruptions - -Flow has been modelled on real-world human conversations. Whether it is to stop Flow from going off-track or to correct wrong assumptions, you can interrupt it. We’ve built our own interruption engine that intelligently ignores unintentional interruptions and gracefully handles the ones that it needs to. To avoid sounding abrupt and unnatural when interrupted, Flow will finish the current word that’s being spoken and gradually fade out the next one. - -### End-of-turn detection - -Based on your voice & what you’ve been saying, Flow uses a [small language model (SLM) architecture](https://blog.speechmatics.com/semantic-turn-detection) to smartly detect when you’re done speaking before it responds for a natural and responsive experience. Flow is built to be human-centric and, while we could achieve much lower latencies, it’s rude to interrupt mid-thought. - -### Help and support - -For any additional issues, please reach out to the Flow Support team at [flow-help@speechmatics.com](mailto:flow-help@speechmatics.com). diff --git a/docs/voice-agents/voice-agents-flow/setup.mdx b/docs/voice-agents/voice-agents-flow/setup.mdx deleted file mode 100644 index 9d07e642..00000000 --- a/docs/voice-agents/voice-agents-flow/setup.mdx +++ /dev/null @@ -1,44 +0,0 @@ ---- -sidebar_label: Setup -description: 'Learn about the setup for Flow' ---- - -import SchemaNode from "@theme/Schema"; -import flowSchema from "!openapi-schema-loader!@site/spec/flow-api.yaml"; - -# Flow setup - -A voice agent template covers multiple elements that typically need to be configured in concert to power a specific class of conversations in a human-facing application. - - -Flow can be configured using the following parameters: - - - -
-For more details, refer to [StartConversation API reference](/api-ref/flow-voice-ai-websocket#startconversation). - -### Function calling - -[Function Calling](/voice-agents-flow/features/function-calling) allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real-world and better serve the needs of your users. - -This could involve needing real-time information such as opening/closing times or validation services for authentication or action APIs that control a fast food system while placing a drive-thru order. - - -### Moderating and controlling conversations - -You might want to control ongoing conversation based on what's spoken by the user or the output by the LLM. This could involve situations where the agent is asked to do things out of scope or the conversation is heading in unintentional directions. We enable this through sharing the real-time transcript from speech (AddPartialTranscript/ AddTranscript) and the entire response from the LLM just before it begins to speak (ResponseStarted). We recommend building monitoring on top of these streams and to use either AudioEnded to end the session, or close the WebSocket directly if the final transcript is unimportant. - -#### Steering the conversation - -[Application Inputs](/voice-agents-flow/features/application-inputs) allow you to steer the conversation by adding helpful updates & information asynchronously to Flow - -### Managing call recordings and transcripts - -Clients are responsible for maintaining their own recordings & conversation logs. This is enabled through the audio already being routed entirely through the client, and conversation transcripts being provided in real-time through AddPartialTranscript/AddTranscript/ ResponseStarted/ ResponseCompleted/ ResponseInterrupted. - -### Internet search - -Internet Search allows your agent to look up information such as the weather and the news by accessing the internet. - -Internet Search is currently only available when using the official [iPhone](https://apps.apple.com/us/app/speechmatics-flow/id6673918783) or [Android](https://play.google.com/store/apps/details?id=com.speechmatics.flowapp) applications, or for Enterprise customers. 
diff --git a/docs/voice-agents/voice-agents-flow/sidebar.ts b/docs/voice-agents/voice-agents-flow/sidebar.ts deleted file mode 100644 index d4a4a2ec..00000000 --- a/docs/voice-agents/voice-agents-flow/sidebar.ts +++ /dev/null @@ -1,61 +0,0 @@ -export default { - type: "category", - label: "Voice agents – Flow", - collapsible: true, - collapsed: true, - items: [ - { - type: "doc", - label: "Overview", - id: "voice-agents/voice-agents-flow/index", - }, - { - type: "category", - label:"Features", - collapsible: true, - collapsed: true, - items: [ - { - type: "doc", - id: "voice-agents/voice-agents-flow/features/application-inputs", - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/features/function-calling", - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/features/webrtc-livekit", - }, - ], - }, - { - type: "category", - label:"Guides", - collapsible: true, - collapsed: true, - items: [ - { - type: "doc", - id: "voice-agents/voice-agents-flow/guides/nextjs-guide", - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/guides/react-native", - }, - ], - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/setup", - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/supported-formats-and-limits", - }, - { - type: "doc", - id: "voice-agents/voice-agents-flow/supported-languages", - }, - ], -} as const; \ No newline at end of file diff --git a/docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx b/docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx deleted file mode 100644 index 6021e150..00000000 --- a/docs/voice-agents/voice-agents-flow/supported-formats-and-limits.mdx +++ /dev/null @@ -1,39 +0,0 @@ ---- -keywords: [speechmatics, api, limits, flow, voice agents] -toc_max_heading_level: 2 -description: 'Learn about the supported input and output audio formats for the Flow API' ---- - -# Supported Formats and Limits - -## Input audio - -All input audio (i.e. the user's voice) sent to the Flow API must be raw PCM audio in one of the following formats: - -- PCM F32 LE raw audio stream (32-bit float) -- PCM S16 LE raw audio stream (16-bit signed int) - -Other audio encodings are not supported. Sample rates are not restricted, but we recommend using **16kHz**. - -## Output audio - -The Flow API will always return PCM audio in **PCM S16 LE** format, regardless of the input format. - -The output audio sample rate is always **16kHz**. - -## Usage Limits - -The Flow API limits the number of hours of audio users can process each month to help manage load on our servers. All users are limited to 50 hours per month and three concurrent sessions. - -Please reach out to [Support](https://support.speechmatics.com) if you need to increase the above limits. - -### Guidance for users - -Clients can disconnect a session before it is automatically terminated and immediately reconnect a new session. Note that new sessions will typically start in less than a second. -If seamless transition is required, the new session can be connected a few seconds before disconnecting the old session. - -Since unpredictable network issues can cause WebSocket connections to be dropped, we recommend graceful handling of session termination for long-running sessions. - -## Data Retention - -Conversation audio and transcriptions by the Flow API are not stored. 
\ No newline at end of file diff --git a/docs/voice-agents/voice-agents-flow/supported-languages.mdx b/docs/voice-agents/voice-agents-flow/supported-languages.mdx deleted file mode 100644 index 5c5eaf0c..00000000 --- a/docs/voice-agents/voice-agents-flow/supported-languages.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -description: 'Learn about the languages supported in Flow' ---- - -# Languages Supported - -Speechmatics offers over 30 languages in Flow with more on the way. The following is a list of supported languages & voices. These can be configured when creating an agent in the [portal](https://portal.speechmatics.com/) for use in the API. - -| Language | Voices | -| ---------------------------- | ------------------------------- | -| Arabic | Arabic Female | -| Bulgarian | Bulgarian Male | -| Croatian | Croatian Male | -| Czech | Czech Female | -| Danish | Danish Female | -| Dutch | Dutch Female | -| English | English (British) Female, English (British) Male, English (American) Male, English (American Female) | -| Finnish | Finnish Female | -| French | French Male | -| German | German Male | -| Greek | Greek Male | -| Hindi | Hindi Female | -| Hungarian | Hungarian Female | -| Indonesian | Indonesian Female | -| Italian | Italian Female | -| Japanese | Japanese Female | -| Korean | Korean Female | -| Malay | English/Malay Female | -| Mandarin | English/Mandarin Male | -| Norwegian | Norwegian Male | -| Polish | Polish Male | -| Portuguese | Portuguese Male | -| Romanian | Romanian Female | -| Russian | Russian Male | -| Slovakian | Slovak Male | -| Spanish | English/Spanish Male, English/Spanish (Colombian) Male, English/Spanish (Colombian) Female | -| Swedish | Swedish Female | -| Tamil | English/Tamil Female | -| Thai | Thai Female | -| Turkish | Turkish Female | -| Ukrainian | Ukrainian Female | -| Vietnamese | Vietnamese Female | - -## Bilingual Support - -For the following languages & voices, the agents created are able to communicate in the primary language of the language pack as well as English without compromising the accuracy of either. 
- -| Bilingual Pack | Voices | -| ---------------------------- | ------------------------------- | -| Spanish / English | English/Spanish Male, English/Spanish (Colombian) Male, English/Spanish (Colombian) Female | -| Mandarin / English | Mandarin/English Male | -| Tamil / English | Tamil/English Female | -| Malay / English | Malay/English Female | \ No newline at end of file From 7b2cc90289165622f71618b118ce5c390922f4b4 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Wed, 17 Dec 2025 17:26:23 +0000 Subject: [PATCH 10/12] Update broken internal documentation links across multiple pages Fixed broken anchor links and updated paths to match new documentation structure: - Updated operating points links to point to /speech-to-text/languages#operating-points - Updated translation language pairs links to /speech-to-text/features/translation#languages - Updated Flow documentation links to use new /voice-agents/flow/ path structure - Fixed diarization anchor link to #diarization-and-punctuation - Removed broken links in gpu --- docs/deployments/container/cpu-speech-to-text.mdx | 2 +- docs/deployments/container/gpu-speech-to-text.mdx | 2 +- docs/deployments/container/gpu-translation.mdx | 4 ++-- docs/speech-to-text/batch/input.mdx | 2 +- docs/speech-to-text/batch/language-identification.mdx | 2 +- docs/speech-to-text/features/audio-filtering.mdx | 2 +- docs/speech-to-text/features/feature-discovery.mdx | 3 +-- docs/speech-to-text/features/translation.mdx | 2 +- docs/speech-to-text/formatting.mdx | 2 +- docs/speech-to-text/realtime/input.mdx | 2 +- docs/voice-agents/flow/setup.mdx | 4 ++-- spec/flow-api.yaml | 2 +- 12 files changed, 14 insertions(+), 15 deletions(-) diff --git a/docs/deployments/container/cpu-speech-to-text.mdx b/docs/deployments/container/cpu-speech-to-text.mdx index 6bf85083..291c5a6a 100644 --- a/docs/deployments/container/cpu-speech-to-text.mdx +++ b/docs/deployments/container/cpu-speech-to-text.mdx @@ -266,7 +266,7 @@ The parameters are: - `processor` - One of `cpu` or `gpu`. Note that selecting `gpu` requires a [GPU Inference Container](/deployments/container/gpu-speech-to-text) -- `operating_point` - One of `standard` or `enhanced`. The [operating point](/speech-to-text/#operating-points) you want to prewarm +- `operating_point` - One of `standard` or `enhanced`. The [operating point](/speech-to-text/languages#operating-points) you want to prewarm - `prewarm_connections` - Integer. The number of engine instances of the specific mode you want to pre-warm. The total number of `prewarm_connections` cannot be greater than `SM_MAX_CONCURRENT_CONNECTIONS`. After the pre-warming is complete, this parameter does not limit the types of connections the engine can start. diff --git a/docs/deployments/container/gpu-speech-to-text.mdx b/docs/deployments/container/gpu-speech-to-text.mdx index 3889a466..815ff5c8 100644 --- a/docs/deployments/container/gpu-speech-to-text.mdx +++ b/docs/deployments/container/gpu-speech-to-text.mdx @@ -107,7 +107,7 @@ Once the GPU Server is running, follow the [Instructions for Linking a CPU Conta ### Running only one operating point -[Operating Points](/speech-to-text/#operating-points-1) represent different levels of model complexity. +[Operating Points](/speech-to-text/languages#operating-points) represent different levels of model complexity. To save GPU memory for throughput, you can run the server with only one Operating Point loaded. 
To do this, pass the `SM_OPERATING_POINT` environment variable to the container and set it to either `standard` or `enhanced`. diff --git a/docs/deployments/container/gpu-translation.mdx b/docs/deployments/container/gpu-translation.mdx index f40f5908..c8153c25 100644 --- a/docs/deployments/container/gpu-translation.mdx +++ b/docs/deployments/container/gpu-translation.mdx @@ -207,6 +207,6 @@ If one or more of the target languages are not supported for the source language } ``` -Please note, this behaviour is different when using our [SaaS Deployment](/speech-to-text/features/translation#unsupported-target-language). +Please note, this behaviour is different when using our SaaS Deployment. -For all other errors, please see [documentation here](/speech-to-text/features/translation#batch-error-responses) +For all other errors, please see our documentation. diff --git a/docs/speech-to-text/batch/input.mdx b/docs/speech-to-text/batch/input.mdx index fbfa6f5f..cc77e31f 100644 --- a/docs/speech-to-text/batch/input.mdx +++ b/docs/speech-to-text/batch/input.mdx @@ -13,7 +13,7 @@ import batchSchema from "!openapi-schema-loader!@site/spec/batch.yaml"; :::info This page documents audio inputs for transcription by **REST API** (a.k.a. Batch SaaS). * For Realtime transcription, see the [Realtime Transcription input](/speech-to-text/realtime/input). -* For Flow Voice AI, see the [Flow Voice AI supported formats and limits](/voice-agents-flow/supported-formats-and-limits). +* For Flow Voice AI, see the [Flow Voice AI supported formats and limits](/voice-agents/flow/supported-formats-and-limits). ::: ## Supported file types diff --git a/docs/speech-to-text/batch/language-identification.mdx b/docs/speech-to-text/batch/language-identification.mdx index 8022aba3..7407e024 100644 --- a/docs/speech-to-text/batch/language-identification.mdx +++ b/docs/speech-to-text/batch/language-identification.mdx @@ -375,7 +375,7 @@ This error is available when checking the [job details](//api-ref/batch/get-job- ### Errors when used with translation -It is not possible to translate between all language pairs. When `auto` language is used, this can mean some translation target languages will not be available. See the full list of [Supported Language Pairs](/speech-to-text/features/translation#supported-translation-pairs). +It is not possible to translate between all language pairs. When `auto` language is used, this can mean some translation target languages will not be available. See the full list of [Supported Language Pairs](/speech-to-text/features/translation#languages). These errors are available when getting the [job transcript](/api-ref/batch/get-the-transcript-for-a-transcription-job): diff --git a/docs/speech-to-text/features/audio-filtering.mdx b/docs/speech-to-text/features/audio-filtering.mdx index 88eeba4c..f7ed95d9 100644 --- a/docs/speech-to-text/features/audio-filtering.mdx +++ b/docs/speech-to-text/features/audio-filtering.mdx @@ -73,6 +73,6 @@ To obtain volume labelling without filtering any audio, supply an empty config o Once the audio is in a raw format (16kHz 16bit mono), it is split into 0.01s chunks. For each chunk, the root mean square amplitude of the signal is calculated, and scaled to the range `0 - 100`. If the volume is less than the supplied cut-off, the chunk will be replaced with silence. 
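For readers who want the chunking logic spelled out, the snippet below is a rough Python rendering of the steps just described. The exact 0-100 scaling used by the service is not documented here, so normalising against int16 full scale is an assumption.

```python
# Illustrative only: scaling RMS against int16 full scale is an assumption,
# not the documented implementation.
import array
import math

SAMPLE_RATE = 16000
CHUNK_SAMPLES = SAMPLE_RATE // 100  # 0.01 s of 16 kHz, 16-bit mono audio


def filter_quiet_chunks(pcm_s16le: bytes, volume_threshold: float) -> bytes:
    """Replace 0.01 s chunks quieter than the threshold (0-100) with silence."""
    samples = array.array("h")
    samples.frombytes(pcm_s16le)
    out = array.array("h", samples)  # copy we can silence in place
    for start in range(0, len(samples), CHUNK_SAMPLES):
        chunk = samples[start:start + CHUNK_SAMPLES]
        rms = math.sqrt(sum(s * s for s in chunk) / len(chunk))
        volume = 100.0 * rms / 32768.0  # scale to roughly 0-100
        if volume < volume_threshold:
            out[start:start + len(chunk)] = array.array("h", [0] * len(chunk))
    return out.tobytes()
```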
-To work successfully without degrading accuracy, the background speech must be significantly quieter than the foreground speech, otherwise the filtering process may remove small sections of the audio which should be transcribed. For this reason, the feature works better with the [Enhanced Operating Point](/speech-to-text/#operating-points-1), which is more robust against inadvertent damage to the audio.
+To work successfully without degrading accuracy, the background speech must be significantly quieter than the foreground speech, otherwise the filtering process may remove small sections of the audio which should be transcribed. For this reason, the feature works better with the [Enhanced Operating Point](/speech-to-text/languages#operating-points), which is more robust against inadvertent damage to the audio.
 
 The word volume calculation takes the start and end times of words, and applies a weighted average of the volumes of each audio chunk which make up the word. The weighting attempts to ignore areas of silence within long words, and provide a better match with the volume classification a human listener would make.
\ No newline at end of file
diff --git a/docs/speech-to-text/features/feature-discovery.mdx b/docs/speech-to-text/features/feature-discovery.mdx
index 09daf484..f6fb3bfb 100644
--- a/docs/speech-to-text/features/feature-discovery.mdx
+++ b/docs/speech-to-text/features/feature-discovery.mdx
@@ -24,6 +24,5 @@ The feature discovery endpoint will include an object with the following propert
 - `languages` - Includes a list of supported ISO language codes
 - `locales` - Includes any languages with a supported [Output Locale](/speech-to-text/formatting#output-locale)
 - `domains` - Includes any languages with a supported [Domain Language Optimizations](/speech-to-text/languages#multilingual-speech-to-text)
- - `translation` - Includes all supported [translation pairs](/speech-to-text/features/translation#supported-translation-pairs)
+ - `translation` - Includes all [supported translation pairs](/speech-to-text/features/translation#languages)
 - `languageid` - List of languages supported by [Language Identification](/speech-to-text/batch/language-identification)
-
diff --git a/docs/speech-to-text/features/translation.mdx b/docs/speech-to-text/features/translation.mdx
index 9b3e0aad..59656215 100644
--- a/docs/speech-to-text/features/translation.mdx
+++ b/docs/speech-to-text/features/translation.mdx
@@ -60,7 +60,7 @@ You can configure up to five translation languages at a time.
 
 ## Batch output
 
-The returned JSON will include a new property called `translations`, which contains a list of translated text for each target language requested (using the same [ISO language codes](/speech-to-text/languages#languages) as for transcription).
+The returned JSON will include a new property called `translations`, which contains a list of translated text for each target language requested (using the same [ISO language codes](/speech-to-text/languages) as for transcription).
 
diff --git a/docs/speech-to-text/formatting.mdx b/docs/speech-to-text/formatting.mdx
index e2842912..71bcfaa4 100644
--- a/docs/speech-to-text/formatting.mdx
+++ b/docs/speech-to-text/formatting.mdx
@@ -398,7 +398,7 @@ This configuration:
 The `sensitivity` parameter accepts values from 0 to 1. Higher values produce more punctuation in the output.
 
 :::warning
-Disabling punctuation may slightly reduce speaker diarization accuracy. See the [speaker diarization and punctuation](/speech-to-text/features/diarization#speaker-diarization-and-punctuation) section for details.
+Disabling punctuation may slightly reduce speaker diarization accuracy. See the [speaker diarization and punctuation](/speech-to-text/features/diarization#diarization-and-punctuation) section for details.
 :::
 
 ## Next steps
diff --git a/docs/speech-to-text/realtime/input.mdx b/docs/speech-to-text/realtime/input.mdx
index 06b2ce15..904869dc 100644
--- a/docs/speech-to-text/realtime/input.mdx
+++ b/docs/speech-to-text/realtime/input.mdx
@@ -14,7 +14,7 @@ import realtimeSchema from "!asyncapi-schema-loader!@site/spec/realtime.yaml"
 
 :::info
 This page is about the **Real-time transcription API** (websocket).
 * For information on Batch SaaS, see the [Batch SaaS input](/speech-to-text/batch/input).
-* For information on Flow Voice AI, see the [Flow Voice AI input](/voice-agents-flow/supported-formats-and-limits).
+* For information on Flow Voice AI, see the [Flow Voice AI input](/voice-agents/flow/supported-formats-and-limits).
 :::
 
 ## Supported input audio formats
diff --git a/docs/voice-agents/flow/setup.mdx b/docs/voice-agents/flow/setup.mdx
index 9d07e642..8531e27d 100644
--- a/docs/voice-agents/flow/setup.mdx
+++ b/docs/voice-agents/flow/setup.mdx
@@ -20,7 +20,7 @@ For more details, refer to [StartConversation API reference](/api-ref/flow-voice
 
 ### Function calling
 
-[Function Calling](/voice-agents-flow/features/function-calling) allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real-world and better serve the needs of your users.
+[Function Calling](/voice-agents/flow/features/function-calling) allows you to connect Flow to external tools and systems. This unlocks Flow's ability to act in the real world and better serve the needs of your users.
 
 This could involve needing real-time information such as opening/closing times or validation services for authentication or action APIs that control a fast food system while placing a drive-thru order.
 
@@ -31,7 +31,7 @@ You might want to control ongoing conversation based on what's spoken by the use
 
 #### Steering the conversation
 
-[Application Inputs](/voice-agents-flow/features/application-inputs) allow you to steer the conversation by adding helpful updates & information asynchronously to Flow
+[Application Inputs](/voice-agents/flow/features/application-inputs) allow you to steer the conversation by adding helpful updates & information asynchronously to Flow.
 
 ### Managing call recordings and transcripts
 
diff --git a/spec/flow-api.yaml b/spec/flow-api.yaml
index f8e6e671..b5f55428 100644
--- a/spec/flow-api.yaml
+++ b/spec/flow-api.yaml
@@ -707,7 +707,7 @@ components:
       type: string
      # description: The id of the agent or persona to use during the conversation.
       description: |
-        Required in the the `StartConversation` message in the Flow API. Generated from the [Speechmatics Portal](https://portal.speechmatics.com/). This maps to the [language supported](/voice-agents-flow/supported-languages), agent's prompt, LLM, TTS voice, & custom dictionary. These can be customised by creating or modifying agents in the Portal.
+        Required in the `StartConversation` message in the Flow API. Generated from the [Speechmatics Portal](https://portal.speechmatics.com/). This maps to the [language supported](/voice-agents/flow/supported-languages), agent's prompt, LLM, TTS voice, & custom dictionary. These can be customised by creating or modifying agents in the Portal.
template_variables: type: object additionalProperties: From 18a82b0d7d20c9367d5dc823d278fdd6e9b3df42 Mon Sep 17 00:00:00 2001 From: Matt Nemitz Date: Fri, 19 Dec 2025 10:11:23 +0000 Subject: [PATCH 11/12] Sync redirects --- scripts/redirects/redirects.json | 40 +++++++++++++++++++++++++ vercel.json | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/scripts/redirects/redirects.json b/scripts/redirects/redirects.json index 9481173c..40a720f2 100644 --- a/scripts/redirects/redirects.json +++ b/scripts/redirects/redirects.json @@ -22,5 +22,45 @@ { "source": "/speech-to-text/realtime/realtime-speaker-identification", "destination": "/speech-to-text/realtime/speaker-identification" + }, + { + "source": "/voice-agents-flow/features/application-inputs", + "destination": "/voice-agents/flow/features/application-inputs" + }, + { + "source": "/voice-agents-flow/setup", + "destination": "/voice-agents/flow/setup" + }, + { + "source": "/voice-agents-flow/features/function-calling", + "destination": "/voice-agents/flow/features/function-calling" + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow" + }, + { + "source": "/voice-agents-flow/supported-languages", + "destination": "/voice-agents/flow/supported-languages" + }, + { + "source": "/voice-agents-flow/features/webrtc-livekit", + "destination": "/voice-agents/flow/features/webrtc-livekit" + }, + { + "source": "/voice-agents-flow/guides/nextjs-guide", + "destination": "/voice-agents/flow/guides/nextjs-guide" + }, + { + "source": "/voice-agents-flow/guides/react-native", + "destination": "/voice-agents/flow/guides/react-native" + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow" + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow" } ] diff --git a/vercel.json b/vercel.json index 87c0a3ec..1d018c97 100644 --- a/vercel.json +++ b/vercel.json @@ -31,6 +31,56 @@ "destination": "/speech-to-text/realtime/speaker-identification", "permanent": true }, + { + "source": "/voice-agents-flow/features/application-inputs", + "destination": "/voice-agents/flow/features/application-inputs", + "permanent": true + }, + { + "source": "/voice-agents-flow/setup", + "destination": "/voice-agents/flow/setup", + "permanent": true + }, + { + "source": "/voice-agents-flow/features/function-calling", + "destination": "/voice-agents/flow/features/function-calling", + "permanent": true + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow", + "permanent": true + }, + { + "source": "/voice-agents-flow/supported-languages", + "destination": "/voice-agents/flow/supported-languages", + "permanent": true + }, + { + "source": "/voice-agents-flow/features/webrtc-livekit", + "destination": "/voice-agents/flow/features/webrtc-livekit", + "permanent": true + }, + { + "source": "/voice-agents-flow/guides/nextjs-guide", + "destination": "/voice-agents/flow/guides/nextjs-guide", + "permanent": true + }, + { + "source": "/voice-agents-flow/guides/react-native", + "destination": "/voice-agents/flow/guides/react-native", + "permanent": true + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow", + "permanent": true + }, + { + "source": "/voice-agents-flow", + "destination": "/voice-agents/flow", + "permanent": true + }, { "source": "/jobsapi", "destination": "/api-ref/batch/create-a-new-job", From bab89c4b3a8d7221198fb52526a0f2837e5a519e Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Fri, 19 Dec 2025 16:26:11 +0000 Subject: 
[PATCH 12/12] Fix Flow documentation link path in Kubernetes deployment page --- docs/deployments/kubernetes/index.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/index.mdx b/docs/deployments/kubernetes/index.mdx index a6e20e62..a90dcc7a 100644 --- a/docs/deployments/kubernetes/index.mdx +++ b/docs/deployments/kubernetes/index.mdx @@ -33,4 +33,4 @@ Using Helm, customers can customize deployments through configurable values, man Speechmatics Kubernetes deployment supports the following applications: - [Realtime](/speech-to-text/realtime/quickstart): Stream audio from an input device or file and receive real-time transcription updates as audio is processed. - - [Voice Agent – Flow](/voice-agents-flow): A Voice Agent API that enables responsive, real-time speech-to-speech interactions in your applications. +- [Voice Agent – Flow](/voice-agents/flow): A Voice Agent API that enables responsive, real-time speech-to-speech interactions in your applications. \ No newline at end of file