From 4fce2b14d3cbefdc10d901e8df7a5ebf752ab41b Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Thu, 18 Dec 2025 15:49:55 +0000 Subject: [PATCH 1/6] Update Voice SDK overview documentation with quickstart example and restructured configuration sections - Adjust quickstart example - Restructure configuration elements with separate tabs for custom configs, overlays, and serialization - Add preset descriptions - Update page title from "Voice agents overview" to "Voice SDK overview" - Clarify SDK description to mention Python explicitly - Improve API key setup --- docs/voice-agents/overview.mdx | 122 +++++++++++++++++++++++++++------ 1 file changed, 100 insertions(+), 22 deletions(-) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 4f7dced0..9f989f2e 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -1,17 +1,17 @@ --- -description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK +description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK --- import Admonition from '@theme/Admonition'; import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import pythonVoiceQuickstart from "./assets/basic-quickstart.py?raw" -import pythonVoicePresets from "./assets/presets.py?raw" import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" +import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" +import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" -# Voice agents overview -The Voice SDK builds on our Realtime API to provide features optimized for conversational AI: +# Voice SDK overview +The Voice SDK builds on our Realtime API to provide additional features optimized for conversational AI, using Python: - **Intelligent segmentation**: groups words into meaningful speech segments per speaker. 
- **Turn detection**: automatically detects when speakers finish talking. @@ -39,7 +39,8 @@ Use the Realtime SDK when: ### 1. Create an API key -[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. Store your key securely as a managed secret. +[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. +Store your key securely as a managed secret. ### 2. Install dependencies @@ -51,37 +52,114 @@ pip install speechmatics-voice pip install speechmatics-voice[smart] ``` -### 3. Configure +### 3. Quickstart + +Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech, with speaker ID: + +```python +import asyncio +import os +from speechmatics.rt import Microphone +from speechmatics.voice import VoiceAgentClient, AgentServerMessageType + +async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + + # Create client with preset + client = VoiceAgentClient( + api_key=os.getenv("SPEECHMATICS_API_KEY"), + preset=PRESET + ) + + # Print finalised segments of speech with speaker ID + @client.on(AgentServerMessageType.ADD_SEGMENT) + def on_segment(message): + for segment in message["segments"]: + speaker = segment["speaker_id"] + text = segment["text"] + print(f"{speaker}: {text}") + + # Setup microphone + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) + if not mic.start(): + print("Error: Microphone not available") + return + + # Connect to the Voice Agent + await client.connect() + + # Stream microphone audio (interruptible using keyboard) + try: + while True: + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data + await client.send_audio(audio_chunk) + except KeyboardInterrupt: + pass + 
finally: + await client.disconnect() + +if __name__ == "__main__": + asyncio.run(main()) -Replace `YOUR_API_KEY` with your actual API key from the portal: +``` + +#### Presets - the simplest way to get started +These are optimized configurations for common use cases and require no further settings: + +`fast` - low latency, fast responses + +`adaptive` - natural dialogue in conversation + +`smart_turn` - advanced conversation, with ML turn detection + +`external` - external end of turn - endpointing handled by the client + +`scribe` - note-taking + +`captions` - live captioning + +To view all available presets: +```python +presets = VoiceAgentConfigPreset.list_presets() +``` +### 4. Custom configurations + +For more control, you can also specify custom configurations: - - - {pythonVoiceQuickstart} - + +Specify configurations in a `VoiceAgentConfig` object: + + {pythonVoiceCustomConfig} + - + +Use presets as a starting point and customise with overlays: - {pythonVoicePresets} + {pythonVoiceConfigOverlays} - + +Export or import configurations using JSON: - {pythonVoiceCustomConfig} + {pythonVoiceConfigSerialization} -## FAQ +Note: If no config or preset is provided, the client will default to the external preset. -### Implementation and deployment -
-Can I deploy this in my own environment? -Yes! The Voice SDK can be consumed via our managed service or deployed in your own environment. To learn more about on-premises deployment options, [speak to sales](https://www.speechmatics.com/speak-to-sales). -
+ +## FAQ ### Support From 85300b16f911b1c020adbf091f4bdce0170f7548 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Thu, 18 Dec 2025 15:50:18 +0000 Subject: [PATCH 2/6] Duplicate of overview doc --- docs/voice-agents/quickstart.mdx | 110 ------------------------------- 1 file changed, 110 deletions(-) delete mode 100644 docs/voice-agents/quickstart.mdx diff --git a/docs/voice-agents/quickstart.mdx b/docs/voice-agents/quickstart.mdx deleted file mode 100644 index 5b3dad73..00000000 --- a/docs/voice-agents/quickstart.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -description: Learn how to build voice-enabled applications with the Speechmatics voice SDK ---- -import Admonition from '@theme/Admonition'; -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -import pythonVoiceQuickstart from "./assets/basic-quickstart.py?raw" -import pythonVoicePresets from "./assets/presets.py?raw" -import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" - -# Voice agent overview - -The voice SDK builds on our real-time API to provide features optimized for conversational AI: - -- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. -- **Turn detection**: automatically detects when speakers finish talking. -- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. -- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. 
- -### When to use the voice SDK vs real-time SDK - -Use the voice SDK when: - -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios - -Use the realtime SDK when: - -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing batch files or custom workflows - -## Getting started - -### 1. Get your API key - -[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the voice SDK. Store your key securely as a managed secret. - -### 2. Install dependencies - -```bash -# Standard installation -pip install speechmatics-voice - -# With SMART_TURN (ML-based turn detection) -pip install speechmatics-voice[smart] -``` - -### 3. Quickstart - -Replace `YOUR_API_KEY` with your actual API key from the portal: - - - - - {pythonVoiceQuickstart} - - - - - {pythonVoicePresets} - - - - - {pythonVoiceCustomConfig} - - - - -## FAQ - -### Implementation and deployment - -
-Can I deploy this in my own environment? - -Yes! The voice agent SDK can be consumed via our managed service or deployed in your own environment. To learn more about on-premises deployment options, [speak to sales](https://www.speechmatics.com/speak-to-sales). -
- -### Support - -
-Where can I provide feedback or get help? - -You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions). -
- -## Next steps - -For more information, see the [voice agent Python SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. - -To learn more, check out [the Speechmatics academy](https://github.com/speechmatics/speechmatics-academy) - -Ready to build something amazing with our voice agent SDK? We'd love to hear about your project and help you succeed. - -**Get in touch with us:** -- Share your feedback and feature requests -- Ask questions about implementation -- Discuss enterprise pricing and custom voices -- Report any issues or bugs you encounter - -[Contact our team](https://support.speechmatics.com) or join our developer community (https://www.reddit.com/r/Speechmatics) to connect with other builders using text to speech. - From def273e1d736219e1fff988c30b000e4aea21cec Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Thu, 18 Dec 2025 15:50:48 +0000 Subject: [PATCH 3/6] Add configuration constants and improve code doc --- docs/voice-agents/assets/basic-quickstart.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/docs/voice-agents/assets/basic-quickstart.py b/docs/voice-agents/assets/basic-quickstart.py index e649dbec..621f902d 100644 --- a/docs/voice-agents/assets/basic-quickstart.py +++ b/docs/voice-agents/assets/basic-quickstart.py @@ -4,10 +4,17 @@ from speechmatics.voice import VoiceAgentClient, AgentServerMessageType async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + # Create client with preset client = VoiceAgentClient( api_key=os.getenv("YOUR_API_KEY"), - preset="scribe" + preset=PRESET ) # Handle final segments @@ -19,17 +26,20 @@ def on_segment(message): print(f"{speaker}: {text}") # Setup microphone - mic = Microphone(sample_rate=16000, chunk_size=320) + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) 
if not mic.start(): print("Error: Microphone not available") return - # Connect and stream + # Connect to the Voice Agent await client.connect() + # Stream microphone audio (interruptible using keyboard) try: while True: - audio_chunk = await mic.read(320) + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data await client.send_audio(audio_chunk) except KeyboardInterrupt: pass From 85795022b426edf0d062f012f5c5b7dcd252afe7 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Thu, 18 Dec 2025 15:51:03 +0000 Subject: [PATCH 4/6] Add example code for Voice SDK configuration overlays with preset customization --- docs/voice-agents/assets/config-overlays.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 docs/voice-agents/assets/config-overlays.py diff --git a/docs/voice-agents/assets/config-overlays.py b/docs/voice-agents/assets/config-overlays.py new file mode 100644 index 00000000..be840baa --- /dev/null +++ b/docs/voice-agents/assets/config-overlays.py @@ -0,0 +1,9 @@ +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Use preset with custom overrides +config = VoiceAgentConfigPreset.SCRIBE( + VoiceAgentConfig( + language="es", + max_delay=0.8 + ) +) From 03af5e2d2a9e7279ccc383907945ee40b2b03d01 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Thu, 18 Dec 2025 15:51:24 +0000 Subject: [PATCH 5/6] Add example code for Voice SDK configuration serialization with JSON import/export --- docs/voice-agents/assets/config-serialization.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 docs/voice-agents/assets/config-serialization.py diff --git a/docs/voice-agents/assets/config-serialization.py b/docs/voice-agents/assets/config-serialization.py new file mode 100644 index 00000000..a2d3eb4b --- /dev/null +++ b/docs/voice-agents/assets/config-serialization.py @@ -0,0 +1,10 @@ +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Export preset to 
JSON +config_json = VoiceAgentConfigPreset.SCRIBE().to_json() + +# Load from JSON +config = VoiceAgentConfig.from_json(config_json) + +# Or create from JSON string +config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}') \ No newline at end of file From 8d6187bd9246d94a0cb38c835148b9256247c3f3 Mon Sep 17 00:00:00 2001 From: lgavincrl Date: Fri, 19 Dec 2025 16:12:37 +0000 Subject: [PATCH 6/6] Update preset descriptions and configuration for clarity --- docs/voice-agents/overview.mdx | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 9f989f2e..0dcdc8d3 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -111,15 +111,15 @@ if __name__ == "__main__": ``` #### Presets - the simplest way to get started -These are optimized configurations for common use cases and require no further settings: +These are purpose-built, optimized configurations, ready for use without further modification: `fast` - low latency, fast responses -`adaptive` - natural dialogue in conversation +`adaptive` - general conversation -`smart_turn` - advanced conversation, with ML turn detection +`smart_turn` - complex conversation -`external` - external end of turn - endpointing handled by the client +`external` - user handles end of turn `scribe` - note-taking @@ -132,7 +132,7 @@ presets = VoiceAgentConfigPreset.list_presets() ### 4. 
Custom configurations -For more control, you can also specify custom configurations: +For more control, you can also specify custom configurations or use presets as a starting point and customise with overlays: Specify configurations in a `VoiceAgentConfig` object: @@ -140,27 +140,20 @@ Specify configurations in a `VoiceAgentConfig` object: {pythonVoiceCustomConfig} - + Use presets as a starting point and customise with overlays: {pythonVoiceConfigOverlays} - -Export or import configurations using JSON: - - {pythonVoiceConfigSerialization} - - -Note: If no config or preset is provided, the client will default to the external preset. +Note: If no configuration or preset is provided, the client will default to the `external` preset. ## FAQ - ### Support
@@ -171,7 +164,7 @@ You can submit feedback, bug reports, or feature requests through the Speechmati ## Next steps -For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. +For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on GitHub. To learn more, check out [the Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy).