diff --git a/docs/voice-agents/assets/basic-quickstart.py b/docs/voice-agents/assets/basic-quickstart.py index e649dbec..621f902d 100644 --- a/docs/voice-agents/assets/basic-quickstart.py +++ b/docs/voice-agents/assets/basic-quickstart.py @@ -4,10 +4,17 @@ from speechmatics.voice import VoiceAgentClient, AgentServerMessageType async def main(): + """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset""" + + # Audio configuration + SAMPLE_RATE = 16000 # Hz + CHUNK_SIZE = 160 # Samples per read + PRESET = "scribe" # Configuration preset + # Create client with preset client = VoiceAgentClient( api_key=os.getenv("YOUR_API_KEY"), - preset="scribe" + preset=PRESET ) # Handle final segments @@ -19,17 +26,20 @@ def on_segment(message): print(f"{speaker}: {text}") # Setup microphone - mic = Microphone(sample_rate=16000, chunk_size=320) + mic = Microphone(SAMPLE_RATE, CHUNK_SIZE) if not mic.start(): print("Error: Microphone not available") return - # Connect and stream + # Connect to the Voice agent await client.connect() + # Stream microphone audio (interruptible using keyboard) try: while True: - audio_chunk = await mic.read(320) + audio_chunk = await mic.read(CHUNK_SIZE) + if not audio_chunk: + break # Microphone stopped producing data await client.send_audio(audio_chunk) except KeyboardInterrupt: pass diff --git a/docs/voice-agents/assets/config-overlays.py b/docs/voice-agents/assets/config-overlays.py new file mode 100644 index 00000000..be840baa --- /dev/null +++ b/docs/voice-agents/assets/config-overlays.py @@ -0,0 +1,9 @@ +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Use preset with custom overrides +config = VoiceAgentConfigPreset.SCRIBE( + VoiceAgentConfig( + language="es", + max_delay=0.8 + ) +) diff --git a/docs/voice-agents/assets/config-serialization.py b/docs/voice-agents/assets/config-serialization.py new file mode 100644 index 00000000..a2d3eb4b --- /dev/null +++ b/docs/voice-agents/assets/config-serialization.py @@ -0,0 +1,10 @@ +from speechmatics.voice import VoiceAgentConfigPreset, VoiceAgentConfig + +# Export preset to JSON +config_json = VoiceAgentConfigPreset.SCRIBE().to_json() + +# Load from JSON +config = VoiceAgentConfig.from_json(config_json) + +# Or create from JSON string +config = VoiceAgentConfig.from_json('{"language": "en", "enable_diarization": true}') \ No newline at end of file diff --git a/docs/voice-agents/overview.mdx b/docs/voice-agents/overview.mdx index 4f7dced0..0dcdc8d3 100644 --- a/docs/voice-agents/overview.mdx +++ b/docs/voice-agents/overview.mdx @@ -1,17 +1,17 @@ --- -description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK +description: Learn how to build voice-enabled applications with the Speechmatics Voice SDK --- import Admonition from '@theme/Admonition'; import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; -import pythonVoiceQuickstart from "./assets/basic-quickstart.py?raw" -import pythonVoicePresets from "./assets/presets.py?raw" import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" +import pythonVoiceConfigOverlays from "./assets/config-overlays.py?raw" +import pythonVoiceConfigSerialization from "./assets/config-serialization.py?raw" -# Voice agents overview -The Voice SDK builds on our Realtime API to provide features optimized for conversational AI: +# Voice SDK overview +The Voice SDK builds on our Realtime API to provide additional features optimized for conversational AI, using 
+Python:
 
 - **Intelligent segmentation**: groups words into meaningful speech segments per speaker.
 - **Turn detection**: automatically detects when speakers finish talking.
@@ -39,7 +39,8 @@ Use the Realtime SDK when:
 
 ### 1. Create an API key
 
-[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK. Store your key securely as a managed secret.
+[Create a Speechmatics API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the Voice SDK.
+Store your key securely as a managed secret.
 
 ### 2. Install dependencies
 
@@ -51,38 +52,108 @@ pip install speechmatics-voice
 pip install speechmatics-voice[smart]
 ```
 
-### 3. Configure
+### 3. Quickstart
+
+Here's how to stream microphone audio to the Voice Agent and transcribe finalised segments of speech with speaker IDs:
+
+```python
+import asyncio
+import os
+from speechmatics.rt import Microphone
+from speechmatics.voice import VoiceAgentClient, AgentServerMessageType
+
+async def main():
+    """Stream microphone audio to Speechmatics Voice Agent using 'scribe' preset"""
+
+    # Audio configuration
+    SAMPLE_RATE = 16000  # Hz
+    CHUNK_SIZE = 160  # Samples per read
+    PRESET = "scribe"  # Configuration preset
+
+    # Create client with preset
+    client = VoiceAgentClient(
+        api_key=os.getenv("SPEECHMATICS_API_KEY"),
+        preset=PRESET
+    )
+
+    # Print finalised segments of speech with speaker ID
+    @client.on(AgentServerMessageType.ADD_SEGMENT)
+    def on_segment(message):
+        for segment in message["segments"]:
+            speaker = segment["speaker_id"]
+            text = segment["text"]
+            print(f"{speaker}: {text}")
+
+    # Setup microphone
+    mic = Microphone(SAMPLE_RATE, CHUNK_SIZE)
+    if not mic.start():
+        print("Error: Microphone not available")
+        return
+
+    # Connect to the Voice Agent
+    await client.connect()
+
+    # Stream microphone audio (interruptible using keyboard)
+    try:
+        while True:
+            audio_chunk = await mic.read(CHUNK_SIZE)
+            if not audio_chunk:
+                break  # Microphone stopped producing data
+            await client.send_audio(audio_chunk)
+    except KeyboardInterrupt:
+        pass
+    finally:
+        await client.disconnect()
+
+if __name__ == "__main__":
+    asyncio.run(main())
 
-Replace `YOUR_API_KEY` with your actual API key from the portal:
+```
+
+#### Presets: the simplest way to get started
+
+Presets are purpose-built, optimized configurations, ready to use without further modification:
+
+- `fast`: low latency, fast responses
+- `adaptive`: general conversation
+- `smart_turn`: complex conversation
+- `external`: user handles end of turn
+- `scribe`: note-taking
+- `captions`: live captioning
+
+To view all available presets:
+```python
+presets = VoiceAgentConfigPreset.list_presets()
+```
+
+### 4. Custom configurations
+
+For more control, specify a full custom configuration, or use a preset as a starting point and customise it with overlays.
+
+Specify configurations in a `VoiceAgentConfig` object:
+
+<CodeBlock language="python">
+  {pythonVoiceCustomConfig}
+</CodeBlock>
+
+Use presets as a starting point and customise with overlays:
+
+<CodeBlock language="python">
+  {pythonVoiceConfigOverlays}
+</CodeBlock>
 
-    {pythonVoiceQuickstart}
-
-    {pythonVoicePresets}
-
-    {pythonVoiceCustomConfig}
-
-## FAQ
+Note: If no configuration or preset is provided, the client will default to the `external` preset.
 
-### Implementation and deployment
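+#### Saving and loading configurations
+
+You can also export a configuration to JSON and load it back later:
+
+<CodeBlock language="python">
+  {pythonVoiceConfigSerialization}
+</CodeBlock>
+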
-Can I deploy this in my own environment? -Yes! The Voice SDK can be consumed via our managed service or deployed in your own environment. To learn more about on-premises deployment options, [speak to sales](https://www.speechmatics.com/speak-to-sales). -
+## FAQ ### Support
@@ -93,7 +164,7 @@ You can submit feedback, bug reports, or feature requests through the Speechmati ## Next steps -For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. +For more information, see the [Voice SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on GitHub. To learn more, check out [the Speechmatics Academy](https://github.com/speechmatics/speechmatics-academy). diff --git a/docs/voice-agents/quickstart.mdx b/docs/voice-agents/quickstart.mdx deleted file mode 100644 index 5b3dad73..00000000 --- a/docs/voice-agents/quickstart.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -description: Learn how to build voice-enabled applications with the Speechmatics voice SDK ---- -import Admonition from '@theme/Admonition'; -import CodeBlock from '@theme/CodeBlock'; -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -import pythonVoiceQuickstart from "./assets/basic-quickstart.py?raw" -import pythonVoicePresets from "./assets/presets.py?raw" -import pythonVoiceCustomConfig from "./assets/custom-config.py?raw" - -# Voice agent overview - -The voice SDK builds on our real-time API to provide features optimized for conversational AI: - -- **Intelligent segmentation**: groups words into meaningful speech segments per speaker. -- **Turn detection**: automatically detects when speakers finish talking. -- **Speaker management**: focus on or ignore specific speakers in multi-speaker scenarios. -- **Preset configurations**: offers ready-to-use settings for conversations, note-taking, and captions. -- **Simplified event handling**: delivers clean, structured segments instead of raw word-level events. - -### When to use the voice SDK vs real-time SDK - -Use the voice SDK when: - -- Building conversational AI or voice agents -- You need automatic turn detection -- You want speaker-focused transcription -- You need ready-to-use presets for common scenarios - -Use the realtime SDK when: - -- You need the raw stream of word-by-word transcription data -- Building custom segmentation logic -- You want fine-grained control over every event -- Processing batch files or custom workflows - -## Getting started - -### 1. Get your API key - -[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys) to access the voice SDK. Store your key securely as a managed secret. - -### 2. Install dependencies - -```bash -# Standard installation -pip install speechmatics-voice - -# With SMART_TURN (ML-based turn detection) -pip install speechmatics-voice[smart] -``` - -### 3. Quickstart - -Replace `YOUR_API_KEY` with your actual API key from the portal: - - - - - {pythonVoiceQuickstart} - - - - - {pythonVoicePresets} - - - - - {pythonVoiceCustomConfig} - - - - -## FAQ - -### Implementation and deployment - -
-Can I deploy this in my own environment? - -Yes! The voice agent SDK can be consumed via our managed service or deployed in your own environment. To learn more about on-premises deployment options, [speak to sales](https://www.speechmatics.com/speak-to-sales). -
- -### Support - -
-Where can I provide feedback or get help? - -You can submit feedback, bug reports, or feature requests through the Speechmatics [GitHub discussions](https://github.com/orgs/speechmatics/discussions). -
- -## Next steps - -For more information, see the [voice agent Python SDK](https://github.com/speechmatics/speechmatics-python-sdk/tree/main/sdk/voice) on github. - -To learn more, check out [the Speechmatics academy](https://github.com/speechmatics/speechmatics-academy) - -Ready to build something amazing with our voice agent SDK? We'd love to hear about your project and help you succeed. - -**Get in touch with us:** -- Share your feedback and feature requests -- Ask questions about implementation -- Discuss enterprise pricing and custom voices -- Report any issues or bugs you encounter - -[Contact our team](https://support.speechmatics.com) or join our developer community (https://www.reddit.com/r/Speechmatics) to connect with other builders using text to speech. -