From bbeb7f9ff2447ad252abd78463d0d42e83df63d3 Mon Sep 17 00:00:00 2001 From: Venkatesh Chandran Date: Thu, 18 Dec 2025 15:21:13 +0000 Subject: [PATCH 01/11] Realtime charts docs --- docs/deployments/kubernetes/prerequisites.mdx | 16 +- docs/deployments/kubernetes/realtime.mdx | 642 ++++++++++++++++-- 2 files changed, 580 insertions(+), 78 deletions(-) diff --git a/docs/deployments/kubernetes/prerequisites.mdx b/docs/deployments/kubernetes/prerequisites.mdx index d51c2038..bb46c6db 100644 --- a/docs/deployments/kubernetes/prerequisites.mdx +++ b/docs/deployments/kubernetes/prerequisites.mdx @@ -12,7 +12,11 @@ import TabItem from '@theme/TabItem'; # Prerequisites -## Access to the Docker and Helm Registry +## Helm version + +Ensure you are using Helm version 3.16.0 + +## Access to the docker and helm registry :::info It is important to create and reference resources within the same namespace if you do not have full control of your Kubernetes cluster or if it is a shared cluster @@ -39,7 +43,7 @@ helm registry login speechmaticspublic.azurecr.io \ --password ``` -## Speechmatics License +## Speechmatics license Please speak to `support@speechmatics.com` if you do not already have a valid Speechmatics license. @@ -61,7 +65,7 @@ global: license: $B64_ENCODED_LICENSE ``` -## GPU Drivers +## GPU drivers The Speechmatics inference server runs Nvidia Triton Server, which requires an Nvidia GPU. When running GPU nodes in Kubernetes, you will require the Nvidia device plugin which allows containers on the cluster to access the GPUs. @@ -79,7 +83,9 @@ You can validate a node has allocatable GPU resources with: kubectl get nodes -o yaml | yq .[].[].status.allocatable | grep nvidia ``` -## Nginx Ingress Controller +## Ingress controller + +### Nginx When setting up Speechmatics via an ingress controller, it is recommended to use the `ingress-nginx` ingress controller with snippet annotations enabled. You can confirm if your cluster supports Nginx with snippet annotations enabled using the following command: @@ -127,7 +133,7 @@ helm repo add nginx https://kubernetes.github.io/ingress-nginx helm install nginx nginx/ingress-nginx --version 4.11.4 -f nginx.values.yaml ``` -### Using Another Ingress Controller +### Using another ingress controller If you are running another ingress controller, when enabling ingress on the chart, you need to ensure that a `Request-Id` header is passed through. This is used to manage session usage. 
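For reference, a minimal sketch of injecting the header yourself with `ingress-nginx` snippet annotations (assuming your controller exposes a per-request ID variable — `$req_id` here is the `ingress-nginx` one, and the resource, service name, and port are hypothetical placeholders; adapt the equivalent mechanism for your controller):

```yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: speechmatics-realtime # hypothetical resource name
  annotations:
    # Forward a unique Request-Id to the backend so session usage can be tracked
    nginx.ingress.kubernetes.io/configuration-snippet: |
      proxy_set_header Request-Id $req_id;
spec:
  ingressClassName: nginx
  rules:
    - host: speechmatics.example.com # matches the proxy.ingress.url used elsewhere in these docs
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: proxy # hypothetical service name for the Speechmatics proxy
                port:
                  number: 8080 # hypothetical port
```

Whichever controller you use, verify that the header actually reaches the proxy service before relying on it for usage management.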
diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 4b19828b..c4c796a5 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -13,10 +13,6 @@ import CodeBlock from '@theme/CodeBlock'; ## Quickstart -:::info -It is important to create and reference resources within the same namespace if you do not have full control of your Kubernetes cluster or if it is a shared cluster -::: - ### Installation Providing the [Prerequisites](/deployments/kubernetes/prerequisites) have been met for the Speechmatics Helm chart, use the command below to install: @@ -29,7 +25,7 @@ helm upgrade --install speechmatics-realtime \\ --set proxy.ingress.url="speechmatics.example.com"`} -### Validate the Capacity +### Validate the capacity You can confirm whether the transcribers and inference servers are available using: @@ -41,11 +37,11 @@ If the transcribers and inference servers are available, it will show `CAPACITY` ```bash NAME REPLICAS CAPACITY USAGE VERSION SPEC HASH -inference-server-enhanced-recipe1 1 480 0 1 b5784af49332f9948481195451eab6ca -speechmatics-realtime-en 1 2 0 1 83929f2b9b2448cdc818d0e46e37600b +inference-server-enhanced-recipe1 1 360 0 2 b5784af49332f9948481195451eab6ca +rt-transcriber-en 1 2 0 4 83929f2b9b2448cdc818d0e46e37600b ``` -### Run a Session +### Run a session ```bash speechmatics rt transcribe \ @@ -58,106 +54,555 @@ speechmatics rt transcribe \ ## Configuration -See the examples below on how to configure the Helm chart for different deployment scenarios. +### Speech-to-text + +All Speech-to-text components are deployed as SessionGroups, which is a CRD managed by this chart. Speech-to-text is made up of the transcriber and the inference server. +Transcribers have a SessionGroup deployed per language, whereas inference servers support a collection of languages in what is referred to as recipes. If running in `standard` operating point, then all languages are available from the one SessionGroup. If running in `enhanced` operating point, you will need to specify the recipe relevant to the languages being used. There are a total of 4 recipes. 
For more information on the languages available in each enhanced recipe see [the Speechmatics docs](https://docs.speechmatics.com/on-prem/containers/accessing-images#enhanced-operating-point) + +```yaml +## Standard operating point deployment +global: + transcriber: + # Inference server supports all languages + languages: ["en", "fr"] - - +inferenceServerStandardAll: + # Deploys inference server with standard operating point model + enabled: true +``` ```yaml +## Enhanced operating point recipe-1 deployment global: transcriber: - languages: ["ar", "ba", "be", "bg", "bn", "ca", "cmn", "cmn_en", "cs", "cy", "da", "de", "el", "en", "en_ms", "en_ta", "eo", "es", "es-bilingual-en", "et", "eu", "fa", "fi", "fr", "ga", "gl", "he", "hi", "hr", "hu", "ia", "id", "it", "ja", "ko", "lt", "lv", "mn", "mr", "ms", "mt", "nl", "no", "pl", "pt", "ro", "ru", "sk", "sl", "sv", "sw", "ta", "th", "tr", "ug", "uk", "ur", "vi", "yue"] + # Supported languages: ba,be,cy,en,eo,eu,ga,mn,mr,ta,tr,ug,uk + languages: ["en"] -# Enable all enhanced and standard inference server recipes inferenceServerEnhancedRecipe1: + # Deploys inference server with enhanced operating point recipe1 model enabled: true +``` + +```yaml +## Enhanced operating point recipe-2 deployment +global: + transcriber: + # Supported languages: bg,es,et,fa,gl,hr,ia,id,lt,lv,ro,sk,sl,ur + languages: ["es"] inferenceServerEnhancedRecipe2: + # Deploys inference server with enhanced operating point recipe2 model enabled: true +``` + +```yaml +## Enhanced operating point recipe-3 deployment +global: + transcriber: + # Supported languages: ca,cs,da,de,el,fi,he,hi,hu,it,ko,ms,sv,sw + languages: ["de"] inferenceServerEnhancedRecipe3: + # Deploys inference server with enhanced operating point recipe3 model enabled: true +``` -inferenceServerEnhancedRecipe4: - enabled: true +```yaml +## Enhanced operating point recipe-4 deployment +global: + transcriber: + # Supported languages: ar,bn,cmn,fr,ja,mt,no,nl,pl,pt,ru,th,vi,yue + languages: ["fr"] -inferenceServerStandardAll: +inferenceServerEnhancedRecipe4: + # Deploys inference server with enhanced operating point recipe4 model enabled: true ``` - - +### Resource manager -```yaml -# Disable default enhanced inference server recipes -inferenceServerEnhancedRecipe1: - enabled: false +Resource manager components include all non-transcription components as well as sidecars running in the transcription components. 
The version of these components can be globally configured so all components are updated together:

```yaml
global:
  resourceManager:
    image:
      tag: 1.2.3
```

The services include:

| Service Name | Description |
| ------------------------------- | -------------------------------------------------------------------------------------------- |
| resource-manager | The main API and controller for speech-to-text sessions |
| resource-manager-metrics | Exports Prometheus metrics for session usage/availability |
| resource-manager-reconciliation | Responsible for reconciling the state of Redis if pods are killed without returning capacity |
| sessiongroups-controller | Provisions and manages SessionGroup resources |
| worker-proxy | Sidecar running inside transcribers to request and proxy connections to the inference server |
| readiness-tracker | Sidecar running inside transcribers to manage connections/capacity and idle status |
| inference-sidecar | Sidecar running inside inference-server pods to manage connections/capacity |

### Proxy service

The proxy service sits between the client and the transcriber. It enables multi-cluster deployments and exports metrics about sessions, including session count and latency.

### Session groups

SessionGroups is a custom Speechmatics CRD which is used to allocate idle transcribers/inference servers, manage scaling of transcribers/inference servers up and down, and protect active sessions. It provides the following benefits:

- Auto-scaling up and down of sensitive websocket connections based on a buffer
- Bin-packing of sessions to run an efficient number of nodes
- Protection of sessions from being terminated by node scale-down
- Rolling updates of transcribers/inference servers without interrupting existing sessions
- Control over session capacity and how many connections a transcriber/inference server can accept

You can view deployed session groups and their usage with the command: `kubectl get sessiongroups`

The output will look something like:

```bash
NAME                                REPLICAS   CAPACITY   USAGE   VERSION   SPEC HASH
inference-server-enhanced-recipe1   1          360        0       2         0492bb2d21f1fa9dac851e31a48667d9
rt-transcriber-en                   1          2          0       4         ebf88debb77fe9853455ac7d5a24c6ef
```

Replicas is the number of pods deployed, Capacity is the total capacity deployed, and Usage is how much of that capacity is currently in use.

#### Scaling

Auto-scaling works using a buffer: as more transcribers/inference servers are allocated connections, more idle transcribers/inference servers are scaled up.
Auto-scaling buffers can be configured in helm values:

```yaml
global:
  sessionGroups:
    scaling:
      # Enable auto-scaling of session groups
      enabled: true

transcribers:
  sessionGroups:
    scaling:
      # -- Minimum number of pods to deploy
      minReplicas: 3

      # -- Max number of pods to scale to
      maxReplicas: 10

      # -- Wait time before scaling down a transcriber once idle
      scaleDownDelay: 1m0s

      # -- Session capacity for when to scale up (Supports decimals)
      scaleOnCapacityLeft: 5
```

`((replicas x maxConcurrentConnections) - scaleOnCapacityLeft) = supported sessions before scaling`

Once fewer than 5 idle connections are available, SessionGroups will scale up the transcribers to restore a buffer of 5. For example, 3 replicas provide 6 connections in total; once 2 or more connections are active, another pod will be added until at least 5 idle connections are available again.

#### Session protection
SessionGroups also protects sessions from being terminated during node scale-down and rolling updates. SessionGroups will manage the update process of speech-to-text components by identifying idle pods that can be updated and leaving pods with active sessions untouched.


### Concurrency

By default, the chart is configured to allow 2 connections to each transcriber. This can be configured with the following values:

```yaml
transcribers:
  transcriber:
    maxConcurrentConnections:
      value: NUMBER_OF_CONNECTIONS
```

This value can be overridden for each language with the `transcribers.languages.overrides.maxConcurrentConnections` values:

```yaml
transcribers:
  languages:
    overrides:
      maxConcurrentConnections:
        en: NUMBER_OF_EN_CONNECTIONS
        es: NUMBER_OF_ES_CONNECTIONS
```

Changing this value could affect the resource requirements of the transcriber.

The recommended resource requests for 2 connections to each transcriber (double session workers) are the current default values:

```yaml
transcribers:
  readinessTracker:
    resources:
      requests:
        cpu: 10m
        memory: 15Mi

  workerProxy:
    resources:
      requests:
        cpu: 10m
        memory: 15Mi
```

### Model costs

The cost of a session for a transcriber is `1`. However, depending on the features and languages used in a session (and the configured model cost), the cost of that session to the inference server can be between `20-26` for enhanced and `16-20` for standard (see the table below).

The capacity of an inference server determines the number of sessions that can be connected to it: `capacity/cost_per_session`.

**Recommended capacities:**

| Component | Capacity | Cost per session | Respective Session Count |
| --------------------------------- | -------- | ----------------------------------------------- | ------------------------------------------------- |
| inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages |
| inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages |
| inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages |
| inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages |
| inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages |
| transcriber | 2 | 1 | 2 |

By default, the chart has been configured to set the capacity to `480` for enhanced recipes and `2400` for standard-all. Model costs have been configured alongside these capacities to maintain the recommended session counts for each recipe (shown in the table above).

The model cost and capacity of an inference server can be overridden in the helm values:

```yaml
inferenceServer:
  inferenceSidecar:
    registerFeatures:
      capacity: CAPACITY
      modelCosts:
        MODEL_NAME: MODEL_COST # e.g. ensemble: 20
```

**Note:** The above capacity is derived for a [Standard_NC4as_T4_v3](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/ncast4v3-series) instance. When using other GPU servers (such as L4 or H100), the number of sessions per inference server instance can be increased by raising the capacity while keeping the session cost the same.

### Multi-channel

The maximum number of channels supported by a transcriber for each language can be configured independently of the transcriber capacity defined in `transcriber.maxConcurrentConnections.value`. This means you can scale based on capacity while supporting a different maximum number of channels per transcriber.

Configure the maximum number of channels per transcriber as `(NUMBER_OF_CONNECTIONS x required number of channels per session)`.

This can be configured with the following values:

```yaml
transcribers:
  transcriber:
    maxRecognizers:
      value: MAXIMUM_NUMBER_OF_CHANNELS
```

This value can be overridden for each language with the `transcribers.languages.overrides.maxRecognizers` values:

```yaml
transcribers:
  languages:
    overrides:
      maxRecognizers:
        en: MAXIMUM_NUMBER_OF_EN_CHANNELS
        es: MAXIMUM_NUMBER_OF_ES_CHANNELS
```

By default, the proxy service accepts only 1 channel per session. This can be updated by configuring the following values:

```yaml
proxy:
  proxy:
    config:
      maxChannelsPerSession: MAX_NUMBER_OF_CHANNELS_PER_SESSION
```
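As a concrete sketch (the numbers are illustrative, not recommendations): if each transcriber accepts 2 concurrent connections and each session should support up to 2 channels, `maxRecognizers` works out to `2 x 2 = 4`, and the proxy must also be allowed to accept 2 channels per session:

```yaml
transcribers:
  transcriber:
    maxConcurrentConnections:
      value: 2 # sessions per transcriber
    maxRecognizers:
      value: 4 # 2 connections x 2 channels per session

proxy:
  proxy:
    config:
      maxChannelsPerSession: 2 # allow up to 2 channels in a single session
```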
-
+### Usage reporting -## Hardware Recommendations +By default, all events will be reported to `usage.speechmatics.com`. If you are running your own usage container, you can update the configuration to point to that endpoint with these values: -Below are the recommended Azure node sizes for running Speechmatics on Kubernetes: +```yaml +transcribers: + eats: + url: "USAGE_CONTAINER_URL:PORT" +``` -Service | Node Size ----|--- -STT (Inference Server) | Standard_NC4as_T4_v3 -STT (Transcriber) | Standard_E16s_v5 -All Other Services | Standard_D*s_v5 +### Production environments +#### Infrastructure setup + +By default, no node selectors or taints/tolerations will be in place. Providing GPU drivers have been setup correctly, inference servers should always schedule on a GPU node due to their default resource requirements: + +```yaml + resources: + limits: + # -- GPU requirements for Triton server + nvidia.com/gpu: "1" +``` + +However, this does not stop other services running on these GPU nodes. It is recommended to run the inference servers and transcribers on separate nodes, so adding a taint to the GPU nodes will ensure that the transcribers are not able to run there. + +It is possible to configure node selectors and tolerations for both session groups to deploy on separate node types. + +```yaml +inferenceServer: + tritonServer: + nodeSelector: {} + # gpu: "true" + + # -- Tolerations for Triton server deployments + tolerations: + - key: "node-type" + operator: "Equal" + value: "gpu" + effect: "NoSchedule" + +transcribers: + transcriber: + tolerations: + - key: "node-type" + operator: "Equal" + value: "transcriber-only" + effect: "NoSchedule" +``` + +##### Hardware recommendations + +Below are the Azure VM sizes which we recommend for running our services + +| Service | Node Type | +| ------------------ | -------------------- | +| Inference Server | Standard_NC4as_T4_v3 | +| Transcriber | Standard_E16s_v5 | +| All other services | Standard_D*s_v5 | + +#### Redis + +If you are running a multi-cluster solution, there needs to be a Redis deployed for each cluster. + +By default, this chart deploys Redis in the same namespace as the Speechmatics services and configures those services to use it automatically. + +You can instead use your own Redis instance and configure the Speechmatics services to point to it. If you want to use a Redis instance running in the same Kubernetes cluster, disable the Redis instance managed by this chart and set the Redis URL (service name and port) via values, for example: + +```yaml +resourceManager: + redis: + enabled: false + url: +``` + +##### External redis + +Redis can also be hosted outside the cluster. The chart can be configured to use this external Redis by setting the values below. This configuration will create a secret that stores the Redis connection URL. + +```yaml +resourceManager: + externalRedis: + enabled: true + + redis: + enabled: false + url: B64_ENCODED_REDIS_CONN_URL # base64 encoded + + secrets: + redis: + create: true +``` + +Alternatively, you can create the Redis connection URL secret yourself and pass its name to the chart. The secret value must be base64 encoded and use the key `redis_url`. + +```yaml +resourceManager: + externalRedis: + enabled: true + + redis: + enabled: false + + secrets: + redis: + name: NAME_OF_REDIS_CONNECTION_SECRET +``` + +#### Custom dictionary cache redis + +The resource manager and transcribers can be configured to use a custom dictionary (CD) cache backed by Redis. 
Enable this with `resourceManager.cdCache.enabled=true` and `transcribers.transcriber.cdCache.enabled=true`.

Redis for the CD cache can be hosted externally, and the chart can be configured to use it by setting the values below. This configuration will create a secret that stores the CD cache Redis connection URL.

```yaml
transcribers:
  transcriber:
    cdCache:
      enabled: true

resourceManager:
  cdCache:
    enabled: true
    url: B64_ENCODED_CD_CACHE_REDIS_CONN_URL # base64 encoded

  secrets:
    cdCache:
      create: true
```

Alternatively, you can create the CD cache Redis connection URL secret yourself and pass its name to the chart. The secret value must be base64 encoded and use the key `cd_cache_url`.

```yaml
transcribers:
  transcriber:
    cdCache:
      enabled: true

resourceManager:
  cdCache:
    enabled: true

  secrets:
    cdCache:
      name: NAME_OF_REDIS_CONNECTION_SECRET
```

You can configure the maximum size of a CD cache entry, its expiry time in seconds, and the maximum number of entries (keys) stored in Redis as follows:

```yaml
resourceManager:
  cdCache:
    enabled: true
    # Maximum size of a CD cache entry
    maxEntrySizeBytes: 10485760 # calculation is 10MB (1024 * 1024 * 10)
    # Number of seconds before a CD cache entry expires
    expirySecond: 86400
    # Maximum number of CD cache entries
    maxKeysPerContract: 200
```

A single custom dictionary cache Redis instance can be shared across multiple clusters, with the resource manager in each cluster pointing to the same Redis.

#### Service upgrades

Real-time speech-to-text sessions use websockets, which are sensitive to any service disruption. This is why we recommend using SessionGroups to manage transcription components, as they allow for in-place transcriber and inference server updates. However, non-SessionGroup components such as the proxy service and the ingress controller will not be protected from disruptions.

##### Ingress controller configuration

Any update to an ingress controller could result in nginx worker processes restarting, which will break websocket connections. If you are using nginx, it is recommended to set the following configuration in the nginx helm chart:

```yaml
controller:
  config:
    # This prevents nginx worker process from shutting down for 24h in case of active sessions
    worker-shutdown-timeout: 86400s

  extraArgs:
    # This prevents nginx from shutting down for 24h in case of active connections
    shutdown-grace-period: 86400

  terminationGracePeriodSeconds: 86400
```

This will ensure that nginx does not restart for at least 24h. This can be configured to your preferred maximum session duration.

##### Proxy service deployments

Any restart of a proxy-service pod will also terminate websocket connections. The proxy chart allows you to configure multiple deployments and A/B switch between them based on service labels:

```yaml
proxy:
  proxy:
    deployments:
      a:
        active: true
        image:
          tag: 1.2.3
      b:
        active: false
        image:
          tag: 1.2.4
```

The example above will deploy 2 proxy-service deployments (`a` and `b`), with active traffic being sent to `a`. Once `b` is up and running, `active` can be switched to the `b` deployment to avoid any disruption. Traffic will eventually drain away from `a`, but the pods will remain scaled up.

#### Language specific configuration

##### Resources

Different transcriber languages can have different resource requirements from one another.
The chart allows you to better fine-tune the resource requirements of a language using the `transcribers.transcriber.languages.overrides` block. Each language is already configured at the recommended value when operating with 2 concurrent sessions per transcriber. If the concurrency level is changed, then the resources will need to be updated for each transcriber language. + +```yaml +transcribers: + transcriber: + languages: + # -- Override specific behaviour per language + overrides: + resources: + ar: + requests: + cpu: 500m + memory: 5Gi + sv: + requests: + cpu: 200m + memory: 3Gi +``` + +##### Autoscaling + +Requirements for auto-scaling individual languages is also likely to be different depending on the number of languages supported and traffic demand. The buffer can be configured independently for each language under `transcribers.transcriber.languages.overrides.sessionGroupsScaling` + +```yaml +transcribers: + transcriber: + languages: + # -- Override specific behaviour per language + overrides: + sessionGroupsScaling: + ar: + # -- Minimum number of ar pods to run + minReplicas: 3 + + # -- Buffer to start scaling on once capacity is exceeded + scaleOnCapacityLeft: 4 + bg: + minReplicas: 20 + scaleOnCapacityLeft: 15 +``` + +#### TLS ingress configuration + +The proxy ingress can be configured to add a TLS block with the following values: + +```yaml +proxy: + proxy: + config: + # -- Dont allow insecure websockets connections to proxy + useInsecureWebsockets: false + ingress: + url: $REALTIME_URL + tls: + # -- Name of the TLS secret + secretName: my-certificate + + # Add any needed annotations (cert-manager example) + annotations: + cert-manager.io/cluster-issuer: letsencrypt +``` + +### Observability + +See the [observability](observability/) directory for information on setting up observability for your cluster. + +Following the steps in the [README](observability/README.md) will deploy an observability stack alongside the `sm-realtime` deployment for log monitoring & metrics scraping of services, providing better visibility into your deployment. This will allow the support team to better assist you, if required. + +The stack includes: + +**Log Monitoring:** +- **Request tracing**: Filter and search logs by `requestid` to trace individual transcription requests. +- **Service-level logs**: View logs for specific services (e.g., `proxy`, `resource-manager`) within a specified namespace. +- **Log export**: Download logs for offline analysis or for sending to support. + +**Metrics Scraping (via Prometheus)** +- Monitor all metrics via the Metrics dashboard in Grafana, and incorporate them into dashboards. + +**Dashboards (via Grafana)** +- **Pre-built dashboard**: A starter dashboard is included for quick visibility into service errors and model usage patterns +- **Custom queries**: Build custom dashboards and alerts using LogQL and PromQL ## Uninstall @@ -174,15 +619,66 @@ Depending on the configuration setup, you may also need to remove PVCs created f kubectl get pvc | grep redis-data ``` -## FAQ +## FAQs + +*1. I can see SessionGroups in the cluster, but logs are saying there is no available transcriber* + +> You can log into redis and view the registered resources with the command `keys *` - this will show what transcription components have registered their capacity with resource manager. If the pod IPs for a session group is not visible they, try restarting the readiness-tracker container or inference-sidecar container. + +*2. 
Connection times for transcription sessions are very slow*

> When running standard, the connection times can be slow as the transcribers are configured in the enhanced operating point mode by default. You can modify "pre-warm" on the chart to initiate a standard session on startup so follow-up connections are faster.
>
> The operating point for pre-warm can be changed to standard using the following helm values:

```yaml
transcribers:
  transcriber:
    preWarm:
      operatingPoint: standard
```

*3. Sessions are being dropped after an nginx upgrade*

> Many different service updates, including proxy-service and the ingress controller, can impact active sessions. Ensure that pods for these services are not restarted while they are handling sessions.
>
> Nginx can be configured to prevent it from being restarted if there is an active session using the following helm values:

```yaml
# nginx values
controller:
  config:
    # This prevents nginx worker process from shutting down for 24h in case of active sessions
    worker-shutdown-timeout: 86400s

  extraArgs:
    # This prevents nginx from shutting down for 24h in case of active connections
    shutdown-grace-period: 86400

  # Used to prevent nginx pods being terminated for 24h while there are active sessions
  terminationGracePeriodSeconds: 86400
```
>
> For more details, see "Service upgrades" above.

*4. Installation is complaining that SessionGroups kind does not exist*

> The SessionGroup CRDs are added as part of the charts under the `crds/` directory. These will only be installed when installing with `helm install` and if the CRDs do not already exist on the cluster.
>
> If installing with `helm template | kubectl apply -f -` then the CRDs will not be included in the output template. Instead, the CRDs can be applied with `kubectl apply -f ./crds`.

*5. I see a lot of containers in CrashLoopBackoff*

> Resource manager components require a connection to Redis to start successfully. Ensure that Redis is running and validate the `redis_url` is correct with `kubectl get cm resource-manager-config -o yaml` or `kubectl get secret REDIS_SECRET_NAME -o yaml` if you are using an external Redis.
>
> If Redis is taking a long time to start, the timeout of the RM pods can be increased with the `resourceManager.redis.timeoutConnection` value. It currently defaults to 10m.
>
> You can check the logs with `kubectl logs $POD_NAME` or check the latest events with `kubectl describe pod $POD_NAME`

*6. Transcriber Pods are stuck in Init:0/1*

> By default, the chart enables `preWarm` which is used to warm up the transcriber models to allow for faster connection times. 
The transcriber pod init container will check that there is capacity for the preWarm session on startup, meaning if the inference servers have not yet started, the pod will continue to stay in `Init:0/1`. +> +> If the inference servers have started and it is still stuck in `Init:0/1`, then it could be an issue with the default configuration of preWarm. By default, pre-warm will attempt to warm up with `enhanced`, this can be changed with `transcribers.transcriber.preWarm.operatingPoint=standard`. It could also be related to the inference servers deployed in relation to the languages you are trying to run. If languages are deployed which are not supported by inference servers, then those languages will get stuck trying to get inference server capacity. +> +>This behaviour can be disabled by setting `transcribers.workerProxy.checkForCapacityOnStart=false`. Alternatively, pre-warm can be disabled with `transcribers.transcriber.preWarm.enabled=false`. From 55c6f9cc410c472715bd5265f7c82378426fbfe0 Mon Sep 17 00:00:00 2001 From: Venkatesh Chandran Date: Thu, 18 Dec 2025 15:23:14 +0000 Subject: [PATCH 02/11] minor fix --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index c4c796a5..b574c62d 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -619,7 +619,7 @@ Depending on the configuration setup, you may also need to remove PVCs created f kubectl get pvc | grep redis-data ``` -## FAQs +## FAQ *1. I can see SessionGroups in the cluster, but logs are saying there is no available transcriber* From a8821cef69bf133e8c8420dad3a36bb00b5878cd Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:48:14 +0000 Subject: [PATCH 03/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index b574c62d..774f8f23 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -56,7 +56,7 @@ speechmatics rt transcribe \ ### Speech-to-text -All Speech-to-text components are deployed as SessionGroups, which is a CRD managed by this chart. Speech-to-text is made up of the transcriber and the inference server. +All speech-to-text components are deployed as `SessionGroups`, which is a CRD managed by this chart. Speech-to-text is made up of the transcriber and the inference server. Transcribers have a SessionGroup deployed per language, whereas inference servers support a collection of languages in what is referred to as recipes. If running in `standard` operating point, then all languages are available from the one SessionGroup. If running in `enhanced` operating point, you will need to specify the recipe relevant to the languages being used. There are a total of 4 recipes. 
For more information on the languages available in each enhanced recipe see [the Speechmatics docs](https://docs.speechmatics.com/on-prem/containers/accessing-images#enhanced-operating-point) From b68d7a873a742dddb7a81ae165fb0e6c3faf5574 Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:48:21 +0000 Subject: [PATCH 04/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 774f8f23..2cb565fd 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -202,7 +202,7 @@ Once there is less than 5 idle connections available, SessionGroups will scale u #### Session protection -SessionGroups also protects sessions from being terminated during scale down of nodes, and rolling update. SessionGroups will manage the update process of speech-to-text components by identifying idle pods that can be updated and leaving pods with active sessions. +`SessionGroups` also protects sessions from being terminated during scale down of nodes, and rolling update. `SessionGroups` will manage the update process of speech-to-text components by identifying idle pods that can be updated and leaving pods with active sessions. ### Concurrency From c23fbcc724bae22e7cef31363eccce468cb8bd46 Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:48:44 +0000 Subject: [PATCH 05/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 2cb565fd..1857063b 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -256,7 +256,7 @@ The capacity of an inference server determines the number of sessions that can b | Component | Capacity | Cost per session | Respective Session Count | | --------------------------------- | -------- | ----------------------------------------------- | ------------------------------------------------- | -| inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages | +| inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages | | inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | | inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | | inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | From 56072b4de0411a5a72df1ae7d5dfbb61dbc72b78 Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:48:51 +0000 Subject: [PATCH 06/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 1857063b..20294f2e 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -258,7 +258,7 @@ The capacity of an inference server determines the number of sessions that can b | --------------------------------- | -------- | ----------------------------------------------- | ------------------------------------------------- | | inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages | | inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | -| inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | +| inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | | inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | | inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages | | transcriber | 2 | 1 | 2 | From 1d9849daffb47b5ee44d9e8d445890239d0e42f8 Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:49:00 +0000 Subject: [PATCH 07/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 20294f2e..998478d9 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -260,7 +260,7 @@ The capacity of an inference server determines the number of sessions that can b | inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | | inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | | inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | -| inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages | +| inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages | | transcriber | 2 | 1 | 2 | By default, the chart has been configured to set the capacity to `480` for enhanced recipes and `2400` for standard-all. Model costs have been configured alongside these capacities to maintain the recommended session counts for each recipe (shown in the table above). From 9782bee2263e14fa126e7010309560f9077f29ed Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:49:10 +0000 Subject: [PATCH 08/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 998478d9..43c78990 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -259,7 +259,7 @@ The capacity of an inference server determines the number of sessions that can b | inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages | | inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | | inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | -| inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | +| inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | | inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages | | transcriber | 2 | 1 | 2 | From 394c65d1153dc5e3f169b7f10d7ef0abe658075d Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:49:16 +0000 Subject: [PATCH 09/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 43c78990..e6435ccb 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -257,7 +257,7 @@ The capacity of an inference server determines the number of sessions that can b | Component | Capacity | Cost per session | Respective Session Count | | --------------------------------- | -------- | ----------------------------------------------- | ------------------------------------------------- | | inference-server-enhanced-recipe1 | 480 | 21 for English (en)
20 for other languages | 22 for English only
24 for other languages | -| inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | +| inference-server-enhanced-recipe2 | 480 | 26 for Spanish (es)
20 for other languages | 18 for Spanish only
24 for other languages | | inference-server-enhanced-recipe3 | 480 | 26 for German (de)
20 for other languages | 18 for German only
24 for other languages | | inference-server-enhanced-recipe4 | 480 | 26 for French (fr)
20 for other languages | 18 for French only
24 for other languages | | inference-server-standard-all | 2400 | 16 for English (en)
20 for other languages | 150 for English only
120 for other languages | From 5461eba0be0b64d54e5f7707299905862d2d1dbd Mon Sep 17 00:00:00 2001 From: smvenkateshc <88679930+smvenkateshc@users.noreply.github.com> Date: Thu, 18 Dec 2025 18:49:38 +0000 Subject: [PATCH 10/11] Update docs/deployments/kubernetes/realtime.mdx Co-authored-by: Matt Nemitz --- docs/deployments/kubernetes/realtime.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index e6435ccb..6cd489ef 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -149,7 +149,7 @@ Proxy service is a proxy between the client and transcriber. This service allows ### Session groups -SessionGroups is a custom Speechmatics CRD which is used to allocate idle transcribers/inference servers, manage scaling up and down of transcribers/inference servers, and protection of active sessions. It provides the following benefits: +`SessionGroups` is a custom Speechmatics CRD which is used to allocate idle transcribers/inference servers, manage scaling up and down of transcribers/inference servers, and protection of active sessions. It provides the following benefits: - Auto-scaling up and down of sensitive websocket connections based on a buffer - Bin-packing of sessions to run an efficient number of nodes From f09a7e6af53c94e53b45b1bfb96e5cf4da1020e9 Mon Sep 17 00:00:00 2001 From: Venkatesh Chandran Date: Thu, 18 Dec 2025 19:00:03 +0000 Subject: [PATCH 11/11] Extra contents --- docs/deployments/kubernetes/realtime.mdx | 67 +++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/docs/deployments/kubernetes/realtime.mdx b/docs/deployments/kubernetes/realtime.mdx index 6cd489ef..9a0f32d3 100644 --- a/docs/deployments/kubernetes/realtime.mdx +++ b/docs/deployments/kubernetes/realtime.mdx @@ -276,7 +276,9 @@ inferenceServer: : MODEL_COST # e.g. ensemble: 20 ``` -**Note:** The above capacity is derived for a [Standard_NC4as_T4_v3](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/ncast4v3-series) instance. When using other GPU servers (such as L4 or H100) the number of sessions per inference server instance can be increased by changing capacity while keeping the session cost as same. +:::note +The above capacity is derived for a [Standard_NC4as_T4_v3](https://learn.microsoft.com/en-us/azure/virtual-machines/sizes/gpu-accelerated/ncast4v3-series) instance. When using other GPU servers (such as L4 or H100) the number of sessions per inference server instance can be increased by changing capacity while keeping the session cost as same. +::: ### Multi-channel @@ -323,6 +325,29 @@ transcribers: url: "USAGE_CONTAINER_URL:PORT" ``` +### Health check + +The resource manager service performs a lightweight health check by verifying that it can establish a TCP connection to a transcriber before assigning a session to it. Aside from this, it does not run periodic health checks on resources. + +The resource manager also supports an “ignore resources” mechanism for inference servers that the transcriber reports as problematic. If the transcriber cannot communicate with an inference server (or encounters any error while doing so), it will request a new inference server and include an ignore list. Ignored resources are excluded from selection, even when they have capacity or when they are the only available resources. 
+ + +#### Single server deployments + +Resource manager will force client requests to not consider problematic resources (transcribers and inference servers). This is to ensure that requests are not redirected to a resource that is not healthy and drop session while trying to connect to it. + +If you run a single-server deployment where blacklisting the only resource would result in lost sessions, you can disable this behavior by setting `resourceManager.config.disableIgnoreResources` to `true`. Below is an example of how to do this: + +```yaml +resourceManager: + config: + disableIgnoreResources: true +``` + +:::note +This setting is unsafe in production. Use it only for single-server deployments where you do not have high availability for transcribers and inference servers. +::: + ### Production environments #### Infrastructure setup @@ -420,6 +445,14 @@ resourceManager: name: NAME_OF_REDIS_CONNECTION_SECRET ``` +##### Redis Dependency Note + +By default, chart includes a Redis dependency from the Bitnami Legacy repository, which has publicly disclosed CVEs. + +Customers who require additional security controls or remediation for these CVEs can: + * Override the Redis image repository and version to use a maintained Redis image, or + * Disable the bundled Redis and connect to a managed Redis service. + #### Custom dictionary cache redis The resource manager and transcribers can be configured to use a custom dictionary (CD) cache backed by Redis. Enable this with `resourceManager.cdCache.enabled=true` and `transcribers.transcriber.cdCache.enabled=true`. @@ -475,6 +508,38 @@ resourceManager: A single custom dictionary cache Redis instance can be shared across multiple clusters, with the resource manager in each cluster pointing to the same Redis. +#### Load balancing strategies + +Configure the load balancing strategy with `resourceManager.config.resourceSelectionStrategy` (default: `0`) + +```yaml +resourceManager: + config: + resourceSelectionStrategy: 0 +``` + +The primary load balancing approaches are bin packing and round-robin. + +- **Bin packing** options: (2, 0), is best suited to deployments with long-running sessions. The SessionGroups service will not scale down a resource that is currently serving a session, so distributing load evenly across all resources can prevent any of them from being scaled down. +- **Round-robin** options: (3, 1), is better suited to short sessions, where resources can be scaled down once sessions end. + +There is also a “capacity preference” aspect, which applies only when `restartAfterNSessions` is enabled. As a transcriber approaches its Nth session, it reduces its capacity; in that situation, it is preferable to route sessions to lower-capacity resources to consume remaining session slots first. + +- `0`: (do not use) Select the most-utilized available resource; on ties, prefer higher-capacity resources. This packs sessions onto resources sequentially to maximize utilization. +- `1`: (do not use) Select the least-utilized available resource; on ties, prefer higher-capacity resources. This provides even load balancing across all matching resources. +- `2`: (bin packing) Select the most-utilized available resource; on ties, prefer lower-capacity resources. This packs sessions onto resources with least capacity first to maximize utilization. +- `3`: (round-robin) Select the least-utilized available resource; on ties, prefer lower-capacity resources. This provides even load balancing, preferring resources with least capacity. 
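For instance, a deployment dominated by short-lived sessions might opt into round-robin (a sketch using the documented strategy values above):

```yaml
resourceManager:
  config:
    # 3 = round-robin, preferring lower-capacity resources on ties
    resourceSelectionStrategy: 3
```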
The feature to restart transcribers after N sessions is enabled with `transcribers.readinessTracker.restartAfterNSessions` (default: `0`, which disables the feature):

```yaml
transcribers:
  readinessTracker:
    restartAfterNSessions: 100
```

Use this only if the transcriber needs to be restarted after a set number of sessions.

#### Service upgrades

Real-time speech-to-text sessions use websockets, which are sensitive to any service disruption. This is why we recommend using SessionGroups to manage transcription components, as they allow for in-place transcriber and inference server updates. However, non-SessionGroup components such as the proxy service and the ingress controller will not be protected from disruptions.