From 68ec491c88d9c8caec91692a1a291726fae58679 Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Mon, 12 Jan 2026 09:01:00 +0000 Subject: [PATCH 1/9] AIT-221: Document how token streaming interacts with rate limits --- src/data/nav/aitransport.ts | 4 +++ .../token-streaming/token-rate-limits.mdx | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx diff --git a/src/data/nav/aitransport.ts b/src/data/nav/aitransport.ts index e049109ea2..976dfab005 100644 --- a/src/data/nav/aitransport.ts +++ b/src/data/nav/aitransport.ts @@ -34,6 +34,10 @@ export default { name: 'Message per token', link: '/docs/ai-transport/features/token-streaming/message-per-token', }, + { + name: 'Connection and channel limits', + link: '/docs/ai-transport/features/token-streaming/connection-and-channel-limits', + }, ], }, { diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx new file mode 100644 index 0000000000..219fa2ebda --- /dev/null +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -0,0 +1,36 @@ +--- +title: Token rate limits +meta_description: "Learn how token streaming interacts with Ably message limits and how to ensure your application delivers consistent performance." +--- + +LLM token streaming introduces bursty traffic patterns, with some models outputting 150+ tokens per second. Output rates vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). + +Ably scales as your traffic grows, and [rate limits](/docs/platform/pricing/limits) exist to protect service quality from accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. + +## Message-per-response + +The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit through adaptive batching: + +1. Your agent streams tokens to the channel at the model's output rate +2. As the token rate approaches a threshold percentage of the [connection inbound message rate](/docs/platform/pricing/limits#connection), Ably batches tokens together automatically +3. Clients receive the same number of tokens per second, delivered in fewer messages + +By default, a single response stream uses up to 50% of the connection inbound message rate. This allows two simultaneous response streams on the same channel or connection. [Contact Ably](/contact) to adjust this threshold if your application requires a different allocation. + +## Message-per-token + +The [message-per-token](/docs/ai-transport/features/token-streaming/message-per-token) pattern requires you to manage rate limits directly. Each token publishes as a separate message, so high-speed model output can consume message allowances quickly. 
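One way to stay within limits, expanded on in the list that follows, is to batch tokens in your agent before handing them to the SDK. A minimal sketch of that approach is shown below; the channel name, event name, payload shape, and 100ms flush interval are illustrative assumptions rather than AI Transport requirements:

```javascript
// Minimal agent-side batching sketch: tokens are buffered as the model emits
// them and flushed as a single Ably message on a fixed interval, keeping the
// publish rate well below the connection inbound message rate.
import Ably from 'ably';

const ably = new Ably.Realtime({ key: 'your-api-key' });
const channel = ably.channels.get('example:response-stream'); // hypothetical channel name

let buffer = [];

// Call this for every token the model streams back.
function onToken(token) {
  buffer.push(token);
}

// Flush roughly ten times per second (illustrative interval).
setInterval(async () => {
  if (buffer.length === 0) return;
  const batch = buffer;
  buffer = [];
  await channel.publish('tokens', { tokens: batch }); // one message per batch, not per token
}, 100);
```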
+ +To stay within limits: + +- Batch tokens in your agent before publishing to the SDK, reducing message count while maintaining delivery speed +- Calculate your headroom by comparing your model's peak output rate against your package's [connection inbound message rate](/docs/platform/pricing/limits#connection) +- Account for concurrency by multiplying peak rates by the maximum number of simultaneous streams your application supports + +If your application requires higher message rates than your current package allows, [contact Ably](/contact) to discuss options. + +## Next steps + +- Review [Ably platform limits](/docs/platform/pricing/limits) to understand rate limit thresholds for your package +- Learn about the [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern for automatic rate limit protection +- Learn about the [message-per-token](/docs/ai-transport/features/token-streaming/message-per-token) pattern for fine-grained control From fe8a4a7b0d864ab17d505b7bb60e4a1b2ac334e5 Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Mon, 12 Jan 2026 09:52:12 +0000 Subject: [PATCH 2/9] Fix article title in nav --- src/data/nav/aitransport.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/nav/aitransport.ts b/src/data/nav/aitransport.ts index 976dfab005..18468db630 100644 --- a/src/data/nav/aitransport.ts +++ b/src/data/nav/aitransport.ts @@ -35,8 +35,8 @@ export default { link: '/docs/ai-transport/features/token-streaming/message-per-token', }, { - name: 'Connection and channel limits', - link: '/docs/ai-transport/features/token-streaming/connection-and-channel-limits', + name: 'Token rate limits', + link: '/docs/ai-transport/features/token-streaming/token-rate-limits', }, ], }, From 6dbe9054b943b90e8f86c3d4fd08c6009c1585bd Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Mon, 12 Jan 2026 09:58:10 +0000 Subject: [PATCH 3/9] Update text to clarify language and remove duplicate link --- .../features/token-streaming/token-rate-limits.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index 219fa2ebda..d06df1e11b 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -3,9 +3,9 @@ title: Token rate limits meta_description: "Learn how token streaming interacts with Ably message limits and how to ensure your application delivers consistent performance." --- -LLM token streaming introduces bursty traffic patterns, with some models outputting 150+ tokens per second. Output rates vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). +LLM token streaming introduces bursty traffic patterns to your application, with some models outputting 150+ tokens per second. Output rates can vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). 
-Ably scales as your traffic grows, and [rate limits](/docs/platform/pricing/limits) exist to protect service quality from accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. +Ably scales as your traffic grows, and rate limits exist to protect service quality in the case of accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits when using AI Transport depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. ## Message-per-response @@ -23,9 +23,9 @@ The [message-per-token](/docs/ai-transport/features/token-streaming/message-per- To stay within limits: -- Batch tokens in your agent before publishing to the SDK, reducing message count while maintaining delivery speed - Calculate your headroom by comparing your model's peak output rate against your package's [connection inbound message rate](/docs/platform/pricing/limits#connection) - Account for concurrency by multiplying peak rates by the maximum number of simultaneous streams your application supports +- If required, batch tokens in your agent before publishing to the SDK, reducing message count while maintaining delivery speed If your application requires higher message rates than your current package allows, [contact Ably](/contact) to discuss options. From b2b0d66215054d3ec96aeb46e8554c1abdfddcfe Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Mon, 12 Jan 2026 15:34:13 +0000 Subject: [PATCH 4/9] Update based on Paddy's comments --- .../features/token-streaming/token-rate-limits.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index d06df1e11b..1496542d5f 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -3,19 +3,19 @@ title: Token rate limits meta_description: "Learn how token streaming interacts with Ably message limits and how to ensure your application delivers consistent performance." --- -LLM token streaming introduces bursty traffic patterns to your application, with some models outputting 150+ tokens per second. Output rates can vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). +LLM token streaming introduces bursty traffic patterns to your application, with some models outputting 150+ distinct events (i.e. tokens or response deltas) per second. Output rates can vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). 
Ably scales as your traffic grows, and rate limits exist to protect service quality in the case of accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits when using AI Transport depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. ## Message-per-response -The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit through adaptive batching: +The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit through batching: 1. Your agent streams tokens to the channel at the model's output rate -2. As the token rate approaches a threshold percentage of the [connection inbound message rate](/docs/platform/pricing/limits#connection), Ably batches tokens together automatically +2. Ably publishes the first token immediately, then batches subsequent tokens automatically on receipt 3. Clients receive the same number of tokens per second, delivered in fewer messages -By default, a single response stream uses up to 50% of the connection inbound message rate. This allows two simultaneous response streams on the same channel or connection. [Contact Ably](/contact) to adjust this threshold if your application requires a different allocation. +By default, a single response stream will be delivered at 25 messages per second or the model output rate, whichever is lower. This means you can publish two simultaneous response streams on the same channel or connection with any [Ably package](/docs/platform/pricing#packages), because each stream is limited to 50% of the [connection inbound message rate](/docs/platform/pricing/limits#connection). [Contact Ably](/contact) to adjust this threshold if your application requires a different allocation. 
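As a rough illustration of what this default means in practice, reusing the 150 tokens-per-second figure from the introduction (illustrative arithmetic only, not a guarantee of batch sizes):

```javascript
// Back-of-the-envelope view of the default 25 messages/s per-stream delivery
// rate against a fast model. Figures are examples from this page, not limits.
const modelTokensPerSecond = 150;    // example peak model output rate
const defaultStreamMessageRate = 25; // default per-stream delivery rate

const publishedRate = Math.min(modelTokensPerSecond, defaultStreamMessageRate);
const avgTokensPerMessage = modelTokensPerSecond / publishedRate;

console.log(`Delivered at ~${publishedRate} messages/s`);          // ~25 messages/s
console.log(`~${avgTokensPerMessage} tokens batched per message`); // ~6 tokens per message
```

Two simultaneous streams at this default therefore publish at roughly 50 messages per second in total.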
## Message-per-token From 10f08701e43ec93875d1aab881c4559924f9afa0 Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Mon, 12 Jan 2026 15:42:16 +0000 Subject: [PATCH 5/9] Update naming following review --- src/data/nav/aitransport.ts | 2 +- .../ai-transport/features/token-streaming/token-rate-limits.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data/nav/aitransport.ts b/src/data/nav/aitransport.ts index 18468db630..cc0c4e570b 100644 --- a/src/data/nav/aitransport.ts +++ b/src/data/nav/aitransport.ts @@ -35,7 +35,7 @@ export default { link: '/docs/ai-transport/features/token-streaming/message-per-token', }, { - name: 'Token rate limits', + name: 'Token streaming limits', link: '/docs/ai-transport/features/token-streaming/token-rate-limits', }, ], diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index 1496542d5f..c4ca765232 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -1,5 +1,5 @@ --- -title: Token rate limits +title: Token streaming limits meta_description: "Learn how token streaming interacts with Ably message limits and how to ensure your application delivers consistent performance." --- From a29374dc3e300a7378bd14367ef75d1b28bffe9e Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Wed, 14 Jan 2026 16:03:30 +0000 Subject: [PATCH 6/9] WIP - update with transport param --- .../token-streaming/token-rate-limits.mdx | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index c4ca765232..6c5a73d02d 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -9,13 +9,43 @@ Ably scales as your traffic grows, and rate limits exist to protect service qual ## Message-per-response -The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit through batching: +The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit by rolling up multiple appends into a single published message: 1. Your agent streams tokens to the channel at the model's output rate -2. Ably publishes the first token immediately, then batches subsequent tokens automatically on receipt +2. Ably publishes the first token immediately, then automatically rolls up subsequent tokens on receipt 3. Clients receive the same number of tokens per second, delivered in fewer messages -By default, a single response stream will be delivered at 25 messages per second or the model output rate, whichever is lower. This means you can publish two simultaneous response streams on the same channel or connection with any [Ably package](/docs/platform/pricing#packages), because each stream is limited to 50% of the [connection inbound message rate](/docs/platform/pricing/limits#connection). 
[Contact Ably](/contact) to adjust this threshold if your application requires a different allocation. +By default, a single response stream will be delivered at 25 messages per second or the model output rate, whichever is lower. This means you can publish two simultaneous response streams on the same channel or connection with any [Ably package](/docs/platform/pricing#packages), because each stream is limited to 50% of the [connection inbound message rate](/docs/platform/pricing/limits#connection). You will be charged for the number of published messages, not for the number of streamed tokens. + +### Configuring rollup behaviour + +Ably appends all tokens for a single response that are received during the rollup window into one published message. You can specify the rollup window for a particular connection by setting the `appendRollupWindow` transport parameter. This allows you to control how much of the connection message rate can be consumed by a single response stream. + + +| appendRollupWindow | Maximum message rate for a single response | +|---|---| +| 0ms | Model output rate | +| 20ms | 50 messages/s | +| 40ms *(default)* | 25 messages/s | +| 100ms | 10 messages/s | +| 500ms *(max)* | 2 messages/s | + +The following example code demonstrates establishing a connection to Ably with `appendRollupWindow` set to 100ms: + + +```javascript +const ably = new Ably.Realtime( + { + key: 'your-api-key', + transportParams: { appendRollupWindow: 100 } + } +); +``` + + + ## Message-per-token From 135894c54593bcc6bd8311148b2107e8a2aeeda9 Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Wed, 14 Jan 2026 17:21:31 +0000 Subject: [PATCH 7/9] Complete updates to include transport param documentation --- .../features/token-streaming/token-rate-limits.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index 6c5a73d02d..7b6d66a8f4 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -19,7 +19,7 @@ By default, a single response stream will be delivered at 25 messages per second ### Configuring rollup behaviour -Ably appends all tokens for a single response that are received during the rollup window into one published message. You can specify the rollup window for a particular connection by setting the `appendRollupWindow` transport parameter. This allows you to control how much of the connection message rate can be consumed by a single response stream. +Ably joins all appends for a single response that are received during the rollup window into one published message. You can specify the rollup window for a particular connection by setting the `appendRollupWindow` transport parameter. This allows you to determine how much of the connection message rate can be consumed by a single response stream and control your consumption costs. 
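The mapping in the table below follows from how many rollup windows fit into one second. A small sketch of that relationship (this assumes the window alone determines the steady-state rate; the first token of a response is still published immediately):

```javascript
// Steady-state per-stream message cap implied by a given rollup window.
// A window of 0 disables rollup, so delivery follows the model output rate.
function maxMessagesPerSecond(appendRollupWindowMs) {
  if (appendRollupWindowMs === 0) return Infinity; // effectively the model output rate
  return 1000 / appendRollupWindowMs;
}

console.log(maxMessagesPerSecond(20));  // 50
console.log(maxMessagesPerSecond(40));  // 25 (default)
console.log(maxMessagesPerSecond(100)); // 10
console.log(maxMessagesPerSecond(500)); // 2 (maximum window)
```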
| appendRollupWindow | Maximum message rate for a single response | @@ -44,7 +44,7 @@ const ably = new Ably.Realtime( ## Message-per-token From 5764de12fcd59d7c60452715c9d672caffbb75dd Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Thu, 15 Jan 2026 17:28:21 +0000 Subject: [PATCH 8/9] fixup: clarify paragraph based on review and add note on server-side batching --- .../features/token-streaming/token-rate-limits.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index 7b6d66a8f4..f9c2d44824 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -3,7 +3,7 @@ title: Token streaming limits meta_description: "Learn how token streaming interacts with Ably message limits and how to ensure your application delivers consistent performance." --- -LLM token streaming introduces bursty traffic patterns to your application, with some models outputting 150+ distinct events (i.e. tokens or response deltas) per second. Output rates can vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, concurrent token streams across multiple channels risk triggering [rate limits](/docs/platform/pricing/limits). +LLM token streaming introduces bursty traffic patterns to your application, with some models outputting 150+ distinct events (i.e. tokens or response deltas) per second. Output rates can vary unpredictably over the lifetime of a response stream, and you have limited control over third-party model behaviour. Without planning, token streams risk triggering [rate limits](/docs/platform/pricing/limits). Ably scales as your traffic grows, and rate limits exist to protect service quality in the case of accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits when using AI Transport depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. @@ -56,6 +56,7 @@ To stay within limits: - Calculate your headroom by comparing your model's peak output rate against your package's [connection inbound message rate](/docs/platform/pricing/limits#connection) - Account for concurrency by multiplying peak rates by the maximum number of simultaneous streams your application supports - If required, batch tokens in your agent before publishing to the SDK, reducing message count while maintaining delivery speed +- Enable [server-side batching](/docs/messages/batch#server-side) to reduce the number of messages delivered to your subscribers If your application requires higher message rates than your current package allows, [contact Ably](/contact) to discuss options. 
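A quick way to act on the first two points above is to multiply your model's peak output rate by the number of simultaneous streams you expect and compare the result with your package's connection inbound message rate. A sketch of that check (the limit below is a placeholder, not a real package figure):

```javascript
// Headroom check for the message-per-token pattern. Replace the placeholder
// limit with the connection inbound message rate for your own package.
const peakTokensPerSecond = 150;     // example peak model output rate
const maxConcurrentStreams = 4;      // assumed maximum simultaneous responses
const connectionInboundLimit = 1000; // placeholder only; check your package limits

const peakPublishRate = peakTokensPerSecond * maxConcurrentStreams;

if (peakPublishRate > connectionInboundLimit) {
  console.log('Over budget: batch in the agent, enable server-side batching, or contact Ably');
} else {
  console.log(`Headroom: ${connectionInboundLimit - peakPublishRate} messages/s to spare`);
}
```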
From 04a450010cc97c1d4aa47b3404ec1f379fbe09ad Mon Sep 17 00:00:00 2001 From: Fiona Corden Date: Thu, 15 Jan 2026 22:28:23 +0000 Subject: [PATCH 9/9] Add heading link tags --- .../features/token-streaming/token-rate-limits.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx index f9c2d44824..40007d791a 100644 --- a/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx +++ b/src/pages/docs/ai-transport/features/token-streaming/token-rate-limits.mdx @@ -7,7 +7,7 @@ LLM token streaming introduces bursty traffic patterns to your application, with Ably scales as your traffic grows, and rate limits exist to protect service quality in the case of accidental spikes or deliberate abuse. They also provide a level of protection to consumption rates if abuse does occur. On the correct package for your use case, hitting a limit is an infrequent occurrence. The approach to staying within limits when using AI Transport depends on which [token streaming pattern](/docs/ai-transport/features/token-streaming) you use. -## Message-per-response +## Message-per-response The [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern includes automatic rate limit protection. AI Transport prevents a single response stream from reaching the message rate limit by rolling up multiple appends into a single published message: @@ -17,7 +17,7 @@ The [message-per-response](/docs/ai-transport/features/token-streaming/message-p By default, a single response stream will be delivered at 25 messages per second or the model output rate, whichever is lower. This means you can publish two simultaneous response streams on the same channel or connection with any [Ably package](/docs/platform/pricing#packages), because each stream is limited to 50% of the [connection inbound message rate](/docs/platform/pricing/limits#connection). You will be charged for the number of published messages, not for the number of streamed tokens. -### Configuring rollup behaviour +### Configuring rollup behaviour Ably joins all appends for a single response that are received during the rollup window into one published message. You can specify the rollup window for a particular connection by setting the `appendRollupWindow` transport parameter. This allows you to determine how much of the connection message rate can be consumed by a single response stream and control your consumption costs. @@ -47,7 +47,7 @@ const ably = new Ably.Realtime( If you configure the `appendRollupWindow` to allow a single response to use more than your [connection inbound message rate](/docs/platform/pricing/limits#connection) then you will see [limit enforcement](/docs/platform/pricing/limits#hitting) behaviour if you stream tokens faster than the allowed message rate. -## Message-per-token +## Message-per-token The [message-per-token](/docs/ai-transport/features/token-streaming/message-per-token) pattern requires you to manage rate limits directly. Each token publishes as a separate message, so high-speed model output can consume message allowances quickly. @@ -60,7 +60,7 @@ To stay within limits: If your application requires higher message rates than your current package allows, [contact Ably](/contact) to discuss options. 
-## Next steps +## Next steps - Review [Ably platform limits](/docs/platform/pricing/limits) to understand rate limit thresholds for your package - Learn about the [message-per-response](/docs/ai-transport/features/token-streaming/message-per-response) pattern for automatic rate limit protection