From bc0f03e66020595d36d2a856fdd755fab59dfbf5 Mon Sep 17 00:00:00 2001 From: Sean Brar Date: Thu, 5 Mar 2026 17:08:58 -0800 Subject: [PATCH] feat: Add Anthropic implicit caching support - Implement `implicit_caching` option for Anthropic provider - Set smart defaults: enabled for single-call, disabled for fan-out - Update ProviderCapabilities to enforce caching support - Add runtime errors for unsupported providers - Document caching approaches in docs (caching.md, portable-code.md, etc.) - Add test coverage for new caching behavior --- docs/caching.md | 82 +++++++++++++++++++++---- docs/configuration.md | 8 ++- docs/portable-code.md | 20 +++--- docs/reference/provider-capabilities.md | 6 +- src/pollux/execute.py | 11 ++++ src/pollux/options.py | 3 + src/pollux/providers/anthropic.py | 5 ++ src/pollux/providers/base.py | 1 + src/pollux/providers/mock.py | 1 + src/pollux/providers/models.py | 1 + tests/conftest.py | 1 + tests/test_pipeline.py | 80 +++++++++++++++++++++++- tests/test_providers.py | 21 +++++++ 13 files changed, 216 insertions(+), 24 deletions(-) diff --git a/docs/caching.md b/docs/caching.md index 3169379..72ef16f 100644 --- a/docs/caching.md +++ b/docs/caching.md @@ -73,9 +73,65 @@ compare_efficiency(946_800, 10) More questions on the same content = greater savings. -## Creating a Cache +## Two Approaches to Caching -Use `create_cache()` to upload content to the provider once, then pass the +Context caching implementations vary significantly across providers: + +- **Implicit caching (Anthropic):** Anthropic caches prompt prefixes during + generation. Pollux toggles this with `Options(implicit_caching=...)`. +- **Explicit caching (Gemini):** You upload context once with + `create_cache()`, get a handle back, and pass that handle to later calls. + +## Implicit Caching (Anthropic) + +Anthropic caches shared prefixes from the top of the request downward: +tools, system instruction, conversation history, and repeated prompt context.
+You do not create a cache object yourself. Pollux decides whether to ask for +implicit caching on each provider call. + +### Cost Mechanics + +Unlike explicit caching, Anthropic changes token pricing per request: + +- **Cache writes:** +25% (1.25x standard cost) +- **Cache reads:** -90% (0.10x standard cost) + +Caching pays off when a prefix is written once and then reused. Without +caching, sending the same prefix twice costs 2.0x. With caching, it costs +1.35x (one 1.25x write plus one 0.10x read). + +### Default Behavior + +Because cache writes cost more, Pollux does not treat implicit caching as a +blanket default: + +- **Single provider call:** Pollux enables implicit caching by default. +- **Multi-call fan-out:** Pollux disables it by default. + +This is a request-shape rule, not an API-entrypoint rule. `run()` always makes +one provider call, so the default is on. `run_many()` with multiple prompts +makes multiple parallel calls, so the default is off. `run_many(["Q"])` still +makes one provider call, so the default is on there too. + +The reason is cost. In a conversation, the write premium lands once and later +turns benefit from cheap cache reads. In a wide fan-out, many identical calls +arrive before the cache is warm, so you pay the write premium repeatedly. + +You can override the default when you need to: + +```python +from pollux import Options + +# Disable Anthropic implicit caching for a one-off call. +options = Options(implicit_caching=False) +``` + +Setting `implicit_caching=True` on a provider that does not support it raises +`ConfigurationError`. Pollux does not silently ignore the request. + +## Explicit Caching (Gemini) + +For Gemini, use `create_cache()` to upload content to the provider once, then pass the returned handle to `run()` or `run_many()` via `Options(cache=handle)`: ```python @@ -143,7 +199,7 @@ the same handle reuse the cached context automatically. ## Cache Identity -Cache keys are deterministic: `hash(model + provider + content hashes of sources)`.
+For explicit caches, keys are deterministic: `hash(model + provider + content hashes of sources)`. This means: @@ -156,7 +212,7 @@ This means: ## Single-Flight Protection -When multiple concurrent calls target the same cache key (common in fan-out +When multiple concurrent calls target the same explicit cache key (common in fan-out workloads), Pollux deduplicates the creation call: only one coroutine performs the upload, and others await the same result. This eliminates duplicate uploads without requiring caller-side coordination. @@ -171,9 +227,9 @@ Check `metrics.cache_used` on subsequent calls: Keep prompts and sources stable between runs when comparing warm vs reuse behavior. Usage counters are provider-dependent. -## Tuning TTL +## Tuning Explicit Cache TTL -Pass `ttl_seconds` to `create_cache()` to control the cache lifetime. The +Pass `ttl_seconds` to `create_cache()` to control the explicit cache lifetime. The default is 3600 seconds (1 hour). Tune it to match your expected reuse window: - **Too short:** the cache expires before you reuse it, wasting the @@ -183,7 +239,8 @@ default is 3600 seconds (1 hour). Tune it to match your expected reuse window: For interactive workloads where you run a batch and then refine prompts within the same session, 3600s is a reasonable starting point. For one-shot scripts, -shorter TTLs (300-600s) avoid lingering cache entries. +shorter TTLs (300-600s) avoid lingering cache entries. Anthropic manages the +lifetime of implicit caches on its side. ## When Caching Pays Off @@ -191,16 +248,19 @@ Caching is most effective when: - **Sources are large:** video, long PDFs, multi-image sets - **Prompt sets are repeated:** fan-out workflows with 3+ prompts per source -- **Reuse happens within TTL:** default 3600s; tune via `ttl_seconds` + using explicit caching +- **Conversations are deep:** multi-turn dialogues with large system prompts + using implicit caching Caching adds overhead for single-prompt, small-source calls. 
Start without caching and enable it when you see repeated context in your workload. ## Provider Dependency -Persistent context caching is **Gemini-only**. Calling `create_cache()` with -a provider that lacks `persistent_cache` support raises an actionable error. -See [Provider Capabilities](reference/provider-capabilities.md) for the full +Calling `create_cache()` with a provider that lacks `persistent_cache` +support raises an actionable error. `Options(implicit_caching=True)` raises in +the same way on providers that lack implicit caching support. See +[Provider Capabilities](reference/provider-capabilities.md) for the full matrix. --- diff --git a/docs/configuration.md b/docs/configuration.md index c1387f4..7c3aee9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -31,7 +31,7 @@ All fields and their defaults: | Field | Type | Default | Description | |---|---|---|---| -| `provider` | `"gemini" \| "openai"` | *(required)* | Provider to use | +| `provider` | `"gemini" \| "openai" \| "anthropic"` | *(required)* | Provider to use | | `model` | `str` | *(required)* | Model identifier | | `api_key` | `str \| None` | `None` | Explicit key; auto-resolved from env if omitted | | `use_mock` | `bool` | `False` | Use mock provider (no network calls) | @@ -44,10 +44,12 @@ If `api_key` is omitted, Pollux resolves it from environment variables: - Gemini: `GEMINI_API_KEY` - OpenAI: `OPENAI_API_KEY` +- Anthropic: `ANTHROPIC_API_KEY` ```bash export GEMINI_API_KEY="your-key" export OPENAI_API_KEY="your-key" +export ANTHROPIC_API_KEY="your-key" ``` You can also pass a key directly: @@ -132,6 +134,7 @@ options = Options( response_schema=MyPydanticModel, # Structured output extraction reasoning_effort="medium", # Controls model thinking depth delivery_mode="realtime", # Only "realtime" is supported + implicit_caching=True, # Auto-cache prefix (Anthropic only) ) ``` @@ -147,7 +150,8 @@ options = Options( | `delivery_mode` | `str` | `"realtime"` | Only `"realtime"` is 
supported; `"deferred"` raises an error | | `history` | `list[dict] \| None` | `None` | Conversation history. See [Continuing Conversations Across Turns](conversations-and-agents.md) | | `continue_from` | `ResultEnvelope \| None` | `None` | Resume from a prior result. See [Continuing Conversations Across Turns](conversations-and-agents.md) | -| `cache` | `CacheHandle \| None` | `None` | Persistent context cache. See [Reducing Costs with Context Caching](caching.md) | +| `cache` | `CacheHandle \| None` | `None` | Persistent explicit cache (Gemini). See [Reducing Costs with Context Caching](caching.md) | +| `implicit_caching` | `bool \| None` | `None` | Enable or disable Anthropic implicit caching. Defaults to `True` for a single provider call and `False` for multi-call fan-out. `implicit_caching=True` raises on providers that do not support it. See [Reducing Costs with Context Caching](caching.md) | !!! note OpenAI GPT-5 family models (`gpt-5`, `gpt-5-mini`, `gpt-5-nano`) reject diff --git a/docs/portable-code.md b/docs/portable-code.md index 878199b..d92ae4a 100644 --- a/docs/portable-code.md +++ b/docs/portable-code.md @@ -11,11 +11,13 @@ You want analysis code that works across providers. Switch from Gemini to OpenAI (or back) by changing a config line, not rewriting your pipeline. This page shows the patterns that make that work. -Pollux is capability-transparent, not capability-equalizing. Both providers +Pollux is capability-transparent, not capability-equalizing. All providers support the core pipeline (text generation, structured output, tool calling, -conversation continuity), but some features are provider-specific. Context -caching is Gemini-only, for example. When you use an unsupported feature, -Pollux raises a `ConfigurationError` or `APIError`. No silent degradation. +conversation continuity), but some features are provider-specific. 
For +example, Gemini uses explicit cache handles (`create_cache()`), while +Anthropic uses implicit caching (`Options(implicit_caching=True)`). +When you use an unsupported feature for a provider, Pollux raises a +`ConfigurationError` or `APIError`. No silent degradation. This keeps behavior legible in both development and production. !!! info "Boundary" @@ -114,9 +116,8 @@ asyncio.run(main()) its model. Your analysis functions never reference provider names or models directly. -2. **Use `create_cache()` for persistent caching.** Caching is now - opt-in via `create_cache()` and `Options(cache=handle)`. Only call - it when the provider supports `persistent_cache` (e.g. Gemini). +2. **Handle features conditionally.** Explicit caching (`create_cache()`) is + Gemini-specific, while implicit caching (`implicit_caching=True`) is Anthropic-specific. Apply these provider-specific optimizations at the edge of your code, where the provider is chosen, not inside your analysis functions. 3. **Write provider-agnostic functions.** `analyze_document` accepts a provider name and builds the config internally. The prompt, source, and @@ -253,8 +254,9 @@ async def test_analyze_document_mock(provider: str) -> None: ## What to Watch For - **Keep the portable subset in mind.** Text generation, structured output, - tool calling, and conversation continuity work on both providers. Context - caching is Gemini-only. YouTube URLs have limited OpenAI support. + tool calling, and conversation continuity work on all providers. Context + caching differs by provider (explicit for Gemini, implicit for Anthropic, unsupported for OpenAI). + YouTube URLs have limited OpenAI and Anthropic support. Check [Provider Capabilities](reference/provider-capabilities.md). - **Config errors are your portability signal.** A `ConfigurationError` for an unsupported feature marks the boundary of portability.
Handle it at diff --git a/docs/reference/provider-capabilities.md b/docs/reference/provider-capabilities.md index a6638f8..2dce4cb 100644 --- a/docs/reference/provider-capabilities.md +++ b/docs/reference/provider-capabilities.md @@ -20,7 +20,8 @@ Pollux is **capability-transparent**, not capability-equalizing: providers are a | PDF URL inputs | ✅ (via URI part) | ✅ (native `input_file.file_url`) | ✅ (native `document` URL block) | | | Image URL inputs | ✅ (via URI part) | ✅ (native `input_image.image_url`) | ✅ (native `image` URL block) | | | YouTube URL inputs | ✅ | ⚠️ limited | ⚠️ limited | OpenAI/Anthropic parity layers (download/re-upload) are out of scope | -| Provider-side context caching | ✅ | ❌ | ❌ | OpenAI and Anthropic providers return unsupported for caching | +| Explicit context caching (`create_cache`) | ✅ | ❌ | ❌ | Persistent cache handles are Gemini-only | +| Implicit prompt caching (`Options.implicit_caching`) | ❌ | ❌ | ✅ | Anthropic-only request-level optimization | | Structured outputs (`response_schema`) | ✅ | ✅ | ✅ | JSON-schema path in all providers | | Reasoning controls (`reasoning_effort`) | ✅ | ✅ | ✅ | Passed through to provider; see notes below | | Deferred delivery (`delivery_mode="deferred"`) | ❌ | ❌ | ❌ | Not supported; raises `ConfigurationError` | @@ -69,6 +70,9 @@ Pollux is **capability-transparent**, not capability-equalizing: providers are a ### Anthropic - Remote URL support is intentionally narrow: images and PDFs only. +- Implicit prompt caching is enabled with `Options(implicit_caching=True)`. + Pollux defaults it on for single-call workloads and off for multi-call + fan-out. Requesting it on unsupported providers raises `ConfigurationError`. - Reasoning: `reasoning_effort` maps to `output_config.effort`. 
Pollux uses `thinking.type="adaptive"` on adaptive-capable models (currently Opus 4.6 and Sonnet 4.6) and falls back to manual thinking budgets on older diff --git a/src/pollux/execute.py b/src/pollux/execute.py index b3c2842..9872b25 100644 --- a/src/pollux/execute.py +++ b/src/pollux/execute.py @@ -103,6 +103,11 @@ async def execute_plan(plan: Plan, provider: Provider) -> ExecutionTrace: "Provider does not support reasoning controls", hint="Remove reasoning_effort or choose a provider with reasoning support.", ) + if options.implicit_caching is True and not caps.implicit_caching: + raise ConfigurationError( + "Provider does not support implicit caching", + hint="Remove implicit_caching=True or choose a provider with implicit caching support.", + ) if wants_conversation and not caps.conversation: raise ConfigurationError( "Provider does not support conversation continuity", @@ -178,6 +183,11 @@ async def execute_plan(plan: Plan, provider: Provider) -> ExecutionTrace: upload_lock = asyncio.Lock() retry_policy = config.retry responses: list[dict[str, Any]] = [] + implicit_caching = ( + options.implicit_caching + if options.implicit_caching is not None + else caps.implicit_caching and len(prompts) == 1 + ) total_usage: dict[str, int] = {} conversation_state: dict[str, Any] | None = None @@ -288,6 +298,7 @@ async def _execute_call(call_idx: int) -> dict[str, Any]: previous_response_id=previous_response_id, provider_state=request_provider_state, max_tokens=options.max_tokens, + implicit_caching=implicit_caching, ) if retry_policy.max_attempts <= 1: diff --git a/src/pollux/options.py b/src/pollux/options.py index 0ea6ee2..859855c 100644 --- a/src/pollux/options.py +++ b/src/pollux/options.py @@ -47,6 +47,9 @@ class Options: max_tokens: int | None = None #: Persistent context cache obtained from ``create_cache()``. cache: CacheHandle | None = None + #: Controls implicit model-level caching (e.g., Anthropic prefix caching). 
+ #: ``None`` means auto: True for one call on a supporting provider, else False. + implicit_caching: bool | None = None def __post_init__(self) -> None: """Validate option shapes early for clear errors.""" diff --git a/src/pollux/providers/anthropic.py b/src/pollux/providers/anthropic.py index 2c90e71..ebbc193 100644 --- a/src/pollux/providers/anthropic.py +++ b/src/pollux/providers/anthropic.py @@ -67,6 +67,7 @@ def capabilities(self) -> ProviderCapabilities: reasoning=True, deferred_delivery=False, conversation=True, + implicit_caching=True, ) @staticmethod @@ -231,8 +232,12 @@ async def generate( ), } + if request.implicit_caching: + create_kwargs["cache_control"] = {"type": "ephemeral"} + if request.system_instruction: create_kwargs["system"] = request.system_instruction + if request.temperature is not None: create_kwargs["temperature"] = request.temperature if request.top_p is not None: diff --git a/src/pollux/providers/base.py b/src/pollux/providers/base.py index ee9bebe..bfb191e 100644 --- a/src/pollux/providers/base.py +++ b/src/pollux/providers/base.py @@ -25,6 +25,7 @@ class ProviderCapabilities: reasoning: bool = False deferred_delivery: bool = False conversation: bool = False + implicit_caching: bool = False @runtime_checkable diff --git a/src/pollux/providers/mock.py b/src/pollux/providers/mock.py index c7dc2c5..64c3f92 100644 --- a/src/pollux/providers/mock.py +++ b/src/pollux/providers/mock.py @@ -27,6 +27,7 @@ def capabilities(self) -> ProviderCapabilities: reasoning=False, deferred_delivery=False, conversation=False, + implicit_caching=False, ) async def generate(self, request: ProviderRequest) -> ProviderResponse: diff --git a/src/pollux/providers/models.py b/src/pollux/providers/models.py index a313389..a9f17ef 100644 --- a/src/pollux/providers/models.py +++ b/src/pollux/providers/models.py @@ -53,6 +53,7 @@ class ProviderRequest: previous_response_id: str | None = None provider_state: dict[str, Any] | None = None max_tokens: int | None = None +
implicit_caching: bool = False @dataclass diff --git a/tests/conftest.py b/tests/conftest.py index 74d68c2..dc24575 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -58,6 +58,7 @@ async def generate(self, request: ProviderRequest) -> ProviderResponse: "history": request.history, "previous_response_id": request.previous_response_id, "provider_state": request.provider_state, + "implicit_caching": request.implicit_caching, } prompt = ( request.parts[-1] diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index a7a078b..911f35b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -26,7 +26,13 @@ from pollux.request import normalize_request from pollux.retry import RetryPolicy from pollux.source import Source -from tests.conftest import CACHE_MODEL, GEMINI_MODEL, OPENAI_MODEL, FakeProvider +from tests.conftest import ( + ANTHROPIC_MODEL, + CACHE_MODEL, + GEMINI_MODEL, + OPENAI_MODEL, + FakeProvider, +) from tests.helpers import CaptureProvider as KwargsCaptureProvider from tests.helpers import GateProvider, ScriptedProvider @@ -1325,6 +1331,78 @@ class ExampleSchema(BaseModel): assert response_schema["type"] == "object" +@pytest.mark.asyncio +async def test_implicit_caching_defaults_to_true_for_single_call_when_supported( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Single-call Anthropic-style workloads should default implicit caching on.""" + fake = KwargsCaptureProvider( + _capabilities=ProviderCapabilities( + persistent_cache=False, + uploads=True, + structured_outputs=False, + reasoning=False, + deferred_delivery=False, + conversation=False, + implicit_caching=True, + ) + ) + monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake) + cfg = Config(provider="anthropic", model=ANTHROPIC_MODEL, use_mock=True) + + await pollux.run("Q1?", config=cfg) + + assert len(fake.generate_kwargs) == 1 + request = fake.generate_kwargs[0]["request"] + assert request.implicit_caching is True + + +@pytest.mark.asyncio +async def 
test_implicit_caching_defaults_to_false_for_multi_call_fanout( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Multi-call fan-out should default implicit caching off.""" + fake = KwargsCaptureProvider( + _capabilities=ProviderCapabilities( + persistent_cache=False, + uploads=True, + structured_outputs=False, + reasoning=False, + deferred_delivery=False, + conversation=False, + implicit_caching=True, + ) + ) + monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake) + cfg = Config(provider="anthropic", model=ANTHROPIC_MODEL, use_mock=True) + + await pollux.run_many(("Q1?", "Q2?"), config=cfg) + + assert len(fake.generate_kwargs) == 2 + assert all( + call["request"].implicit_caching is False for call in fake.generate_kwargs + ) + + +@pytest.mark.asyncio +async def test_implicit_caching_requires_provider_capability_when_enabled( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Explicit implicit_caching=True should fail on providers that lack it.""" + fake = FakeProvider() + monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake) + cfg = Config(provider="gemini", model=GEMINI_MODEL, use_mock=True) + + with pytest.raises(ConfigurationError, match="implicit caching") as exc: + await pollux.run( + "Q1?", + config=cfg, + options=Options(implicit_caching=True), + ) + + assert exc.value.hint is not None + + @pytest.mark.asyncio async def test_delivery_mode_deferred_is_explicitly_not_implemented( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/test_providers.py b/tests/test_providers.py index b52ccfb..bf55ffb 100644 --- a/tests/test_providers.py +++ b/tests/test_providers.py @@ -1748,11 +1748,32 @@ async def test_anthropic_generate_with_system_instruction() -> None: model=ANTHROPIC_MODEL, parts=["Hello"], system_instruction="Be concise.", + implicit_caching=True, ) ) assert messages.last_kwargs is not None assert messages.last_kwargs["system"] == "Be concise." 
+ assert messages.last_kwargs["cache_control"] == {"type": "ephemeral"} + + +@pytest.mark.asyncio +async def test_anthropic_generate_with_implicit_caching_disabled() -> None: + """Disabling implicit_caching omits Anthropic cache_control.""" + provider, messages = _anthropic_provider_with_fake() + + await provider.generate( + ProviderRequest( + model=ANTHROPIC_MODEL, + parts=["Hello"], + system_instruction="Be concise.", + implicit_caching=False, + ) + ) + + assert messages.last_kwargs is not None + assert messages.last_kwargs["system"] == "Be concise." + assert "cache_control" not in messages.last_kwargs @pytest.mark.asyncio