From bc0f03e66020595d36d2a856fdd755fab59dfbf5 Mon Sep 17 00:00:00 2001
From: Sean Brar <hello@seanbrar.com>
Date: Thu, 5 Mar 2026 17:08:58 -0800
Subject: [PATCH] feat: Add Anthropic implicit caching support

- Implement `implicit_caching` option for Anthropic provider
- Set smart defaults: enabled for single-call, disabled for fan-out
- Add `implicit_caching` flag to ProviderCapabilities
- Raise ConfigurationError for implicit_caching=True on unsupported providers
- Document caching approaches in docs (caching.md, portable-code.md, etc.)
- Add test coverage for new caching behavior
---
 docs/caching.md                         | 82 +++++++++++++++++++++----
 docs/configuration.md                   |  8 ++-
 docs/portable-code.md                   | 20 +++---
 docs/reference/provider-capabilities.md |  6 +-
 src/pollux/execute.py                   | 11 ++++
 src/pollux/options.py                   |  3 +
 src/pollux/providers/anthropic.py       |  5 ++
 src/pollux/providers/base.py            |  1 +
 src/pollux/providers/mock.py            |  1 +
 src/pollux/providers/models.py          |  1 +
 tests/conftest.py                       |  1 +
 tests/test_pipeline.py                  | 80 +++++++++++++++++++++++-
 tests/test_providers.py                 | 21 +++++++
 13 files changed, 216 insertions(+), 24 deletions(-)

diff --git a/docs/caching.md b/docs/caching.md
index 3169379..72ef16f 100644
--- a/docs/caching.md
+++ b/docs/caching.md
@@ -73,9 +73,65 @@ compare_efficiency(946_800, 10)
 
 More questions on the same content = greater savings.
 
-## Creating a Cache
+## Two Approaches to Caching
 
-Use `create_cache()` to upload content to the provider once, then pass the
+Context caching implementations vary significantly across providers:
+
+- **Implicit caching (Anthropic):** Anthropic caches prompt prefixes during
+  generation. Pollux toggles this with `Options(implicit_caching=...)`.
+- **Explicit caching (Gemini):** You upload context once with
+  `create_cache()`, get a handle back, and pass that handle to later calls.
+
+## Implicit Caching (Anthropic)
+
+Anthropic caches shared prefixes from the top of the request downward:
+system instruction, tools, conversation history, and repeated prompt context.
+You do not create a cache object yourself. Pollux decides whether to ask for
+implicit caching on each provider call.
+
+### Cost Mechanics
+
+Unlike explicit caching, Anthropic changes token pricing per request:
+
+- **Cache writes:** +25% (1.25x standard cost)
+- **Cache reads:** -90% (0.10x standard cost)
+
+Caching pays off when a prefix is written once and then reused. Without
+caching, sending the same prefix twice costs 2.0x. With caching, it costs
+1.35x (one 1.25x write plus one 0.10x read).
+
+### Default Behavior
+
+Because cache writes cost more, Pollux does not treat implicit caching as a
+blanket default:
+
+- **Single provider call:** Pollux enables implicit caching by default.
+- **Multi-call fan-out:** Pollux disables it by default.
+
+This is a request-shape rule, not an API-entrypoint rule. `run()` always makes
+one provider call, so the default is on. `run_many()` with multiple prompts
+makes multiple parallel calls, so the default is off. `run_many(["Q"])` still
+makes one provider call, so the default is on there too.
+
+The reason is cost. In a conversation, the write premium lands once and later
+turns benefit from cheap cache reads. In a wide fan-out, many identical calls
+arrive before the cache is warm, so you pay the write premium repeatedly.
+
+You can override the default when you need to:
+
+```python
+from pollux import Options
+
+# Disable Anthropic implicit caching for a one-off call.
+options = Options(implicit_caching=False)
+```
+
+Setting `implicit_caching=True` on a provider that does not support it raises
+`ConfigurationError`. Pollux does not silently ignore the request.
+
+## Explicit Caching (Gemini)
+
+For Gemini, use `create_cache()` to upload content to the provider once, then pass the
 returned handle to `run()` or `run_many()` via `Options(cache=handle)`:
 
 ```python
@@ -143,7 +199,7 @@ the same handle reuse the cached context automatically.
 
 ## Cache Identity
 
-Cache keys are deterministic: `hash(model + provider + content hashes of sources)`.
+For explicit caches, keys are deterministic: `hash(model + provider + content hashes of sources)`.
 
 This means:
 
@@ -156,7 +212,7 @@ This means:
 
 ## Single-Flight Protection
 
-When multiple concurrent calls target the same cache key (common in fan-out
+When multiple concurrent calls target the same explicit cache key (common in fan-out
 workloads), Pollux deduplicates the creation call: only one coroutine performs
 the upload, and others await the same result. This eliminates duplicate uploads
 without requiring caller-side coordination.
@@ -171,9 +227,9 @@ Check `metrics.cache_used` on subsequent calls:
 Keep prompts and sources stable between runs when comparing warm vs reuse
 behavior. Usage counters are provider-dependent.
 
-## Tuning TTL
+## Tuning Explicit Cache TTL
 
-Pass `ttl_seconds` to `create_cache()` to control the cache lifetime. The
+Pass `ttl_seconds` to `create_cache()` to control the explicit cache lifetime. The
 default is 3600 seconds (1 hour). Tune it to match your expected reuse window:
 
 - **Too short:** the cache expires before you reuse it, wasting the
@@ -183,7 +239,8 @@ default is 3600 seconds (1 hour). Tune it to match your expected reuse window:
 
 For interactive workloads where you run a batch and then refine prompts within
 the same session, 3600s is a reasonable starting point. For one-shot scripts,
-shorter TTLs (300-600s) avoid lingering cache entries.
+shorter TTLs (300-600s) avoid lingering cache entries. Anthropic manages the
+lifetime of implicit caches on its side.
 
 ## When Caching Pays Off
 
@@ -191,16 +248,19 @@ Caching is most effective when:
 
 - **Sources are large:** video, long PDFs, multi-image sets
 - **Prompt sets are repeated:** fan-out workflows with 3+ prompts per source
-- **Reuse happens within TTL:** default 3600s; tune via `ttl_seconds`
+  using explicit caching
+- **Conversations are deep:** multi-turn dialogues with large system prompts
+  using implicit caching
 
 Caching adds overhead for single-prompt, small-source calls. Start without
 caching and enable it when you see repeated context in your workload.
 
 ## Provider Dependency
 
-Persistent context caching is **Gemini-only**. Calling `create_cache()` with
-a provider that lacks `persistent_cache` support raises an actionable error.
-See [Provider Capabilities](reference/provider-capabilities.md) for the full
+Calling `create_cache()` with a provider that lacks `persistent_cache`
+support raises an actionable error. `Options(implicit_caching=True)` raises in
+the same way on providers that lack implicit caching support. See
+[Provider Capabilities](reference/provider-capabilities.md) for the full
 matrix.
 
 ---
diff --git a/docs/configuration.md b/docs/configuration.md
index c1387f4..7c3aee9 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -31,7 +31,7 @@ All fields and their defaults:
 
 | Field | Type | Default | Description |
 |---|---|---|---|
-| `provider` | `"gemini" \| "openai"` | *(required)* | Provider to use |
+| `provider` | `"gemini" \| "openai" \| "anthropic"` | *(required)* | Provider to use |
 | `model` | `str` | *(required)* | Model identifier |
 | `api_key` | `str \| None` | `None` | Explicit key; auto-resolved from env if omitted |
 | `use_mock` | `bool` | `False` | Use mock provider (no network calls) |
@@ -44,10 +44,12 @@ If `api_key` is omitted, Pollux resolves it from environment variables:
 
 - Gemini: `GEMINI_API_KEY`
 - OpenAI: `OPENAI_API_KEY`
+- Anthropic: `ANTHROPIC_API_KEY`
 
 ```bash
 export GEMINI_API_KEY="your-key"
 export OPENAI_API_KEY="your-key"
+export ANTHROPIC_API_KEY="your-key"
 ```
 
 You can also pass a key directly:
@@ -132,6 +134,7 @@ options = Options(
     response_schema=MyPydanticModel,  # Structured output extraction
     reasoning_effort="medium",        # Controls model thinking depth
     delivery_mode="realtime",         # Only "realtime" is supported
+    implicit_caching=True,            # Auto-cache prefix (Anthropic only)
 )
 ```
 
@@ -147,7 +150,8 @@ options = Options(
 | `delivery_mode` | `str` | `"realtime"` | Only `"realtime"` is supported; `"deferred"` raises an error |
 | `history` | `list[dict] \| None` | `None` | Conversation history. See [Continuing Conversations Across Turns](conversations-and-agents.md) |
 | `continue_from` | `ResultEnvelope \| None` | `None` | Resume from a prior result. See [Continuing Conversations Across Turns](conversations-and-agents.md) |
-| `cache` | `CacheHandle \| None` | `None` | Persistent context cache. See [Reducing Costs with Context Caching](caching.md) |
+| `cache` | `CacheHandle \| None` | `None` | Persistent explicit cache (Gemini). See [Reducing Costs with Context Caching](caching.md) |
+| `implicit_caching` | `bool \| None` | `None` | Enable or disable Anthropic implicit caching. When left as `None`, Pollux resolves it to `True` for a single provider call on providers that support implicit caching, and to `False` for multi-call fan-out or unsupported providers. Explicitly passing `implicit_caching=True` raises `ConfigurationError` on providers that do not support it. See [Reducing Costs with Context Caching](caching.md) |
 
 !!! note
     OpenAI GPT-5 family models (`gpt-5`, `gpt-5-mini`, `gpt-5-nano`) reject
diff --git a/docs/portable-code.md b/docs/portable-code.md
index 878199b..d92ae4a 100644
--- a/docs/portable-code.md
+++ b/docs/portable-code.md
@@ -11,11 +11,13 @@ You want analysis code that works across providers. Switch from Gemini to
 OpenAI (or back) by changing a config line, not rewriting your pipeline.
 This page shows the patterns that make that work.
 
-Pollux is capability-transparent, not capability-equalizing. Both providers
+Pollux is capability-transparent, not capability-equalizing. All providers
 support the core pipeline (text generation, structured output, tool calling,
-conversation continuity), but some features are provider-specific. Context
-caching is Gemini-only, for example. When you use an unsupported feature,
-Pollux raises a `ConfigurationError` or `APIError`. No silent degradation.
+conversation continuity), but some features are provider-specific. For
+example, Gemini uses explicit cache handles (`create_cache()`), while
+Anthropic uses implicit caching (`Options(implicit_caching=True)`).
+When you use an unsupported feature for a provider, Pollux raises a
+`ConfigurationError` or `APIError`. No silent degradation.
 This keeps behavior legible in both development and production.
 
 !!! info "Boundary"
@@ -114,9 +116,8 @@ asyncio.run(main())
    its model. Your analysis functions never reference provider names or
    models directly.
 
-2. **Use `create_cache()` for persistent caching.** Caching is now
-   opt-in via `create_cache()` and `Options(cache=handle)`. Only call
-   it when the provider supports `persistent_cache` (e.g. Gemini).
+2. **Handle features conditionally.** Explicit caching (`create_cache()`) is
+   Gemini-specific, while implicit caching (`Options(implicit_caching=True)`) is Anthropic-specific. Apply these provider-specific optimizations at the edge of your code, where the provider is chosen, or guard them with a provider check if needed.
 
 3. **Write provider-agnostic functions.** `analyze_document` accepts a
    provider name and builds the config internally. The prompt, source, and
@@ -253,8 +254,9 @@ async def test_analyze_document_mock(provider: str) -> None:
 ## What to Watch For
 
 - **Keep the portable subset in mind.** Text generation, structured output,
-  tool calling, and conversation continuity work on both providers. Context
-  caching is Gemini-only. YouTube URLs have limited OpenAI support.
+  tool calling, and conversation continuity work on all providers. Context
+  caching has different paradigms (explicit for Gemini, implicit for Anthropic).
+  YouTube URLs have limited OpenAI support.
   Check [Provider Capabilities](reference/provider-capabilities.md).
 - **Config errors are your portability signal.** A `ConfigurationError` for
   an unsupported feature marks the boundary of portability. Handle it at
diff --git a/docs/reference/provider-capabilities.md b/docs/reference/provider-capabilities.md
index a6638f8..2dce4cb 100644
--- a/docs/reference/provider-capabilities.md
+++ b/docs/reference/provider-capabilities.md
@@ -20,7 +20,8 @@ Pollux is **capability-transparent**, not capability-equalizing: providers are a
 | PDF URL inputs | ✅ (via URI part) | ✅ (native `input_file.file_url`) | ✅ (native `document` URL block) | |
 | Image URL inputs | ✅ (via URI part) | ✅ (native `input_image.image_url`) | ✅ (native `image` URL block) | |
 | YouTube URL inputs | ✅ | ⚠️ limited | ⚠️ limited | OpenAI/Anthropic parity layers (download/re-upload) are out of scope |
-| Provider-side context caching | ✅ | ❌ | ❌ | OpenAI and Anthropic providers return unsupported for caching |
+| Explicit context caching (`create_cache`) | ✅ | ❌ | ❌ | Persistent cache handles are Gemini-only |
+| Implicit prompt caching (`Options.implicit_caching`) | ❌ | ❌ | ✅ | Anthropic-only request-level optimization |
 | Structured outputs (`response_schema`) | ✅ | ✅ | ✅ | JSON-schema path in all providers |
 | Reasoning controls (`reasoning_effort`) | ✅ | ✅ | ✅ | Passed through to provider; see notes below |
 | Deferred delivery (`delivery_mode="deferred"`) | ❌ | ❌ | ❌ | Not supported; raises `ConfigurationError` |
@@ -69,6 +70,9 @@ Pollux is **capability-transparent**, not capability-equalizing: providers are a
 ### Anthropic
 
 - Remote URL support is intentionally narrow: images and PDFs only.
+- Implicit prompt caching is controlled with `Options(implicit_caching=...)`.
+  Pollux defaults it on for single-call workloads and off for multi-call
+  fan-out. Requesting it on unsupported providers raises `ConfigurationError`.
 - Reasoning: `reasoning_effort` maps to `output_config.effort`.
   Pollux uses `thinking.type="adaptive"` on adaptive-capable models
   (currently Opus 4.6 and Sonnet 4.6) and falls back to manual thinking budgets on older
diff --git a/src/pollux/execute.py b/src/pollux/execute.py
index b3c2842..9872b25 100644
--- a/src/pollux/execute.py
+++ b/src/pollux/execute.py
@@ -103,6 +103,11 @@ async def execute_plan(plan: Plan, provider: Provider) -> ExecutionTrace:
             "Provider does not support reasoning controls",
             hint="Remove reasoning_effort or choose a provider with reasoning support.",
         )
+    if options.implicit_caching is True and not caps.implicit_caching:
+        raise ConfigurationError(
+            "Provider does not support implicit caching",
+            hint="Remove implicit_caching=True or choose a provider with implicit caching support.",
+        )
     if wants_conversation and not caps.conversation:
         raise ConfigurationError(
             "Provider does not support conversation continuity",
@@ -178,6 +183,11 @@ async def execute_plan(plan: Plan, provider: Provider) -> ExecutionTrace:
     upload_lock = asyncio.Lock()
     retry_policy = config.retry
     responses: list[dict[str, Any]] = []
+    implicit_caching = (
+        options.implicit_caching
+        if options.implicit_caching is not None
+        else caps.implicit_caching and len(prompts) == 1
+    )
     total_usage: dict[str, int] = {}
     conversation_state: dict[str, Any] | None = None
 
@@ -288,6 +298,7 @@ async def _execute_call(call_idx: int) -> dict[str, Any]:
                         previous_response_id=previous_response_id,
                         provider_state=request_provider_state,
                         max_tokens=options.max_tokens,
+                        implicit_caching=implicit_caching,
                     )
 
                     if retry_policy.max_attempts <= 1:
diff --git a/src/pollux/options.py b/src/pollux/options.py
index 0ea6ee2..859855c 100644
--- a/src/pollux/options.py
+++ b/src/pollux/options.py
@@ -47,6 +47,9 @@ class Options:
     max_tokens: int | None = None
     #: Persistent context cache obtained from ``create_cache()``.
     cache: CacheHandle | None = None
+    #: Controls implicit model-level caching (e.g., Anthropic prefix caching).
+    #: ``None`` means auto: on for a single call to a supporting provider, else off.
+    implicit_caching: bool | None = None
 
     def __post_init__(self) -> None:
         """Validate option shapes early for clear errors."""
diff --git a/src/pollux/providers/anthropic.py b/src/pollux/providers/anthropic.py
index 2c90e71..ebbc193 100644
--- a/src/pollux/providers/anthropic.py
+++ b/src/pollux/providers/anthropic.py
@@ -67,6 +67,7 @@ def capabilities(self) -> ProviderCapabilities:
             reasoning=True,
             deferred_delivery=False,
             conversation=True,
+            implicit_caching=True,
         )
 
     @staticmethod
@@ -231,8 +232,12 @@ async def generate(
             ),
         }
 
+        if request.implicit_caching:
+            create_kwargs["cache_control"] = {"type": "ephemeral"}
+
         if request.system_instruction:
             create_kwargs["system"] = request.system_instruction
+
         if request.temperature is not None:
             create_kwargs["temperature"] = request.temperature
         if request.top_p is not None:
diff --git a/src/pollux/providers/base.py b/src/pollux/providers/base.py
index ee9bebe..bfb191e 100644
--- a/src/pollux/providers/base.py
+++ b/src/pollux/providers/base.py
@@ -25,6 +25,7 @@ class ProviderCapabilities:
     reasoning: bool = False
     deferred_delivery: bool = False
     conversation: bool = False
+    implicit_caching: bool = False
 
 
 @runtime_checkable
diff --git a/src/pollux/providers/mock.py b/src/pollux/providers/mock.py
index c7dc2c5..64c3f92 100644
--- a/src/pollux/providers/mock.py
+++ b/src/pollux/providers/mock.py
@@ -27,6 +27,7 @@ def capabilities(self) -> ProviderCapabilities:
             reasoning=False,
             deferred_delivery=False,
             conversation=False,
+            implicit_caching=False,
         )
 
     async def generate(self, request: ProviderRequest) -> ProviderResponse:
diff --git a/src/pollux/providers/models.py b/src/pollux/providers/models.py
index a313389..a9f17ef 100644
--- a/src/pollux/providers/models.py
+++ b/src/pollux/providers/models.py
@@ -53,6 +53,7 @@ class ProviderRequest:
     previous_response_id: str | None = None
     provider_state: dict[str, Any] | None = None
     max_tokens: int | None = None
+    implicit_caching: bool = False
 
 
 @dataclass
diff --git a/tests/conftest.py b/tests/conftest.py
index 74d68c2..dc24575 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -58,6 +58,7 @@ async def generate(self, request: ProviderRequest) -> ProviderResponse:
             "history": request.history,
             "previous_response_id": request.previous_response_id,
             "provider_state": request.provider_state,
+            "implicit_caching": request.implicit_caching,
         }
         prompt = (
             request.parts[-1]
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index a7a078b..911f35b 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -26,7 +26,13 @@
 from pollux.request import normalize_request
 from pollux.retry import RetryPolicy
 from pollux.source import Source
-from tests.conftest import CACHE_MODEL, GEMINI_MODEL, OPENAI_MODEL, FakeProvider
+from tests.conftest import (
+    ANTHROPIC_MODEL,
+    CACHE_MODEL,
+    GEMINI_MODEL,
+    OPENAI_MODEL,
+    FakeProvider,
+)
 from tests.helpers import CaptureProvider as KwargsCaptureProvider
 from tests.helpers import GateProvider, ScriptedProvider
 
@@ -1325,6 +1331,78 @@ class ExampleSchema(BaseModel):
     assert response_schema["type"] == "object"
 
 
+@pytest.mark.asyncio
+async def test_implicit_caching_defaults_to_true_for_single_call_when_supported(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Single-call Anthropic-style workloads should default implicit caching on."""
+    fake = KwargsCaptureProvider(
+        _capabilities=ProviderCapabilities(
+            persistent_cache=False,
+            uploads=True,
+            structured_outputs=False,
+            reasoning=False,
+            deferred_delivery=False,
+            conversation=False,
+            implicit_caching=True,
+        )
+    )
+    monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake)
+    cfg = Config(provider="anthropic", model=ANTHROPIC_MODEL, use_mock=True)
+
+    await pollux.run("Q1?", config=cfg)
+
+    assert len(fake.generate_kwargs) == 1
+    request = fake.generate_kwargs[0]["request"]
+    assert request.implicit_caching is True
+
+
+@pytest.mark.asyncio
+async def test_implicit_caching_defaults_to_false_for_multi_call_fanout(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Multi-call fan-out should default implicit caching off."""
+    fake = KwargsCaptureProvider(
+        _capabilities=ProviderCapabilities(
+            persistent_cache=False,
+            uploads=True,
+            structured_outputs=False,
+            reasoning=False,
+            deferred_delivery=False,
+            conversation=False,
+            implicit_caching=True,
+        )
+    )
+    monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake)
+    cfg = Config(provider="anthropic", model=ANTHROPIC_MODEL, use_mock=True)
+
+    await pollux.run_many(("Q1?", "Q2?"), config=cfg)
+
+    assert len(fake.generate_kwargs) == 2
+    assert all(
+        call["request"].implicit_caching is False for call in fake.generate_kwargs
+    )
+
+
+@pytest.mark.asyncio
+async def test_implicit_caching_requires_provider_capability_when_enabled(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Explicit implicit_caching=True should fail on providers that lack it."""
+    fake = FakeProvider()
+    monkeypatch.setattr(pollux, "_get_provider", lambda _config: fake)
+    cfg = Config(provider="gemini", model=GEMINI_MODEL, use_mock=True)
+
+    with pytest.raises(ConfigurationError, match="implicit caching") as exc:
+        await pollux.run(
+            "Q1?",
+            config=cfg,
+            options=Options(implicit_caching=True),
+        )
+
+    assert exc.value.hint is not None
+
+
 @pytest.mark.asyncio
 async def test_delivery_mode_deferred_is_explicitly_not_implemented(
     monkeypatch: pytest.MonkeyPatch,
diff --git a/tests/test_providers.py b/tests/test_providers.py
index b52ccfb..bf55ffb 100644
--- a/tests/test_providers.py
+++ b/tests/test_providers.py
@@ -1748,11 +1748,32 @@ async def test_anthropic_generate_with_system_instruction() -> None:
             model=ANTHROPIC_MODEL,
             parts=["Hello"],
             system_instruction="Be concise.",
+            implicit_caching=True,
         )
     )
 
     assert messages.last_kwargs is not None
     assert messages.last_kwargs["system"] == "Be concise."
+    assert messages.last_kwargs["cache_control"] == {"type": "ephemeral"}
+
+
+@pytest.mark.asyncio
+async def test_anthropic_generate_with_implicit_caching_disabled() -> None:
+    """Disabling implicit_caching omits Anthropic cache_control."""
+    provider, messages = _anthropic_provider_with_fake()
+
+    await provider.generate(
+        ProviderRequest(
+            model=ANTHROPIC_MODEL,
+            parts=["Hello"],
+            system_instruction="Be concise.",
+            implicit_caching=False,
+        )
+    )
+
+    assert messages.last_kwargs is not None
+    assert messages.last_kwargs["system"] == "Be concise."
+    assert "cache_control" not in messages.last_kwargs
 
 
 @pytest.mark.asyncio