Merged
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug.md
@@ -40,7 +40,7 @@ labels: [bug]
<!--
Anything that may affect repro:
- Input type (local file, PDF URL, image URL, YouTube URL)
- Config toggles (for example enable_caching)
- Config toggles (for example use_mock, request_concurrency)
- Expected limits/cost considerations
-->

1 change: 0 additions & 1 deletion README.md
@@ -138,7 +138,6 @@ from pollux import Config
config = Config(
provider="gemini",
model="gemini-2.5-flash-lite",
enable_caching=True, # Gemini-only in v1.0
)
```

25 changes: 12 additions & 13 deletions cookbook/optimization/cache-warming-and-ttl.py
@@ -7,7 +7,7 @@

Pattern:
- Keep prompts and sources fixed.
- Enable caching with a meaningful TTL.
- Create a persistent cache via ``create_cache()``.
- Run once to warm and once to reuse (back-to-back).
- Compare tokens and cache signal.
"""
@@ -16,7 +16,6 @@

import argparse
import asyncio
from dataclasses import replace
from pathlib import Path
from typing import TYPE_CHECKING

@@ -28,15 +27,15 @@
print_section,
)
from cookbook.utils.runtime import add_runtime_args, build_config_or_exit, usage_tokens
from pollux import Config, Source, run_many
from pollux import Config, Options, Source, create_cache, run_many

if TYPE_CHECKING:
from pollux.result import ResultEnvelope

PROMPTS = [
PROMPTS = (
"List 5 key concepts with one-sentence explanations.",
"Extract three actionable recommendations.",
]
)


def describe(run_name: str, envelope: ResultEnvelope) -> None:
@@ -52,15 +51,17 @@ def describe(run_name: str, envelope: ResultEnvelope) -> None:
)


async def main_async(directory: Path, *, limit: int, config: Config) -> None:
async def main_async(directory: Path, *, limit: int, config: Config, ttl: int) -> None:
files = sorted(path for path in directory.rglob("*") if path.is_file())[:limit]
if not files:
raise SystemExit(f"No files found under: {directory}")

sources = [Source.from_file(path) for path in files]

warm = await run_many(PROMPTS, sources=sources, config=config)
reuse = await run_many(PROMPTS, sources=sources, config=config)
handle = await create_cache(sources, config=config, ttl_seconds=ttl)

warm = await run_many(PROMPTS, config=config, options=Options(cache=handle))
reuse = await run_many(PROMPTS, config=config, options=Options(cache=handle))
warm_tokens = usage_tokens(warm)
reuse_tokens = usage_tokens(reuse)
saved = None
@@ -107,16 +108,14 @@ def main() -> None:
hint="No input directory found. Run `just demo-data` or pass --input /path/to/dir.",
)
config = build_config_or_exit(args)
cached_config = replace(
config, enable_caching=True, ttl_seconds=max(1, int(args.ttl))
)

print_header("Cache warming and TTL", config=cached_config)
print_header("Cache warming and TTL", config=config)
asyncio.run(
main_async(
directory,
limit=max(1, int(args.limit)),
config=cached_config,
config=config,
ttl=max(1, int(args.ttl)),
)
)

5 changes: 1 addition & 4 deletions cookbook/utils/runtime.py
@@ -60,14 +60,11 @@ def build_config_or_exit(args: argparse.Namespace) -> Config:
def print_run_mode(config: Config) -> None:
"""Print a compact runtime mode line for recipe users."""
mode = "mock" if config.use_mock else "real-api"
caching = f"on(ttl={config.ttl_seconds}s)" if config.enable_caching else "off"
extra = ""
# Keep the mode line compact; only call out non-default concurrency.
if getattr(config, "request_concurrency", 6) != 6:
extra = f" | request_concurrency={config.request_concurrency}"
print(
f"Mode: {mode} | provider={config.provider} | model={config.model} | caching={caching}{extra}"
)
print(f"Mode: {mode} | provider={config.provider} | model={config.model}{extra}")


def usage_tokens(envelope: ResultEnvelope) -> int | None:
61 changes: 39 additions & 22 deletions docs/caching.md
@@ -1,5 +1,5 @@
<!-- Intent: Teach context caching mechanics: the redundant-context problem,
enabling caching, cache identity, TTL tuning, and when caching pays off.
creating a cache, cache identity, TTL tuning, and when caching pays off.
Do NOT cover source patterns or structured output in depth — link to those
pages. Assumes the reader understands run_many() and fan-out workflows.
Register: conceptual opening → guided applied. -->
@@ -73,29 +73,30 @@ compare_efficiency(946_800, 10)

More questions on the same content = greater savings.
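That intuition reduces to simple arithmetic. A minimal sketch (hypothetical helper name, playing the same role as the `compare_efficiency` call above; it treats cached context as free, whereas real billing charges cached tokens at a reduced rate rather than zero):

```python
def token_cost(context_tokens: int, prompt_tokens: int, n_prompts: int) -> tuple[int, int]:
    """Compare total input tokens with and without a persistent context cache."""
    # Uncached: the full context is re-sent with every prompt.
    uncached = n_prompts * (context_tokens + prompt_tokens)
    # Cached: the context is uploaded once, then only prompts are sent.
    cached = context_tokens + n_prompts * prompt_tokens
    return uncached, cached

uncached, cached = token_cost(946_800, 50, 10)
print(f"uncached={uncached:,} cached={cached:,} saved={uncached - cached:,}")
```

With ten prompts over the same ~947K-token context, the uncached total is roughly ten times the cached one; each additional prompt widens the gap.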

## Enabling Caching
## Creating a Cache

Let's see this in practice. Two flags in `Config` control caching:
Use `create_cache()` to upload content to the provider once, then pass the
returned handle to `run()` or `run_many()` via `Options(cache=handle)`:

```python
import asyncio
from pollux import Config, Source, run_many
from pollux import Config, Options, Source, create_cache, run_many

async def main() -> None:
config = Config(
provider="gemini",
model="gemini-2.5-flash-lite",
enable_caching=True,
ttl_seconds=3600,
)
prompts = ["Summarize in one sentence.", "List 3 keywords."]
sources = [Source.from_text(
"ACME Corp Q3 2025 earnings: revenue $4.2B (+12% YoY), "
"operating margin 18.5%, guidance raised for Q4."
)]

first = await run_many(prompts=prompts, sources=sources, config=config)
second = await run_many(prompts=prompts, sources=sources, config=config)
handle = await create_cache(sources, config=config, ttl_seconds=3600)

prompts = ["Summarize in one sentence.", "List 3 keywords."]
first = await run_many(prompts=prompts, config=config, options=Options(cache=handle))
second = await run_many(prompts=prompts, config=config, options=Options(cache=handle))

print("first:", first["status"])
print("second:", second["status"])
@@ -106,27 +107,43 @@ asyncio.run(main())

### Step-by-Step Walkthrough

1. **Set `enable_caching=True`.** This tells Pollux to upload content to the
provider's cache on the first call, rather than sending it inline.
1. **Call `create_cache()`.** Pass your sources, config, and a TTL. Pollux
uploads the content to the provider and returns a `CacheHandle`.

2. **Set `ttl_seconds`.** The TTL controls how long the cached content lives on
the provider. Match it to your reuse window. 3600s (1 hour) is a
reasonable default for interactive sessions.

3. **Run the same sources with different prompts.** The first `run_many()` call
uploads the content and creates a cache entry. The second call detects the
same content hash and reuses the cached reference.
3. **Pass the handle via `Options(cache=handle)`.** Each `run()` or `run_many()`
call that uses this handle references the cached content instead of
re-uploading it.

4. **Verify with `metrics.cache_used`.** Check
`result["metrics"]["cache_used"]` on subsequent calls. `True` confirms
the provider served content from cache rather than re-uploading.

Pollux computes cache identity from model + source content hash. The second
call reuses the cached context automatically.
Pollux computes cache identity from model + source content hash. Calls with
the same handle reuse the cached context automatically.

!!! warning "Options restricted when using a cache handle"
When `Options(cache=handle)` is set, the following fields **cannot** be
passed alongside it:

- `system_instruction` — bake it into `create_cache(system_instruction=...)`
instead.
- `tools` — bake them into `create_cache(tools=...)` instead (when
supported).
- `tool_choice` — remove it when using cached content.
- `sources` — bake them into `create_cache()` instead.

Pollux raises `ConfigurationError` immediately if it detects these
conflicts. This mirrors a hard constraint in the Gemini API, where
`cached_content` cannot coexist with `system_instruction`, `tools`, or
`tool_config` in the same `GenerateContent` request.
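The restriction amounts to a mutual-exclusion check at options-validation time. A minimal sketch of that check (hypothetical names, not Pollux's actual internals — the real validation lives inside the library):

```python
class ConfigurationError(ValueError):
    """Stand-in for Pollux's ConfigurationError (sketch)."""

def check_cache_conflicts(options: dict) -> None:
    """Reject option fields that must be baked into create_cache() instead."""
    if options.get("cache") is None:
        return  # no cache handle, nothing to restrict
    forbidden = ("system_instruction", "tools", "tool_choice", "sources")
    conflicts = [name for name in forbidden if options.get(name) is not None]
    if conflicts:
        raise ConfigurationError(
            f"Options(cache=...) cannot be combined with: {', '.join(conflicts)}"
        )
```

Failing fast here, before any network call, is what lets the error carry an actionable hint instead of surfacing as a provider-side rejection.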

## Cache Identity

Cache keys are deterministic: `hash(model + content hashes of sources)`.
Cache keys are deterministic: `hash(model + provider + content hashes of sources)`.
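A sketch of such a deterministic key (illustrative only — Pollux's actual hashing is internal and may differ):

```python
import hashlib

def cache_key(model: str, provider: str, source_bytes: list[bytes]) -> str:
    """Deterministic cache identity: same model + provider + content -> same key."""
    h = hashlib.sha256()
    h.update(provider.encode())
    h.update(model.encode())
    for blob in source_bytes:
        # Hash content, not file paths, so renamed-but-identical files still hit.
        h.update(hashlib.sha256(blob).digest())
    return h.hexdigest()
```

Identical inputs always produce the same key; changing the model, the provider, or any byte of any source yields a new key and therefore a new cache entry.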

This means:

@@ -156,8 +173,8 @@ behavior. Usage counters are provider-dependent.

## Tuning TTL

The default TTL is 3600 seconds (1 hour). Tune `ttl_seconds` to match your
expected reuse window:
Pass `ttl_seconds` to `create_cache()` to control the cache lifetime. The
default is 3600 seconds (1 hour). Tune it to match your expected reuse window:

- **Too short:** the cache expires before you reuse it, wasting the
warm-up cost.
@@ -181,9 +198,9 @@ caching and enable it when you see repeated context in your workload.

## Provider Dependency

Context caching is **Gemini-only**. Enabling it with OpenAI raises
an actionable error. See
[Provider Capabilities](reference/provider-capabilities.md) for the full
Persistent context caching is **Gemini-only**. Calling `create_cache()` with
a provider that lacks `persistent_cache` support raises an actionable error.
See [Provider Capabilities](reference/provider-capabilities.md) for the full
matrix.
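Because support is a capability flag rather than a `Config` field, a portable caller can gate on it up front. A sketch with a hypothetical capability table (the authoritative matrix is the page linked above):

```python
# Hypothetical capability table for illustration; consult the real matrix.
PROVIDER_CAPS: dict[str, set[str]] = {
    "gemini": {"persistent_cache"},
    "openai": set(),  # no persistent_cache in v1.0
}

def supports_persistent_cache(provider: str) -> bool:
    """True if the provider can back create_cache(); unknown providers -> False."""
    return "persistent_cache" in PROVIDER_CAPS.get(provider, set())
```

Callers that check this flag can fall back to uncached `run_many()` instead of catching the error after the fact.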

---
14 changes: 9 additions & 5 deletions docs/configuration.md
@@ -35,8 +35,6 @@ All fields and their defaults:
| `model` | `str` | *(required)* | Model identifier |
| `api_key` | `str \| None` | `None` | Explicit key; auto-resolved from env if omitted |
| `use_mock` | `bool` | `False` | Use mock provider (no network calls) |
| `enable_caching` | `bool` | `False` | Enable provider-side context caching |
| `ttl_seconds` | `int` | `3600` | Cache time-to-live in seconds |
| `request_concurrency` | `int` | `6` | Max concurrent API calls in multi-prompt execution |
| `retry` | `RetryPolicy` | `RetryPolicy()` | Retry configuration |

@@ -78,16 +76,14 @@ pipeline logic, testing integrations, and CI.
config = Config(
provider="gemini",
model="gemini-2.5-flash-lite",
enable_caching=True, # Reuse uploaded context (Gemini-only)
ttl_seconds=3600, # Cache lifetime
request_concurrency=6, # Concurrent API calls
)
```

| Need | Direction |
|---|---|
| Fast iteration without API calls | `use_mock=True` |
| Reduce token spend on repeated context | `enable_caching=True`. See [Reducing Costs with Context Caching](caching.md) |
| Reduce token spend on repeated context | Use `create_cache()`. See [Reducing Costs with Context Caching](caching.md) |
| Higher throughput for many prompts/sources | Increase `request_concurrency` |
| Better resilience to transient failures | Customize `retry=RetryPolicy(...)` |

@@ -151,6 +147,7 @@ options = Options(
| `delivery_mode` | `str` | `"realtime"` | Only `"realtime"` is supported; `"deferred"` raises an error |
| `history` | `list[dict] \| None` | `None` | Conversation history. See [Continuing Conversations Across Turns](conversations-and-agents.md) |
| `continue_from` | `ResultEnvelope \| None` | `None` | Resume from a prior result. See [Continuing Conversations Across Turns](conversations-and-agents.md) |
| `cache` | `CacheHandle \| None` | `None` | Persistent context cache. See [Reducing Costs with Context Caching](caching.md) |

!!! note
OpenAI GPT-5 family models (`gpt-5`, `gpt-5-mini`, `gpt-5-nano`) reject
@@ -159,6 +156,13 @@ options = Options(
See [Writing Portable Code Across Providers](portable-code.md#model-specific-constraints)
for the full constraints mapping.

!!! warning "Cache handle restrictions"
When `cache` is set, `system_instruction`, `tools`, and `tool_choice`
**must not** be passed in the same `Options`. `system_instruction` and
`tools` can be baked into `create_cache()`, while `tool_choice` must be
set only on uncached calls. See
[Reducing Costs with Context Caching](caching.md) for details.

## Safety Notes

- `Config` is immutable (`frozen=True`). Create a new instance to change values.
29 changes: 11 additions & 18 deletions docs/portable-code.md
@@ -35,8 +35,7 @@ varying parts in config; keep the stable parts in functions.

## Complete Example

A document analysis function that works on any provider. Caching is used
when available, skipped otherwise.
A document analysis function that works on any provider.

```python
import asyncio
@@ -55,38 +54,32 @@ class DocumentSummary(BaseModel):

@dataclass
class ProviderConfig:
"""Maps a provider to a model and capability flags."""
"""Maps a provider to a model."""
provider: str
model: str
supports_caching: bool = False


# Provider-specific details live here, not in your pipeline logic
PROVIDERS = {
"gemini": ProviderConfig("gemini", "gemini-2.5-flash-lite", supports_caching=True),
"gemini": ProviderConfig("gemini", "gemini-2.5-flash-lite"),
"openai": ProviderConfig("openai", "gpt-5-nano"),
}


def make_config(provider_name: str, *, enable_caching: bool = False) -> Config:
def make_config(provider_name: str) -> Config:
"""Build a Config for the given provider with safe defaults."""
pc = PROVIDERS[provider_name]
return Config(
provider=pc.provider,
model=pc.model,
enable_caching=enable_caching and pc.supports_caching,
)
return Config(provider=pc.provider, model=pc.model)


async def analyze_document(
file_path: str,
prompt: str,
*,
provider_name: str = "gemini",
enable_caching: bool = False,
) -> DocumentSummary:
"""Analyze a document — works with any supported provider."""
config = make_config(provider_name, enable_caching=enable_caching)
config = make_config(provider_name)
options = Options(response_schema=DocumentSummary)

result = await run(
@@ -118,12 +111,12 @@ asyncio.run(main())
### Step-by-Step Walkthrough

1. **Centralize provider details.** `ProviderConfig` maps each provider to
its model and capability flags. Your analysis functions never reference
provider names or models directly.
its model. Your analysis functions never reference provider names or
models directly.

2. **Guard capability-specific features.** `make_config` only enables caching
when both the caller requests it *and* the provider supports it. This
avoids `ConfigurationError` at runtime.
2. **Use `create_cache()` for persistent caching.** Caching is now
opt-in via `create_cache()` and `Options(cache=handle)`. Only call
it when the provider supports `persistent_cache` (e.g. Gemini).

3. **Write provider-agnostic functions.** `analyze_document` accepts a
provider name and builds the config internally. The prompt, source, and
4 changes: 4 additions & 0 deletions docs/reference/api.md
@@ -14,10 +14,14 @@ The primary execution functions are exported from `pollux`:

::: pollux.continue_tool

::: pollux.create_cache

## Core Types

::: pollux.Source

::: pollux.CacheHandle

::: pollux.Config

::: pollux.Options
23 changes: 11 additions & 12 deletions docs/reference/provider-capabilities.md
@@ -84,20 +84,19 @@ Pollux is **capability-transparent**, not capability-equalizing: providers are a

When a requested feature is unsupported for the selected provider or release scope, Pollux raises `ConfigurationError` or `APIError` with a concrete hint, instead of degrading silently.

For example, enabling caching with OpenAI:
For example, creating a persistent cache with OpenAI:

```python
from pollux import Config

config = Config(
provider="openai",
model="gpt-5-nano",
enable_caching=True, # not supported for OpenAI
from pollux import Config, Source, create_cache

config = Config(provider="openai", model="gpt-5-nano")
# This raises immediately:
# ConfigurationError: Provider 'openai' does not support persistent caching
# hint: "Use a provider that supports persistent_cache (e.g. Gemini)."
handle = await create_cache(
[Source.from_text("hello")], config=config
)
# At execution time, Pollux raises:
# ConfigurationError: Provider does not support caching
# hint: "Disable caching or choose a provider with caching support."
```

The error is raised at execution time (not at `Config` creation) because
caching support is a provider capability checked during plan execution.
The error is raised at `create_cache()` call time because persistent caching
is a provider capability checked before the upload attempt.