From 06aa47aa6551de974548bbc6c66fabcfa02d0f08 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Sat, 21 Mar 2026 11:02:28 +0000 Subject: [PATCH 01/34] feat(proxy): durable http bridge ownership --- .../20260321_120000_add_http_bridge_leases.py | 74 ++++ ...322_000000_merge_http_bridge_lease_head.py | 25 ++ app/db/models.py | 28 ++ app/dependencies.py | 2 + app/modules/proxy/api.py | 18 +- app/modules/proxy/bridge_repository.py | 165 +++++++ app/modules/proxy/repo_bundle.py | 2 + app/modules/proxy/service.py | 401 +++++++++++++++--- .../.openspec.yaml | 2 + .../durable-http-bridge-ownership/proposal.md | 17 + .../specs/responses-api-compat/spec.md | 26 ++ .../durable-http-bridge-ownership/tasks.md | 18 + .../specs/responses-api-compat/context.md | 8 +- openspec/specs/responses-api-compat/ops.md | 6 +- openspec/specs/responses-api-compat/spec.md | 27 +- .../integration/test_http_responses_bridge.py | 82 ++-- .../test_load_balancer_integration.py | 2 + .../unit/test_proxy_load_balancer_refresh.py | 2 + .../test_proxy_service_additional_limits.py | 2 + 19 files changed, 811 insertions(+), 96 deletions(-) create mode 100644 app/db/alembic/versions/20260321_120000_add_http_bridge_leases.py create mode 100644 app/db/alembic/versions/20260322_000000_merge_http_bridge_lease_head.py create mode 100644 app/modules/proxy/bridge_repository.py create mode 100644 openspec/changes/durable-http-bridge-ownership/.openspec.yaml create mode 100644 openspec/changes/durable-http-bridge-ownership/proposal.md create mode 100644 openspec/changes/durable-http-bridge-ownership/specs/responses-api-compat/spec.md create mode 100644 openspec/changes/durable-http-bridge-ownership/tasks.md diff --git a/app/db/alembic/versions/20260321_120000_add_http_bridge_leases.py b/app/db/alembic/versions/20260321_120000_add_http_bridge_leases.py new file mode 100644 index 00000000..63421b56 --- /dev/null +++ b/app/db/alembic/versions/20260321_120000_add_http_bridge_leases.py @@ -0,0 +1,74 @@ +"""add http_bridge_leases table + +Revision ID: 20260321_120000_add_http_bridge_leases +Revises: 20260320_000000_add_request_log_requested_actual_tiers +Create Date: 2026-03-21 +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.engine import Connection + +# revision identifiers, used by Alembic. +revision = "20260321_120000_add_http_bridge_leases" +down_revision = "20260320_000000_add_request_log_requested_actual_tiers" +branch_labels = None +depends_on = None + + +def _table_exists(connection: Connection, table_name: str) -> bool: + inspector = sa.inspect(connection) + return inspector.has_table(table_name) + + +def _index_exists(connection: Connection, index_name: str, table_name: str) -> bool: + inspector = sa.inspect(connection) + if not inspector.has_table(table_name): + return False + return any(index["name"] == index_name for index in inspector.get_indexes(table_name)) + + +def upgrade() -> None: + bind = op.get_bind() + if not _table_exists(bind, "http_bridge_leases"): + op.create_table( + "http_bridge_leases", + sa.Column("session_id", sa.String(), primary_key=True), + sa.Column("affinity_kind", sa.String(), nullable=False), + sa.Column("affinity_key", sa.String(), nullable=False), + sa.Column("api_key_scope", sa.String(), nullable=False, server_default=sa.text("''")), + sa.Column("owner_instance_id", sa.String(), nullable=False), + sa.Column("lease_expires_at", sa.DateTime(), nullable=False), + sa.Column("account_id", sa.String(), nullable=True), + sa.Column("request_model", sa.String(), nullable=True), + sa.Column("codex_session", sa.Boolean(), nullable=False, server_default=sa.false()), + sa.Column("idle_ttl_seconds", sa.Float(), nullable=False), + sa.Column("upstream_turn_state", sa.String(), nullable=True), + sa.Column("downstream_turn_state", sa.String(), nullable=True), + sa.Column("created_at", sa.DateTime(), server_default=sa.func.now(), nullable=False), + sa.Column("updated_at", sa.DateTime(), server_default=sa.func.now(), nullable=False), + ) + if not _index_exists(bind, "ix_http_bridge_leases_owner_expires", "http_bridge_leases"): + op.create_index( + "ix_http_bridge_leases_owner_expires", + "http_bridge_leases", + ["owner_instance_id", "lease_expires_at"], + ) + if not _index_exists(bind, "ix_http_bridge_leases_expires", "http_bridge_leases"): + op.create_index( + "ix_http_bridge_leases_expires", + "http_bridge_leases", + ["lease_expires_at"], + ) + + +def downgrade() -> None: + bind = op.get_bind() + if _table_exists(bind, "http_bridge_leases"): + if _index_exists(bind, "ix_http_bridge_leases_expires", "http_bridge_leases"): + op.drop_index("ix_http_bridge_leases_expires", table_name="http_bridge_leases") + if _index_exists(bind, "ix_http_bridge_leases_owner_expires", "http_bridge_leases"): + op.drop_index("ix_http_bridge_leases_owner_expires", table_name="http_bridge_leases") + op.drop_table("http_bridge_leases") diff --git a/app/db/alembic/versions/20260322_000000_merge_http_bridge_lease_head.py b/app/db/alembic/versions/20260322_000000_merge_http_bridge_lease_head.py new file mode 100644 index 00000000..4fc5d63d --- /dev/null +++ b/app/db/alembic/versions/20260322_000000_merge_http_bridge_lease_head.py @@ -0,0 +1,25 @@ +"""merge http bridge lease head + +Revision ID: 20260322_000000_merge_http_bridge_lease_head +Revises: 20260321_120000_add_http_bridge_leases, 20260321_210000_merge_request_log_tiers_and_dashboard_index_heads +Create Date: 2026-03-22 +""" + +from __future__ import annotations + +# revision identifiers, used by Alembic. +revision = "20260322_000000_merge_http_bridge_lease_head" +down_revision = ( + "20260321_120000_add_http_bridge_leases", + "20260321_210000_merge_request_log_tiers_and_dashboard_index_heads", +) +branch_labels = None +depends_on = None + + +def upgrade() -> None: + pass + + +def downgrade() -> None: + pass diff --git a/app/db/models.py b/app/db/models.py index 085a18f1..af48475c 100644 --- a/app/db/models.py +++ b/app/db/models.py @@ -157,6 +157,34 @@ class StickySession(Base): ) +class HttpBridgeLease(Base): + __tablename__ = "http_bridge_leases" + __table_args__ = ( + Index("ix_http_bridge_leases_owner_expires", "owner_instance_id", "lease_expires_at"), + Index("ix_http_bridge_leases_expires", "lease_expires_at"), + ) + + session_id: Mapped[str] = mapped_column(String, primary_key=True) + affinity_kind: Mapped[str] = mapped_column(String, nullable=False) + affinity_key: Mapped[str] = mapped_column(String, nullable=False) + api_key_scope: Mapped[str] = mapped_column(String, nullable=False, default="", server_default=text("''")) + owner_instance_id: Mapped[str] = mapped_column(String, nullable=False) + lease_expires_at: Mapped[datetime] = mapped_column(DateTime, nullable=False) + account_id: Mapped[str | None] = mapped_column(String, nullable=True) + request_model: Mapped[str | None] = mapped_column(String, nullable=True) + codex_session: Mapped[bool] = mapped_column(Boolean, default=False, server_default=false(), nullable=False) + idle_ttl_seconds: Mapped[float] = mapped_column(Float, nullable=False) + upstream_turn_state: Mapped[str | None] = mapped_column(String, nullable=True) + downstream_turn_state: Mapped[str | None] = mapped_column(String, nullable=True) + created_at: Mapped[datetime] = mapped_column(DateTime, server_default=func.now(), nullable=False) + updated_at: Mapped[datetime] = mapped_column( + DateTime, + server_default=func.now(), + onupdate=func.now(), + nullable=False, + ) + + class DashboardSettings(Base): __tablename__ = "dashboard_settings" diff --git a/app/dependencies.py b/app/dependencies.py index c25ef49c..985f180a 100644 --- a/app/dependencies.py +++ b/app/dependencies.py @@ -19,6 +19,7 @@ from app.modules.firewall.repository import FirewallRepository from app.modules.firewall.service import FirewallService from app.modules.oauth.service import OauthService +from app.modules.proxy.bridge_repository import HttpBridgeLeasesRepository from app.modules.proxy.repo_bundle import ProxyRepositories from app.modules.proxy.service import ProxyService from app.modules.proxy.sticky_repository import StickySessionsRepository @@ -151,6 +152,7 @@ async def _proxy_repo_context() -> AsyncIterator[ProxyRepositories]: usage=UsageRepository(session), request_logs=RequestLogsRepository(session), sticky_sessions=StickySessionsRepository(session), + http_bridge_leases=HttpBridgeLeasesRepository(session), api_keys=ApiKeysRepository(session), additional_usage=AdditionalUsageRepository(session), ) diff --git a/app/modules/proxy/api.py b/app/modules/proxy/api.py index 34b018d4..69817341 100644 --- a/app/modules/proxy/api.py +++ b/app/modules/proxy/api.py @@ -445,13 +445,9 @@ async def _stream_responses( rate_limit_headers = await context.service.rate_limit_headers() bridge_active = prefer_http_bridge and proxy_service_module.get_settings().http_responses_session_bridge_enabled downstream_turn_state = ( - proxy_service_module.ensure_http_downstream_turn_state(request.headers) if bridge_active else None - ) - turn_state_headers = ( - proxy_service_module.build_downstream_turn_state_response_headers(downstream_turn_state) - if downstream_turn_state is not None - else {} + proxy_service_module.requested_http_downstream_turn_state(request.headers) if bridge_active else None ) + turn_state_headers: dict[str, str] = {} payload.stream = True if prefer_http_bridge: stream = context.service.stream_http_responses( @@ -464,6 +460,7 @@ async def _stream_responses( api_key_reservation=reservation, suppress_text_done_events=suppress_text_done_events, downstream_turn_state=downstream_turn_state, + response_headers_out=turn_state_headers, ) else: stream = context.service.stream_responses( @@ -521,13 +518,9 @@ async def _collect_responses( rate_limit_headers = await context.service.rate_limit_headers() bridge_active = prefer_http_bridge and proxy_service_module.get_settings().http_responses_session_bridge_enabled downstream_turn_state = ( - proxy_service_module.ensure_http_downstream_turn_state(request.headers) if bridge_active else None - ) - turn_state_headers = ( - proxy_service_module.build_downstream_turn_state_response_headers(downstream_turn_state) - if downstream_turn_state is not None - else {} + proxy_service_module.requested_http_downstream_turn_state(request.headers) if bridge_active else None ) + turn_state_headers: dict[str, str] = {} payload.stream = True if prefer_http_bridge: stream = context.service.stream_http_responses( @@ -540,6 +533,7 @@ async def _collect_responses( api_key_reservation=reservation, suppress_text_done_events=suppress_text_done_events, downstream_turn_state=downstream_turn_state, + response_headers_out=turn_state_headers, ) else: stream = context.service.stream_responses( diff --git a/app/modules/proxy/bridge_repository.py b/app/modules/proxy/bridge_repository.py new file mode 100644 index 00000000..28aaa75a --- /dev/null +++ b/app/modules/proxy/bridge_repository.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +from datetime import datetime + +from sqlalchemy import delete, select, update +from sqlalchemy.dialects.postgresql import insert as pg_insert +from sqlalchemy.dialects.sqlite import insert as sqlite_insert +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.sql import Insert, func + +from app.core.utils.time import to_utc_naive +from app.db.models import HttpBridgeLease + + +class HttpBridgeLeasesRepository: + def __init__(self, session: AsyncSession) -> None: + self._session = session + + async def get_by_session_id(self, session_id: str) -> HttpBridgeLease | None: + if not session_id: + return None + statement = select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id) + result = await self._session.execute(statement) + return result.scalar_one_or_none() + + async def upsert( + self, + *, + session_id: str, + affinity_kind: str, + affinity_key: str, + api_key_scope: str, + owner_instance_id: str, + lease_expires_at: datetime, + account_id: str | None, + request_model: str | None, + codex_session: bool, + idle_ttl_seconds: float, + upstream_turn_state: str | None, + downstream_turn_state: str | None, + ) -> HttpBridgeLease: + statement = self._build_upsert_statement( + session_id=session_id, + affinity_kind=affinity_kind, + affinity_key=affinity_key, + api_key_scope=api_key_scope, + owner_instance_id=owner_instance_id, + lease_expires_at=lease_expires_at, + account_id=account_id, + request_model=request_model, + codex_session=codex_session, + idle_ttl_seconds=idle_ttl_seconds, + upstream_turn_state=upstream_turn_state, + downstream_turn_state=downstream_turn_state, + ) + await self._session.execute(statement) + await self._session.commit() + row = await self.get_by_session_id(session_id) + if row is None: + raise RuntimeError(f"HttpBridgeLease upsert failed for session_id={session_id!r}") + await self._session.refresh(row) + return row + + async def delete(self, session_id: str) -> bool: + if not session_id: + return False + statement = delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id) + result = await self._session.execute(statement.returning(HttpBridgeLease.session_id)) + await self._session.commit() + return result.scalar_one_or_none() is not None + + async def touch( + self, + session_id: str, + *, + lease_expires_at: datetime, + account_id: str | None, + request_model: str | None, + codex_session: bool, + idle_ttl_seconds: float, + upstream_turn_state: str | None, + downstream_turn_state: str | None, + ) -> bool: + if not session_id: + return False + statement = ( + update(HttpBridgeLease) + .where(HttpBridgeLease.session_id == session_id) + .values( + lease_expires_at=to_utc_naive(lease_expires_at), + account_id=account_id, + request_model=request_model, + codex_session=codex_session, + idle_ttl_seconds=idle_ttl_seconds, + upstream_turn_state=upstream_turn_state, + downstream_turn_state=downstream_turn_state, + updated_at=func.now(), + ) + .returning(HttpBridgeLease.session_id) + ) + result = await self._session.execute(statement) + await self._session.commit() + return result.scalar_one_or_none() is not None + + async def purge_expired(self, *, expires_before: datetime) -> int: + statement = delete(HttpBridgeLease).where(HttpBridgeLease.lease_expires_at < to_utc_naive(expires_before)) + result = await self._session.execute(statement.returning(HttpBridgeLease.session_id)) + deleted = len(result.scalars().all()) + await self._session.commit() + return deleted + + def _build_upsert_statement( + self, + *, + session_id: str, + affinity_kind: str, + affinity_key: str, + api_key_scope: str, + owner_instance_id: str, + lease_expires_at: datetime, + account_id: str | None, + request_model: str | None, + codex_session: bool, + idle_ttl_seconds: float, + upstream_turn_state: str | None, + downstream_turn_state: str | None, + ) -> Insert: + dialect = self._session.get_bind().dialect.name + if dialect == "postgresql": + insert_fn = pg_insert + elif dialect == "sqlite": + insert_fn = sqlite_insert + else: + raise RuntimeError(f"HttpBridgeLease upsert unsupported for dialect={dialect!r}") + statement = insert_fn(HttpBridgeLease).values( + session_id=session_id, + affinity_kind=affinity_kind, + affinity_key=affinity_key, + api_key_scope=api_key_scope, + owner_instance_id=owner_instance_id, + lease_expires_at=to_utc_naive(lease_expires_at), + account_id=account_id, + request_model=request_model, + codex_session=codex_session, + idle_ttl_seconds=idle_ttl_seconds, + upstream_turn_state=upstream_turn_state, + downstream_turn_state=downstream_turn_state, + ) + return statement.on_conflict_do_update( + index_elements=[HttpBridgeLease.session_id], + set_={ + "affinity_kind": affinity_kind, + "affinity_key": affinity_key, + "api_key_scope": api_key_scope, + "owner_instance_id": owner_instance_id, + "lease_expires_at": to_utc_naive(lease_expires_at), + "account_id": account_id, + "request_model": request_model, + "codex_session": codex_session, + "idle_ttl_seconds": idle_ttl_seconds, + "upstream_turn_state": upstream_turn_state, + "downstream_turn_state": downstream_turn_state, + "updated_at": func.now(), + }, + ) diff --git a/app/modules/proxy/repo_bundle.py b/app/modules/proxy/repo_bundle.py index afa6508f..b7fe552a 100644 --- a/app/modules/proxy/repo_bundle.py +++ b/app/modules/proxy/repo_bundle.py @@ -6,6 +6,7 @@ from app.modules.accounts.repository import AccountsRepository from app.modules.api_keys.repository import ApiKeysRepository +from app.modules.proxy.bridge_repository import HttpBridgeLeasesRepository from app.modules.proxy.sticky_repository import StickySessionsRepository from app.modules.request_logs.repository import RequestLogsRepository from app.modules.usage.repository import AdditionalUsageRepository, UsageRepository @@ -17,6 +18,7 @@ class ProxyRepositories: usage: UsageRepository request_logs: RequestLogsRepository sticky_sessions: StickySessionsRepository + http_bridge_leases: HttpBridgeLeasesRepository api_keys: ApiKeysRepository additional_usage: AdditionalUsageRepository diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 9719a5fe..e42cf0bd 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -8,6 +8,7 @@ from collections import deque from collections.abc import Sequence from dataclasses import dataclass, field +from datetime import datetime, timedelta from hashlib import sha256 from typing import AsyncIterator, Mapping, NoReturn, cast from uuid import uuid4 @@ -59,6 +60,7 @@ from app.core.utils.request_id import ensure_request_id, get_request_id from app.core.utils.retry import backoff_seconds from app.core.utils.sse import format_sse_event, parse_sse_data_json +from app.core.utils.time import to_utc_naive, utcnow from app.db.models import Account, AccountStatus, DashboardSettings, StickySessionKind, UsageHistory from app.modules.accounts.auth_manager import AuthManager from app.modules.api_keys.service import ( @@ -122,6 +124,8 @@ _TRANSIENT_RETRY_CODES = frozenset({"server_error"}) _MAX_TRANSIENT_SAME_ACCOUNT_RETRIES = 3 _COMPACT_MAX_ACCOUNT_ATTEMPTS = 2 +_HTTP_BRIDGE_TURN_STATE_PREFIX = "http_turn_v2_" +_HTTP_BRIDGE_TURN_STATE_VERSION = 2 @dataclass(frozen=True, slots=True) @@ -187,6 +191,7 @@ def stream_http_responses( api_key_reservation: ApiKeyUsageReservationData | None = None, suppress_text_done_events: bool = False, downstream_turn_state: str | None = None, + response_headers_out: dict[str, str] | None = None, ) -> AsyncIterator[str]: _maybe_log_proxy_request_payload("stream_http", payload, headers) filtered = filter_inbound_headers(headers) @@ -200,6 +205,7 @@ def stream_http_responses( api_key_reservation=api_key_reservation, suppress_text_done_events=suppress_text_done_events, downstream_turn_state=downstream_turn_state, + response_headers_out=response_headers_out, ) async def _stream_http_bridge_or_retry( @@ -214,6 +220,7 @@ async def _stream_http_bridge_or_retry( api_key_reservation: ApiKeyUsageReservationData | None, suppress_text_done_events: bool, downstream_turn_state: str | None = None, + response_headers_out: dict[str, str] | None = None, ) -> AsyncIterator[str]: settings = await get_settings_cache().get() if not _http_responses_session_bridge_enabled(settings): @@ -245,6 +252,7 @@ async def _stream_http_bridge_or_retry( max_sessions=getattr(settings, "http_responses_session_bridge_max_sessions", 256), queue_limit=getattr(settings, "http_responses_session_bridge_queue_limit", 8), downstream_turn_state=downstream_turn_state, + response_headers_out=response_headers_out, ): yield line @@ -264,6 +272,7 @@ async def _stream_via_http_bridge( max_sessions: int, queue_limit: int, downstream_turn_state: str | None = None, + response_headers_out: dict[str, str] | None = None, ) -> AsyncIterator[str]: del propagate_http_errors, suppress_text_done_events request_id = ensure_request_id() @@ -329,8 +338,14 @@ async def _stream_via_http_bridge( text_data=text_data, queue_limit=queue_limit, ) - if downstream_turn_state is not None: - await self._register_http_bridge_turn_state(session, downstream_turn_state) + resolved_downstream_turn_state = self._resolve_http_bridge_downstream_turn_state( + session, + requested_turn_state=downstream_turn_state, + api_key_id=api_key.id if api_key is not None else None, + ) + await self._register_http_bridge_turn_state(session, resolved_downstream_turn_state) + if response_headers_out is not None: + response_headers_out.update(build_downstream_turn_state_response_headers(resolved_downstream_turn_state)) try: event_queue = request_state.event_queue @@ -1414,6 +1429,209 @@ async def _http_bridge_pending_count(self, session: "_HTTPBridgeSession") -> int async with session.pending_lock: return max(len(session.pending_requests), session.queued_request_count) + def _new_http_bridge_session_id(self) -> str: + return f"hbs_{uuid4().hex}" + + def _invalid_http_bridge_turn_state(self) -> ProxyResponseError: + return ProxyResponseError( + 409, + openai_error( + "bridge_token_invalid", + "HTTP bridge turn-state token is invalid or scoped to a different API key", + error_type="server_error", + ), + ) + + def _expired_http_bridge_turn_state(self) -> ProxyResponseError: + return ProxyResponseError( + 409, + openai_error( + "bridge_session_expired", + "HTTP bridge session continuity expired; drop x-codex-turn-state and start a new turn", + error_type="server_error", + ), + ) + + def _encode_http_bridge_turn_state( + self, + *, + session_id: str, + owner_instance_id: str, + api_key_id: str | None, + ) -> str: + payload = json.dumps( + { + "v": _HTTP_BRIDGE_TURN_STATE_VERSION, + "sid": session_id, + "own": owner_instance_id, + "api": _http_bridge_api_key_scope(api_key_id), + "iat": int(time.time()), + }, + separators=(",", ":"), + ) + return f"{_HTTP_BRIDGE_TURN_STATE_PREFIX}{self._encryptor.encrypt(payload).decode('ascii')}" + + def _decode_http_bridge_turn_state( + self, + turn_state: str | None, + *, + api_key_id: str | None, + ) -> "_HTTPBridgeTurnStateToken | None": + if not turn_state or not turn_state.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX): + return None + encrypted = turn_state.removeprefix(_HTTP_BRIDGE_TURN_STATE_PREFIX).strip() + if not encrypted: + raise self._invalid_http_bridge_turn_state() + try: + raw = self._encryptor.decrypt(encrypted.encode("ascii")) + payload = json.loads(raw) + except Exception as exc: + raise self._invalid_http_bridge_turn_state() from exc + version = payload.get("v") + session_id = payload.get("sid") + owner_instance_id = payload.get("own") + api_key_scope = payload.get("api") + issued_at = payload.get("iat") + if ( + version != _HTTP_BRIDGE_TURN_STATE_VERSION + or not isinstance(session_id, str) + or not session_id.strip() + or not isinstance(owner_instance_id, str) + or not owner_instance_id.strip() + or not isinstance(api_key_scope, str) + or not isinstance(issued_at, int) + ): + raise self._invalid_http_bridge_turn_state() + if api_key_scope != _http_bridge_api_key_scope(api_key_id): + raise self._invalid_http_bridge_turn_state() + return _HTTPBridgeTurnStateToken( + session_id=session_id, + owner_instance_id=owner_instance_id, + api_key_scope=api_key_scope, + issued_at=issued_at, + ) + + def _http_bridge_turn_state_matches_session( + self, + turn_state: str, + *, + session: "_HTTPBridgeSession", + api_key_id: str | None, + ) -> bool: + try: + token = self._decode_http_bridge_turn_state(turn_state, api_key_id=api_key_id) + except ProxyResponseError: + return False + if token is None: + return False + return token.session_id == session.bridge_session_id and token.owner_instance_id == session.owner_instance_id + + def _resolve_http_bridge_downstream_turn_state( + self, + session: "_HTTPBridgeSession", + *, + requested_turn_state: str | None, + api_key_id: str | None, + ) -> str: + if session.downstream_turn_state and self._http_bridge_turn_state_matches_session( + session.downstream_turn_state, + session=session, + api_key_id=api_key_id, + ): + return session.downstream_turn_state + if requested_turn_state and self._http_bridge_turn_state_matches_session( + requested_turn_state, + session=session, + api_key_id=api_key_id, + ): + return requested_turn_state + return self._encode_http_bridge_turn_state( + session_id=session.bridge_session_id, + owner_instance_id=session.owner_instance_id, + api_key_id=api_key_id, + ) + + async def _get_live_http_bridge_lease( + self, + session_id: str | None, + ) -> "_HTTPBridgeLeaseSnapshot | None": + if not session_id: + return None + async with self._repo_factory() as repos: + lease = await repos.http_bridge_leases.get_by_session_id(session_id) + if lease is None: + return None + if to_utc_naive(lease.lease_expires_at) < utcnow(): + await repos.http_bridge_leases.delete(session_id) + return None + return _HTTPBridgeLeaseSnapshot( + session_id=lease.session_id, + owner_instance_id=lease.owner_instance_id, + api_key_scope=lease.api_key_scope, + account_id=lease.account_id, + lease_expires_at=lease.lease_expires_at, + ) + + async def _delete_http_bridge_lease(self, session_id: str | None) -> None: + if not session_id: + return + async with self._repo_factory() as repos: + await repos.http_bridge_leases.delete(session_id) + + async def _persist_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: + async with self._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session.bridge_session_id, + affinity_kind=session.key.affinity_kind, + affinity_key=session.key.affinity_key, + api_key_scope=_http_bridge_api_key_scope(session.key.api_key_id), + owner_instance_id=session.owner_instance_id, + lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), + account_id=session.account.id, + request_model=session.request_model, + codex_session=session.codex_session, + idle_ttl_seconds=session.idle_ttl_seconds, + upstream_turn_state=session.upstream_turn_state, + downstream_turn_state=session.downstream_turn_state, + ) + + async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: + async with self._repo_factory() as repos: + touched = await repos.http_bridge_leases.touch( + session.bridge_session_id, + lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), + account_id=session.account.id, + request_model=session.request_model, + codex_session=session.codex_session, + idle_ttl_seconds=session.idle_ttl_seconds, + upstream_turn_state=session.upstream_turn_state, + downstream_turn_state=session.downstream_turn_state, + ) + if not touched: + await repos.http_bridge_leases.upsert( + session_id=session.bridge_session_id, + affinity_kind=session.key.affinity_kind, + affinity_key=session.key.affinity_key, + api_key_scope=_http_bridge_api_key_scope(session.key.api_key_id), + owner_instance_id=session.owner_instance_id, + lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), + account_id=session.account.id, + request_model=session.request_model, + codex_session=session.codex_session, + idle_ttl_seconds=session.idle_ttl_seconds, + upstream_turn_state=session.upstream_turn_state, + downstream_turn_state=session.downstream_turn_state, + ) + + def _has_http_bridge_turn_state_alias_conflict(self, turn_state: str, *, api_key_id: str | None) -> bool: + requested_alias_key = _http_bridge_turn_state_alias_key(turn_state, api_key_id) + for alias_key in self._http_bridge_turn_state_index: + if alias_key[0] != turn_state: + continue + if alias_key != requested_alias_key: + return True + return False + async def _get_or_create_http_bridge_session( self, key: "_HTTPBridgeSessionKey", @@ -1428,6 +1646,7 @@ async def _get_or_create_http_bridge_session( ) -> "_HTTPBridgeSession": settings = get_settings() api_key_id = api_key.id if api_key is not None else None + current_instance, ring = _normalized_http_bridge_instance_ring(settings) effective_idle_ttl_seconds = _effective_http_bridge_idle_ttl_seconds( affinity=affinity, idle_ttl_seconds=idle_ttl_seconds, @@ -1438,69 +1657,91 @@ async def _get_or_create_http_bridge_session( ), ) incoming_turn_state = _sticky_key_from_turn_state_header(headers) + turn_state_token = self._decode_http_bridge_turn_state(incoming_turn_state, api_key_id=api_key_id) + is_bridge_turn_state_replay = bool(incoming_turn_state and incoming_turn_state.startswith("http_turn_")) + active_turn_state_lease = await self._get_live_http_bridge_lease( + turn_state_token.session_id if turn_state_token is not None else None + ) + if active_turn_state_lease is not None and active_turn_state_lease.owner_instance_id != current_instance: + _log_http_bridge_event( + "owner_mismatch", + key, + account_id=active_turn_state_lease.account_id, + model=request_model, + detail=( + f"lease_session_id={active_turn_state_lease.session_id}, " + f"expected_instance={active_turn_state_lease.owner_instance_id}, " + f"current_instance={current_instance}" + ), + ) + raise ProxyResponseError( + 409, + openai_error( + "bridge_wrong_instance", + ( + "HTTP responses session bridge turn-state is owned by another live instance " + f"(expected {active_turn_state_lease.owner_instance_id}, got {current_instance})" + ), + error_type="server_error", + ), + ) + created_session_id = self._new_http_bridge_session_id() + if turn_state_token is not None and turn_state_token.owner_instance_id == current_instance: + created_session_id = turn_state_token.session_id while True: sessions_to_close: list[_HTTPBridgeSession] = [] inflight_future: asyncio.Future[_HTTPBridgeSession] | None = None capacity_wait_future: asyncio.Future[_HTTPBridgeSession] | None = None owns_creation = False continuity_error: ProxyResponseError | None = None + delete_stale_turn_state_lease = False async with self._http_bridge_lock: if incoming_turn_state is not None: alias_index_key = _http_bridge_turn_state_alias_key(incoming_turn_state, api_key_id) alias_key = self._http_bridge_turn_state_index.get(alias_index_key) - if alias_key is not None: + alias_session = self._http_bridge_sessions.get(alias_key) if alias_key is not None else None + if alias_session is None and turn_state_token is not None: + for candidate_key, candidate_session in self._http_bridge_sessions.items(): + if candidate_session.bridge_session_id != turn_state_token.session_id: + continue + if candidate_session.closed or candidate_session.account.status != AccountStatus.ACTIVE: + continue + alias_key = candidate_key + alias_session = candidate_session + self._http_bridge_turn_state_index[alias_index_key] = candidate_key + break + if alias_session is not None: key = alias_key - alias_session = self._http_bridge_sessions.get(alias_key) - if ( - alias_session is None - or alias_session.closed - or alias_session.account.status != AccountStatus.ACTIVE - ): - self._http_bridge_turn_state_index.pop(alias_index_key, None) - key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) - else: - self._promote_http_bridge_session_to_codex_affinity( - alias_session, - turn_state=incoming_turn_state, - settings=settings, - ) - for alias in alias_session.downstream_turn_state_aliases: - self._http_bridge_turn_state_index[ - _http_bridge_turn_state_alias_key(alias, alias_session.key.api_key_id) - ] = alias_session.key - key = alias_session.key + self._promote_http_bridge_session_to_codex_affinity( + alias_session, + turn_state=incoming_turn_state, + settings=settings, + ) + for alias in alias_session.downstream_turn_state_aliases: + self._http_bridge_turn_state_index[ + _http_bridge_turn_state_alias_key(alias, alias_session.key.api_key_id) + ] = alias_session.key + key = alias_session.key + elif turn_state_token is not None: + delete_stale_turn_state_lease = True + key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) + elif incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( + incoming_turn_state, + api_key_id=api_key_id, + ): + raise self._invalid_http_bridge_turn_state() elif incoming_turn_state.startswith("http_turn_"): key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) - if self._http_bridge_inflight_sessions.get(key) is not None: - pass - elif previous_response_id is not None: - raise ProxyResponseError( - 400, - _http_bridge_previous_response_error_envelope( - previous_response_id, - ( - "HTTP bridge continuity was lost. Replay x-codex-turn-state " - "or retry with a stable prompt_cache_key." - ), - ), - ) - else: - raise ProxyResponseError( - 409, - openai_error( - "bridge_instance_mismatch", - "HTTP bridge turn-state reached an instance that does not own the live session", - error_type="server_error", - ), - ) await self._prune_http_bridge_sessions_locked() - owner_instance = _http_bridge_owner_instance(key, settings) current_instance, ring = _normalized_http_bridge_instance_ring(settings) + owner_instance = _http_bridge_owner_instance(key, settings) if ( - key.affinity_kind != "request" + not is_bridge_turn_state_replay + and active_turn_state_lease is None + and key.affinity_kind != "request" and owner_instance is not None and len(ring) > 1 and owner_instance != current_instance @@ -1515,7 +1756,7 @@ async def _get_or_create_http_bridge_session( raise ProxyResponseError( 409, openai_error( - "bridge_instance_mismatch", + "bridge_wrong_instance", ( "HTTP responses session bridge request reached the wrong instance " f"(expected {owner_instance}, got {current_instance})" @@ -1526,6 +1767,18 @@ async def _get_or_create_http_bridge_session( existing = self._http_bridge_sessions.get(key) if existing is not None and not existing.closed and existing.account.status == AccountStatus.ACTIVE: + if ( + incoming_turn_state is not None + and self._http_bridge_turn_state_index.get( + _http_bridge_turn_state_alias_key(incoming_turn_state, api_key_id) + ) + == key + ): + self._promote_http_bridge_session_to_codex_affinity( + existing, + turn_state=incoming_turn_state, + settings=settings, + ) existing.request_model = request_model existing.last_used_at = time.monotonic() _log_http_bridge_event( @@ -1546,8 +1799,9 @@ async def _get_or_create_http_bridge_session( ) self._http_bridge_sessions.pop(key, None) sessions_to_close.append(existing) + if turn_state_token is not None: + delete_stale_turn_state_lease = True - inflight_future = self._http_bridge_inflight_sessions.get(key) if previous_response_id is not None: continuity_error = ProxyResponseError( 400, @@ -1560,6 +1814,12 @@ async def _get_or_create_http_bridge_session( ), ) else: + if delete_stale_turn_state_lease and turn_state_token is not None: + await self._delete_http_bridge_lease(turn_state_token.session_id) + if is_bridge_turn_state_replay: + raise self._expired_http_bridge_turn_state() + + inflight_future = self._http_bridge_inflight_sessions.get(key) if inflight_future is None: while ( len(self._http_bridge_sessions) + len(self._http_bridge_inflight_sessions) >= max_sessions @@ -1652,6 +1912,8 @@ async def _get_or_create_http_bridge_session( affinity=affinity, request_model=request_model, idle_ttl_seconds=effective_idle_ttl_seconds, + bridge_session_id=created_session_id, + owner_instance_id=current_instance, ) async with self._http_bridge_lock: current_future = self._http_bridge_inflight_sessions.get(key) @@ -1728,6 +1990,7 @@ async def _close_http_bridge_session( await session.upstream.close() except Exception: logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) + await self._delete_http_bridge_lease(session.bridge_session_id) _log_http_bridge_event( "close", session.key, @@ -1740,12 +2003,12 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", if session.closed: return session.downstream_turn_state_aliases.add(turn_state) - if session.downstream_turn_state is None: - session.downstream_turn_state = turn_state + session.downstream_turn_state = turn_state for alias in session.downstream_turn_state_aliases: self._http_bridge_turn_state_index[_http_bridge_turn_state_alias_key(alias, session.key.api_key_id)] = ( session.key ) + await self._touch_http_bridge_lease(session) async def _unregister_http_bridge_turn_states(self, session: "_HTTPBridgeSession") -> None: async with self._http_bridge_lock: @@ -1785,6 +2048,8 @@ async def _create_http_bridge_session( affinity: _AffinityPolicy, request_model: str | None, idle_ttl_seconds: float, + bridge_session_id: str, + owner_instance_id: str, ) -> "_HTTPBridgeSession": request_state = _WebSocketRequestState( request_id=f"http_bridge_connect_{uuid4().hex}", @@ -1855,11 +2120,14 @@ async def _create_http_bridge_session( queued_request_count=0, last_used_at=time.monotonic(), idle_ttl_seconds=idle_ttl_seconds, + bridge_session_id=bridge_session_id, + owner_instance_id=owner_instance_id, codex_session=affinity.kind == StickySessionKind.CODEX_SESSION, prewarm_lock=anyio.Lock(), upstream_turn_state=_upstream_turn_state_from_socket(upstream), downstream_turn_state=None, ) + await self._persist_http_bridge_lease(session) session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) return session @@ -2154,6 +2422,7 @@ async def _relay_http_bridge_upstream_messages( break finally: session.closed = True + await self._delete_http_bridge_lease(session.bridge_session_id) async def _retry_http_bridge_request_on_fresh_upstream( self, @@ -2297,6 +2566,7 @@ async def _reconnect_http_bridge_session( session.upstream_control = _WebSocketUpstreamControl() session.closed = False session.upstream_turn_state = _upstream_turn_state_from_socket(upstream) or session.upstream_turn_state + await self._touch_http_bridge_lease(session) if restart_reader: session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) _log_http_bridge_event( @@ -4278,6 +4548,23 @@ class _HTTPBridgeSessionKey: api_key_id: str | None +@dataclass(frozen=True, slots=True) +class _HTTPBridgeTurnStateToken: + session_id: str + owner_instance_id: str + api_key_scope: str + issued_at: int + + +@dataclass(frozen=True, slots=True) +class _HTTPBridgeLeaseSnapshot: + session_id: str + owner_instance_id: str + api_key_scope: str + account_id: str | None + lease_expires_at: datetime + + @dataclass(slots=True) class _HTTPBridgeSession: key: _HTTPBridgeSessionKey @@ -4293,6 +4580,8 @@ class _HTTPBridgeSession: queued_request_count: int last_used_at: float idle_ttl_seconds: float + bridge_session_id: str + owner_instance_id: str codex_session: bool = False prewarmed: bool = False prewarm_lock: anyio.Lock | None = None @@ -4834,6 +5123,10 @@ def ensure_http_downstream_turn_state(headers: Mapping[str, str]) -> str: return f"http_turn_{uuid4().hex}" +def requested_http_downstream_turn_state(headers: Mapping[str, str]) -> str | None: + return _sticky_key_from_turn_state_header(headers) + + def build_downstream_turn_state_accept_headers(turn_state: str) -> list[tuple[bytes, bytes]]: return [(b"x-codex-turn-state", turn_state.encode("utf-8"))] @@ -4897,6 +5190,14 @@ def _http_bridge_turn_state_alias_key(turn_state: str, api_key_id: str | None) - return (turn_state, api_key_id) +def _http_bridge_api_key_scope(api_key_id: str | None) -> str: + return api_key_id or "" + + +def _http_bridge_lease_expires_at(idle_ttl_seconds: float) -> datetime: + return utcnow() + timedelta(seconds=max(30.0, idle_ttl_seconds + 30.0)) + + def _resolve_prompt_cache_key( payload: ResponsesRequest | ResponsesCompactRequest, *, diff --git a/openspec/changes/durable-http-bridge-ownership/.openspec.yaml b/openspec/changes/durable-http-bridge-ownership/.openspec.yaml new file mode 100644 index 00000000..5376059c --- /dev/null +++ b/openspec/changes/durable-http-bridge-ownership/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-21 diff --git a/openspec/changes/durable-http-bridge-ownership/proposal.md b/openspec/changes/durable-http-bridge-ownership/proposal.md new file mode 100644 index 00000000..fff4c8dd --- /dev/null +++ b/openspec/changes/durable-http-bridge-ownership/proposal.md @@ -0,0 +1,17 @@ +## Why + +HTTP bridge turn-state continuity currently depends on an in-memory alias map. When a process restarts, evicts a session, or receives a replayed request on another replica, the proxy can only tell that the local alias is missing. It currently reports that as `bridge_instance_mismatch`, which conflates stale local state, expired bridge sessions, invalid turn-state tokens, and true live-owner conflicts. + +## What Changes + +- Replace opaque local-only HTTP bridge turn-state aliases with signed, versioned turn-state tokens. +- Add a durable HTTP bridge lease registry so replicas can distinguish a live owner mismatch from expired or stale bridge state. +- Recover stale or expired bridge turn-state on requests that do not require prior-response continuity. +- Fail with specific bridge error codes for invalid tokens, expired continuity, and true wrong-instance conflicts. + +## Impact + +- Code: `app/modules/proxy/service.py`, `app/modules/proxy/api.py`, `app/modules/proxy/repo_bundle.py`, `app/dependencies.py`, `app/db/models.py` +- Data: new `http_bridge_leases` table and migration +- Tests: HTTP bridge integration coverage and proxy repository factory call sites +- Specs: `openspec/specs/responses-api-compat/spec.md`, `context.md`, and `ops.md` diff --git a/openspec/changes/durable-http-bridge-ownership/specs/responses-api-compat/spec.md b/openspec/changes/durable-http-bridge-ownership/specs/responses-api-compat/spec.md new file mode 100644 index 00000000..3ee58a22 --- /dev/null +++ b/openspec/changes/durable-http-bridge-ownership/specs/responses-api-compat/spec.md @@ -0,0 +1,26 @@ +## MODIFIED Requirements + +### Requirement: HTTP Responses routes preserve upstream websocket session continuity +HTTP `/v1/responses` and HTTP `/backend-api/codex/responses` MUST preserve upstream websocket session continuity within one live bridge session and MUST distinguish replayed turn-state failure modes cleanly. The service MUST issue signed, versioned `x-codex-turn-state` headers for HTTP bridge continuity, MUST track live bridge ownership durably enough to detect true wrong-instance conflicts across replicas, and MUST recover stale local bridge state by creating a fresh bridge session when the replayed request does not require `previous_response_id` continuity. + +#### Scenario: replayed turn-state with live owner on another instance fails closed +- **WHEN** a client replays a valid HTTP bridge turn-state token +- **AND** the durable bridge lease shows a different live owner instance +- **THEN** the service fails the request fast with `bridge_wrong_instance` +- **AND** it MUST NOT create a fresh local bridge session for that token on the wrong instance + +#### Scenario: replayed turn-state with expired bridge and no prior-response dependency recovers +- **WHEN** a client replays a valid HTTP bridge turn-state token +- **AND** no live bridge lease exists for that token +- **AND** the request does not include `previous_response_id` +- **THEN** the service creates a fresh bridge session instead of failing with `bridge_instance_mismatch` + +#### Scenario: replayed turn-state with expired bridge and prior-response dependency fails clearly +- **WHEN** a client replays a valid HTTP bridge turn-state token +- **AND** no live bridge lease exists for that token +- **AND** the request includes `previous_response_id` +- **THEN** the service fails the request with `bridge_session_expired` + +#### Scenario: malformed or forged turn-state token is rejected +- **WHEN** a client sends a replayed HTTP bridge turn-state token that cannot be validated +- **THEN** the service fails the request with `bridge_token_invalid` diff --git a/openspec/changes/durable-http-bridge-ownership/tasks.md b/openspec/changes/durable-http-bridge-ownership/tasks.md new file mode 100644 index 00000000..3bd10a80 --- /dev/null +++ b/openspec/changes/durable-http-bridge-ownership/tasks.md @@ -0,0 +1,18 @@ +## 1. Spec + +- [x] 1.1 Add Responses HTTP bridge requirements for signed turn-state tokens and durable live-owner tracking +- [x] 1.2 Update bridge context and ops notes for stale recovery and new error codes +- [x] 1.3 Validate OpenSpec changes + +## 2. Tests + +- [x] 2.1 Update HTTP bridge tests that currently expect missing local aliases to fail with `bridge_instance_mismatch` +- [x] 2.2 Add regression coverage for `bridge_wrong_instance`, `bridge_session_expired`, and `bridge_token_invalid` +- [x] 2.3 Add coverage for stale-session recovery without `previous_response_id` + +## 3. Implementation + +- [x] 3.1 Add durable HTTP bridge lease storage and repository wiring +- [x] 3.2 Issue signed HTTP bridge turn-state tokens and validate replayed tokens +- [x] 3.3 Recover stale bridge sessions when continuity is not required +- [x] 3.4 Preserve fail-closed behavior only for true live owner mismatches diff --git a/openspec/specs/responses-api-compat/context.md b/openspec/specs/responses-api-compat/context.md index 58ad7e3f..11508cd5 100644 --- a/openspec/specs/responses-api-compat/context.md +++ b/openspec/specs/responses-api-compat/context.md @@ -22,7 +22,8 @@ See `openspec/specs/responses-api-compat/spec.md` for normative requirements. - `previous_response_id` is forwarded when `conversation` is absent, but the `conversation + previous_response_id` conflict remains rejected. - HTTP `/v1/responses` and HTTP `/backend-api/codex/responses` now use a server-side upstream websocket session bridge by default so repeated compatible requests can keep upstream response/session continuity without forcing clients onto the public websocket route. - Codex-affinity HTTP bridge sessions can optionally use a conservative first-request prewarm (`generate=false`), but that behavior now stays behind an explicit flag so production defaults do not pay an extra upstream request unless operators opt in. -- When operators configure a multi-instance bridge ring, each stable bridge key now has a deterministic owner replica; non-owner replicas fail closed with `bridge_instance_mismatch` instead of silently creating fragmented continuity on the wrong host. Unstable per-request bridge keys remain local and are allowed on any replica because there is no continuity to preserve. +- When operators configure a multi-instance bridge ring, stable non-turn-state bridge keys still have a deterministic owner replica and non-owner replicas fail closed with `bridge_wrong_instance`. +- HTTP bridge turn-state continuity now uses signed turn-state tokens plus a durable live-owner lease. A replayed token can recover on stale or expired local state when `previous_response_id` continuity is not required, but true live-owner conflicts still fail closed with `bridge_wrong_instance`. - Codex-facing websocket routes now advertise `x-codex-turn-state` during websocket accept and honor client-provided turn-state on reconnect so routing can stay sticky at turn granularity even when the public websocket reconnects. - HTTP responses routes now also return `x-codex-turn-state` headers so clients that persist response headers can promote later HTTP requests from prompt-cache affinity to stronger Codex-session continuity. - `/v1/responses/compact` keeps a final-JSON contract and preserves the raw upstream `/codex/responses/compact` payload shape as the canonical next context window instead of rewriting it through buffered `/codex/responses` streaming. @@ -46,8 +47,9 @@ See `openspec/specs/responses-api-compat/spec.md` for normative requirements. - **Stream ends without terminal event:** Emit `response.failed` with `stream_incomplete`. - **Upstream error / no accounts:** Non-streaming responses return an OpenAI error envelope with 5xx status. - **Compact upstream transport/client failure:** Retry only inside `/codex/responses/compact` when the failure is safely retryable; otherwise return an explicit upstream error without surrogate fallback. -- **HTTP bridge session closes or expires:** The next compatible HTTP `/v1/responses` or `/backend-api/codex/responses` request recreates a fresh upstream websocket bridge session; continuity is guaranteed only within the lifetime of one active bridged session. -- **Multi-instance routing without bridge owner policy:** if operators do not configure a bridge ring or front-door affinity, continuity can still fragment across replicas; with a configured bridge ring, wrong-replica requests now fail closed instead of silently forking bridge state. +- **HTTP bridge session closes or expires:** The next compatible HTTP `/v1/responses` or `/backend-api/codex/responses` request recreates a fresh upstream websocket bridge session when continuity is optional. Requests that still depend on `previous_response_id` fail early with `bridge_session_expired`; continuity is guaranteed only within the lifetime of one active bridged session. +- **Replayed turn-state token is invalid or cross-scoped:** the proxy fails with `bridge_token_invalid`. +- **Multi-instance routing without bridge owner policy:** if operators do not configure a bridge ring or front-door affinity, continuity can still fragment across replicas; with a configured bridge ring and durable bridge leases, true wrong-replica requests fail closed instead of silently forking bridge state. - **Codex websocket reconnects:** Reconnect continuity now depends on the client replaying the accepted `x-codex-turn-state`; generated turn-state is emitted on accept for backend Codex routes and echoed back when the client already supplies one. - **Websocket handshake forbidden/not-found:** Auto transport now fails loud on `403` / `404` instead of silently hiding the websocket regression behind HTTP fallback. - **Invalid request payloads:** Return 4xx with `invalid_request_error`. diff --git a/openspec/specs/responses-api-compat/ops.md b/openspec/specs/responses-api-compat/ops.md index 87eb1eb2..60b382a9 100644 --- a/openspec/specs/responses-api-compat/ops.md +++ b/openspec/specs/responses-api-compat/ops.md @@ -284,13 +284,15 @@ If you deploy multiple replicas behind a load balancer, configure front-door aff Without front-door affinity, each replica will maintain its own in-memory bridge pool and HTTP continuity can fragment across instances. -If you cannot guarantee front-door affinity, configure the deterministic bridge instance ring so the proxy can fail closed with `bridge_instance_mismatch` rather than silently creating a second bridge on the wrong replica. +If you cannot guarantee front-door affinity, configure the deterministic bridge instance ring and keep the durable bridge-lease table healthy so the proxy can fail closed with `bridge_wrong_instance` on true live-owner conflicts rather than silently creating a second bridge on the wrong replica. ### Failure interpretation - `queue_full`: one bridge key is overloaded; increase bridge capacity carefully or reduce per-session concurrency upstream. - `capacity_exhausted_active_sessions`: the bridge pool hit `max_sessions` while every existing session still had pending work. The proxy intentionally refused the new request with `429` instead of evicting an active session. Mitigate by increasing pool size carefully, reducing concurrent bridge fan-out, or improving front-door affinity so related calls land on the same replica. -- `owner_mismatch` / `bridge_instance_mismatch`: deterministic replica ownership is enabled for a stable bridge key and the request landed on the wrong instance. Fix ingress affinity or route the stable bridge key to the logged owner instance. Requests that only have an unstable per-request bridge key are intentionally exempt from owner enforcement. +- `owner_mismatch` / `bridge_wrong_instance`: either deterministic replica ownership rejected a stable non-turn-state key on the wrong instance, or a replayed signed turn-state still has a live lease on another instance. Fix ingress affinity or route the bridge key to the logged owner instance. +- `bridge_session_expired`: the replayed turn-state no longer has a live bridge session and the request still required `previous_response_id` continuity. Drop the stale `x-codex-turn-state` and start a fresh turn. +- `bridge_token_invalid`: the replayed signed turn-state could not be validated or belonged to another API key scope. - `reconnect`: the bridge recreated an upstream websocket before response creation and retried once. - `terminal_error` with `previous_response_not_found`: continuity was already broken upstream; inspect replica affinity, bridge eviction timing, or upstream resets. - plain `transport = "http"` request logs are still expected for bridged HTTP requests; the internal upstream websocket does not change external transport accounting. diff --git a/openspec/specs/responses-api-compat/spec.md b/openspec/specs/responses-api-compat/spec.md index 8d4750de..13c25d72 100644 --- a/openspec/specs/responses-api-compat/spec.md +++ b/openspec/specs/responses-api-compat/spec.md @@ -254,13 +254,34 @@ When serving HTTP `/v1/responses` or HTTP `/backend-api/codex/responses`, the se - **THEN** the service sends one internal `response.create` prewarm with `generate=false` before the client-visible request - **AND** the client-visible response contract remains unchanged -#### Scenario: bridge enforces deterministic owner instance only for stable bridge keys +#### Scenario: bridge enforces deterministic owner instance only for stable non-turn-state keys - **WHEN** operators configure multiple eligible bridge instance ids -- **AND** a request uses a stable bridge key derived from turn-state, session header, or prompt-cache key +- **AND** a request uses a stable bridge key derived from a session header or prompt-cache key - **AND** that request lands on a non-owner instance -- **THEN** the service fails the request fast with `bridge_instance_mismatch` +- **THEN** the service fails the request fast with `bridge_wrong_instance` - **AND** it MUST NOT create a fresh local bridge session for that key on the wrong instance +#### Scenario: replayed signed turn-state fails only for a true live-owner mismatch +- **WHEN** a client replays a signed HTTP bridge `x-codex-turn-state` +- **AND** the durable bridge lease shows another live owner instance +- **THEN** the service fails the request fast with `bridge_wrong_instance` + +#### Scenario: replayed signed turn-state without a live lease recovers when continuity is optional +- **WHEN** a client replays a signed HTTP bridge `x-codex-turn-state` +- **AND** no live bridge lease exists for that turn-state +- **AND** the request does not include `previous_response_id` +- **THEN** the service creates a fresh bridge session and returns a fresh signed `x-codex-turn-state` + +#### Scenario: replayed turn-state without a live lease fails clearly when continuity is required +- **WHEN** a client replays an HTTP bridge `x-codex-turn-state` +- **AND** no live bridge lease exists for that turn-state +- **AND** the request includes `previous_response_id` +- **THEN** the service fails the request with `bridge_session_expired` + +#### Scenario: malformed or cross-scope signed turn-state is rejected +- **WHEN** a client sends a signed HTTP bridge `x-codex-turn-state` that cannot be validated or belongs to another API key scope +- **THEN** the service fails the request with `bridge_token_invalid` + ### Requirement: Websocket responses advertise and honor Codex turn-state affinity When serving websocket Responses endpoints, the service MUST advertise an `x-codex-turn-state` header during websocket accept. If the client reconnects and presents that same `x-codex-turn-state`, the service MUST treat it as the highest-priority Codex-affinity key for upstream routing on that websocket turn. On `/v1/responses`, a proxy-generated turn-state MUST NOT override the first request's prompt-cache routing unless the client explicitly sends the turn-state back. diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index b8f00635..cd7c1b77 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -12,11 +12,11 @@ import anyio import pytest import pytest_asyncio -from sqlalchemy import select +from sqlalchemy import delete, select import app.modules.proxy.service as proxy_module from app.core.utils.request_id import reset_request_id, set_request_id -from app.db.models import Account, AccountStatus +from app.db.models import Account, AccountStatus, HttpBridgeLease from app.db.session import SessionLocal from app.dependencies import get_proxy_service_for_app from app.modules.proxy.load_balancer import AccountSelection @@ -26,6 +26,9 @@ @pytest_asyncio.fixture(autouse=True) async def _cleanup_http_bridge_sessions(app_instance): + async with SessionLocal() as session: + await session.execute(delete(HttpBridgeLease)) + await session.commit() yield service = get_proxy_service_for_app(app_instance) async with service._http_bridge_lock: @@ -35,10 +38,14 @@ async def _cleanup_http_bridge_sessions(app_instance): service._http_bridge_inflight_sessions.clear() service._http_bridge_turn_state_index.clear() for session in sessions: + session.bridge_session_id = "" await service._close_http_bridge_session(session) for inflight_future in inflight_sessions: if not inflight_future.done(): inflight_future.cancel() + async with SessionLocal() as session: + await session.execute(delete(HttpBridgeLease)) + await session.commit() def _encode_jwt(payload: dict) -> str: @@ -848,7 +855,7 @@ async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_se exc = exc_info.value assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_instance_mismatch" + assert exc.payload["error"].get("code") == "bridge_wrong_instance" @pytest.mark.asyncio @@ -1097,7 +1104,7 @@ async def test_v1_responses_http_bridge_waits_for_inflight_recreation_on_missing @pytest.mark.asyncio -async def test_v1_responses_http_bridge_generated_turn_state_fails_closed_without_local_alias( +async def test_v1_responses_http_bridge_generated_turn_state_missing_local_alias_recovers_fresh_session( async_client, app_instance, monkeypatch, @@ -1115,6 +1122,7 @@ async def test_v1_responses_http_bridge_generated_turn_state_fails_closed_withou ) account = await _get_account(account_id) service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() async def fake_select_account_with_budget( self, @@ -1149,25 +1157,41 @@ async def fake_select_account_with_budget( ) return AccountSelection(account=account, error_message=None, error_code=None) + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) - with pytest.raises(proxy_module.ProxyResponseError) as exc_info: - await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_missing_alias", None), - headers={"x-codex-turn-state": "http_turn_missing_alias"}, - affinity=proxy_module._AffinityPolicy( - key="http_turn_missing_alias", - kind=proxy_module.StickySessionKind.CODEX_SESSION, - ), - api_key=None, - request_model="gpt-5.1", - idle_ttl_seconds=120.0, - max_sessions=128, - ) + session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_missing_alias", None), + headers={"x-codex-turn-state": "http_turn_missing_alias"}, + affinity=proxy_module._AffinityPolicy( + key="http_turn_missing_alias", + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) - exc = exc_info.value - assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_instance_mismatch" + assert session.key.affinity_kind == "turn_state_header" + assert session.key.affinity_key == "http_turn_missing_alias" + assert session.bridge_session_id.startswith("hbs_") @pytest.mark.asyncio @@ -1269,14 +1293,19 @@ async def fake_connect_responses_websocket( idle_ttl_seconds=120.0, max_sessions=128, ) - await service._register_http_bridge_turn_state(session, "http_turn_api_key_alias") + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session.bridge_session_id, + owner_instance_id=session.owner_instance_id, + api_key_id="api-key-a", + ) + await service._register_http_bridge_turn_state(session, signed_turn_state) with pytest.raises(proxy_module.ProxyResponseError) as exc_info: await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_api_key_alias", "api-key-b"), - headers={"x-codex-turn-state": "http_turn_api_key_alias"}, + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, "api-key-b"), + headers={"x-codex-turn-state": signed_turn_state}, affinity=proxy_module._AffinityPolicy( - key="http_turn_api_key_alias", + key=signed_turn_state, kind=proxy_module.StickySessionKind.CODEX_SESSION, ), api_key=cast(proxy_module.ApiKeyData, SimpleNamespace(id="api-key-b")), @@ -1288,7 +1317,7 @@ async def fake_connect_responses_websocket( assert isinstance(exc_info.value, proxy_module.ProxyResponseError) exc = exc_info.value assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_instance_mismatch" + assert exc.payload["error"].get("code") == "bridge_token_invalid" await service._close_http_bridge_session(session) @@ -3404,7 +3433,8 @@ async def fake_connect_responses_websocket( "error": { "message": ( f"Previous response with id '{first_body['id']}' not found. " - "HTTP bridge continuity was lost. Replay x-codex-turn-state or retry with a stable prompt_cache_key." + "HTTP bridge continuity was lost before upstream created the next response. " + "Replay x-codex-turn-state or retry with a stable prompt_cache_key." ), "type": "invalid_request_error", "code": "previous_response_not_found", diff --git a/tests/integration/test_load_balancer_integration.py b/tests/integration/test_load_balancer_integration.py index 19bb5c07..429e5034 100644 --- a/tests/integration/test_load_balancer_integration.py +++ b/tests/integration/test_load_balancer_integration.py @@ -12,6 +12,7 @@ from app.db.session import SessionLocal from app.modules.accounts.repository import AccountsRepository from app.modules.api_keys.repository import ApiKeysRepository +from app.modules.proxy.bridge_repository import HttpBridgeLeasesRepository from app.modules.proxy.load_balancer import LoadBalancer from app.modules.proxy.repo_bundle import ProxyRepositories from app.modules.proxy.sticky_repository import StickySessionsRepository @@ -29,6 +30,7 @@ async def _repo_factory() -> AsyncIterator[ProxyRepositories]: usage=UsageRepository(session), request_logs=RequestLogsRepository(session), sticky_sessions=StickySessionsRepository(session), + http_bridge_leases=HttpBridgeLeasesRepository(session), api_keys=ApiKeysRepository(session), additional_usage=AdditionalUsageRepository(session), ) diff --git a/tests/unit/test_proxy_load_balancer_refresh.py b/tests/unit/test_proxy_load_balancer_refresh.py index 1d208693..d182898b 100644 --- a/tests/unit/test_proxy_load_balancer_refresh.py +++ b/tests/unit/test_proxy_load_balancer_refresh.py @@ -260,6 +260,7 @@ async def _repo_factory( usage=usage_repo, request_logs=StubRequestLogsRepository(), sticky_sessions=sticky_repo, + http_bridge_leases=object(), # type: ignore[arg-type] api_keys=StubApiKeysRepository(), additional_usage=additional_usage_repo or StubAdditionalUsageRepository(), ) @@ -961,6 +962,7 @@ async def repo_factory() -> AsyncIterator[ProxyRepositories]: additional_usage=StubAdditionalUsageRepository(), request_logs=object(), # type: ignore[arg-type] sticky_sessions=sticky_repo, + http_bridge_leases=object(), # type: ignore[arg-type] api_keys=object(), # type: ignore[arg-type] ) diff --git a/tests/unit/test_proxy_service_additional_limits.py b/tests/unit/test_proxy_service_additional_limits.py index ac8efa84..a999afc9 100644 --- a/tests/unit/test_proxy_service_additional_limits.py +++ b/tests/unit/test_proxy_service_additional_limits.py @@ -156,6 +156,7 @@ async def repo_factory() -> Any: usage=object(), # type: ignore[arg-type] request_logs=object(), # type: ignore[arg-type] sticky_sessions=object(), # type: ignore[arg-type] + http_bridge_leases=object(), # type: ignore[arg-type] api_keys=object(), # type: ignore[arg-type] additional_usage=additional_usage, # type: ignore[arg-type] ) @@ -168,6 +169,7 @@ async def repo_factory() -> Any: usage=object(), # type: ignore[arg-type] request_logs=object(), # type: ignore[arg-type] sticky_sessions=object(), # type: ignore[arg-type] + http_bridge_leases=object(), # type: ignore[arg-type] api_keys=object(), # type: ignore[arg-type] additional_usage=additional_usage, # type: ignore[arg-type] ), From b30b937e901d3bef58e1a4ef949787be2ac6a9e6 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Sun, 22 Mar 2026 16:05:39 +0000 Subject: [PATCH 02/34] fix(review): address durable bridge regressions --- app/modules/proxy/api.py | 6 +- app/modules/proxy/service.py | 42 +++- .../integration/test_http_responses_bridge.py | 238 +++++++++++++++++- tests/unit/test_pricing.py | 2 + tests/unit/test_proxy_utils.py | 1 + 5 files changed, 274 insertions(+), 15 deletions(-) diff --git a/app/modules/proxy/api.py b/app/modules/proxy/api.py index 69817341..5a059e96 100644 --- a/app/modules/proxy/api.py +++ b/app/modules/proxy/api.py @@ -479,7 +479,7 @@ async def _stream_responses( return StreamingResponse( _prepend_first(None, stream), media_type="text/event-stream", - headers={"Cache-Control": "no-cache", **rate_limit_headers}, + headers={"Cache-Control": "no-cache", **turn_state_headers, **rate_limit_headers}, ) except ProxyResponseError as exc: await _release_reservation(reservation) @@ -487,7 +487,7 @@ async def _stream_responses( request, exc.status_code, exc.payload, - headers=rate_limit_headers, + headers={**turn_state_headers, **rate_limit_headers}, ) return StreamingResponse( _prepend_first(first, stream), @@ -555,7 +555,7 @@ async def _collect_responses( request, exc.status_code, error.model_dump(mode="json", exclude_none=True), - headers=rate_limit_headers, + headers={**turn_state_headers, **rate_limit_headers}, ) if isinstance(response_payload, OpenAIResponsePayload): if response_payload.status == "failed": diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index e42cf0bd..19276d37 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1802,7 +1802,12 @@ async def _get_or_create_http_bridge_session( if turn_state_token is not None: delete_stale_turn_state_lease = True - if previous_response_id is not None: + if delete_stale_turn_state_lease and turn_state_token is not None: + await self._delete_http_bridge_lease(turn_state_token.session_id) + if previous_response_id is not None: + continuity_error = self._expired_http_bridge_turn_state() + + if continuity_error is None and previous_response_id is not None: continuity_error = ProxyResponseError( 400, _http_bridge_previous_response_error_envelope( @@ -1813,12 +1818,8 @@ async def _get_or_create_http_bridge_session( ), ), ) - else: - if delete_stale_turn_state_lease and turn_state_token is not None: - await self._delete_http_bridge_lease(turn_state_token.session_id) - if is_bridge_turn_state_replay: - raise self._expired_http_bridge_turn_state() + if continuity_error is None: inflight_future = self._http_bridge_inflight_sessions.get(key) if inflight_future is None: while ( @@ -1908,7 +1909,11 @@ async def _get_or_create_http_bridge_session( try: session = await self._create_http_bridge_session( key, - headers=headers, + headers=( + _headers_without_local_http_bridge_turn_state(headers) + if is_bridge_turn_state_replay or turn_state_token is not None + else headers + ), affinity=affinity, request_model=request_model, idle_ttl_seconds=effective_idle_ttl_seconds, @@ -2008,7 +2013,14 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", self._http_bridge_turn_state_index[_http_bridge_turn_state_alias_key(alias, session.key.api_key_id)] = ( session.key ) - await self._touch_http_bridge_lease(session) + try: + await self._touch_http_bridge_lease(session) + except Exception: + logger.warning( + "Failed to persist HTTP bridge lease after turn-state registration session_id=%s", + session.bridge_session_id, + exc_info=True, + ) async def _unregister_http_bridge_turn_states(self, session: "_HTTPBridgeSession") -> None: async with self._http_bridge_lock: @@ -2038,7 +2050,6 @@ def _promote_http_bridge_session_to_codex_affinity( session.idle_ttl_seconds, float(getattr(settings, "http_responses_session_bridge_codex_idle_ttl_seconds", 900.0)), ) - session.headers = _headers_with_turn_state(session.headers, turn_state) async def _create_http_bridge_session( self, @@ -5175,6 +5186,19 @@ def _headers_with_turn_state(headers: Mapping[str, str], turn_state: str | None) return forwarded +def _headers_without_local_http_bridge_turn_state(headers: Mapping[str, str]) -> dict[str, str]: + forwarded = dict(headers) + for key, value in list(forwarded.items()): + if key.lower() != "x-codex-turn-state": + continue + if isinstance(value, str): + stripped = value.strip() + if stripped.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX) or stripped.startswith("http_turn_"): + forwarded.pop(key, None) + break + return forwarded + + def _preferred_http_bridge_reconnect_turn_state(session: "_HTTPBridgeSession") -> str | None: if ( session.codex_session diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index cd7c1b77..e36d31a0 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -33,14 +33,15 @@ async def _cleanup_http_bridge_sessions(app_instance): service = get_proxy_service_for_app(app_instance) async with service._http_bridge_lock: sessions = list(service._http_bridge_sessions.values()) - inflight_sessions = list(service._http_bridge_inflight_sessions.values()) service._http_bridge_sessions.clear() - service._http_bridge_inflight_sessions.clear() + inflight_futures = list(getattr(service, "_http_bridge_inflight_sessions", {}).values()) + if hasattr(service, "_http_bridge_inflight_sessions"): + service._http_bridge_inflight_sessions.clear() service._http_bridge_turn_state_index.clear() for session in sessions: session.bridge_session_id = "" await service._close_http_bridge_session(session) - for inflight_future in inflight_sessions: + for inflight_future in inflight_futures: if not inflight_future.done(): inflight_future.cancel() async with SessionLocal() as session: @@ -1194,6 +1195,125 @@ async def fake_connect_responses_websocket( assert session.bridge_session_id.startswith("hbs_") +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_missing_local_alias_recovers_fresh_session( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_missing_signed_alias", + "http-bridge-missing-signed-alias@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + connect_headers_seen: list[dict[str, str]] = [] + session_id = "hbs_signed_missing_alias" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del access_token, account_id_header, base_url, session + connect_headers_seen.append(dict(headers)) + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert session.key.affinity_kind == "turn_state_header" + assert session.bridge_session_id == session_id + assert connect_headers_seen[-1].get("x-codex-turn-state") is None + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_turn_state_alias_respects_api_key_isolation( async_client, @@ -3139,6 +3259,118 @@ async def fail_refresh(self, target, *, force=False, timeout_seconds): assert "x-codex-turn-state" not in response.headers +@pytest.mark.asyncio +async def test_backend_responses_http_bridge_early_error_preserves_turn_state_header(async_client, monkeypatch): + _install_bridge_settings(monkeypatch, enabled=True) + + async def fake_stream_http_responses( + self, + payload, + headers, + *, + response_headers_out=None, + **kwargs, + ): + del self, payload, headers, kwargs + assert response_headers_out is not None + response_headers_out["x-codex-turn-state"] = "http_turn_test_backend_error" + raise proxy_module.ProxyResponseError( + 502, + {"error": {"message": "upstream unavailable", "type": "server_error", "code": "upstream_unavailable"}}, + ) + yield "" + + monkeypatch.setattr(proxy_module.ProxyService, "stream_http_responses", fake_stream_http_responses) + + response = await async_client.post( + "/backend-api/codex/responses", + json={ + "model": "gpt-5.1", + "instructions": "Return exactly OK.", + "input": "hello", + "stream": True, + }, + ) + + assert response.status_code == 502 + assert response.json()["error"]["code"] == "upstream_unavailable" + assert response.headers["x-codex-turn-state"] == "http_turn_test_backend_error" + + +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_early_error_preserves_turn_state_header(async_client, monkeypatch): + _install_bridge_settings(monkeypatch, enabled=True) + + async def fake_stream_http_responses( + self, + payload, + headers, + *, + response_headers_out=None, + **kwargs, + ): + del self, payload, headers, kwargs + assert response_headers_out is not None + response_headers_out["x-codex-turn-state"] = "http_turn_test_v1_error" + raise proxy_module.ProxyResponseError( + 502, + {"error": {"message": "upstream unavailable", "type": "server_error", "code": "upstream_unavailable"}}, + ) + yield "" + + monkeypatch.setattr(proxy_module.ProxyService, "stream_http_responses", fake_stream_http_responses) + + response = await async_client.post( + "/v1/responses", + json={ + "model": "gpt-5.1", + "instructions": "Return exactly OK.", + "input": "hello", + }, + ) + + assert response.status_code == 502 + assert response.json()["error"]["code"] == "upstream_unavailable" + assert response.headers["x-codex-turn-state"] == "http_turn_test_v1_error" + + +@pytest.mark.asyncio +async def test_backend_responses_http_bridge_empty_stream_preserves_turn_state_header(async_client, monkeypatch): + _install_bridge_settings(monkeypatch, enabled=True) + + async def fake_stream_http_responses( + self, + payload, + headers, + *, + response_headers_out=None, + **kwargs, + ): + del self, payload, headers, kwargs + assert response_headers_out is not None + response_headers_out["x-codex-turn-state"] = "http_turn_test_backend_empty" + if False: + yield "" + + monkeypatch.setattr(proxy_module.ProxyService, "stream_http_responses", fake_stream_http_responses) + + async with async_client.stream( + "POST", + "/backend-api/codex/responses", + json={ + "model": "gpt-5.1", + "instructions": "Return exactly OK.", + "input": "hello", + "stream": True, + }, + ) as response: + assert response.status_code == 200 + lines = [line async for line in response.aiter_lines()] + + assert lines == [] + assert response.headers["x-codex-turn-state"] == "http_turn_test_backend_empty" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_does_not_register_turn_state_alias_before_request_admission( async_client, diff --git a/tests/unit/test_pricing.py b/tests/unit/test_pricing.py index 2871060f..8816ee79 100644 --- a/tests/unit/test_pricing.py +++ b/tests/unit/test_pricing.py @@ -143,11 +143,13 @@ def test_calculate_cost_from_usage_legacy_gpt_5_service_tiers() -> None: usage = UsageTokens(input_tokens=1_000_000.0, output_tokens=1_000_000.0) gpt_5_priority = calculate_cost_from_usage(usage, DEFAULT_PRICING_MODELS["gpt-5"], service_tier="priority") + gpt_5_1_priority = calculate_cost_from_usage(usage, DEFAULT_PRICING_MODELS["gpt-5.1"], service_tier="priority") gpt_5_1_flex = calculate_cost_from_usage(usage, DEFAULT_PRICING_MODELS["gpt-5.1"], service_tier="flex") gpt_5_2_priority = calculate_cost_from_usage(usage, DEFAULT_PRICING_MODELS["gpt-5.2"], service_tier="priority") gpt_5_2_flex = calculate_cost_from_usage(usage, DEFAULT_PRICING_MODELS["gpt-5.2"], service_tier="flex") assert gpt_5_priority == pytest.approx(22.5) + assert gpt_5_1_priority == pytest.approx(22.5) assert gpt_5_1_flex == pytest.approx(5.625) assert gpt_5_2_priority == pytest.approx(31.5) assert gpt_5_2_flex == pytest.approx(7.875) diff --git a/tests/unit/test_proxy_utils.py b/tests/unit/test_proxy_utils.py index e7c214a2..bad7be48 100644 --- a/tests/unit/test_proxy_utils.py +++ b/tests/unit/test_proxy_utils.py @@ -148,6 +148,7 @@ def test_has_native_codex_transport_headers_requires_allowlisted_originator(): assert proxy_module._has_native_codex_transport_headers({"originator": "codex_chatgpt_desktop"}) is True assert proxy_module._has_native_codex_transport_headers({"originator": "Codex Chat"}) is False assert proxy_module._has_native_codex_transport_headers({"originator": "Codex QA"}) is False + assert proxy_module._has_native_codex_transport_headers({"originator": "Codex Foo"}) is False assert proxy_module._has_native_codex_transport_headers({"originator": "other-client"}) is False From bf5ed8ddea5391baa14d020cfdbfb46f6e2d97c4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Sun, 22 Mar 2026 18:54:21 +0000 Subject: [PATCH 03/34] fix(proxy): harden durable http bridge leases --- app/modules/proxy/service.py | 98 ++++++++++++------- .../integration/test_http_responses_bridge.py | 2 + 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 19276d37..b737312a 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1566,6 +1566,8 @@ async def _get_live_http_bridge_lease( return None return _HTTPBridgeLeaseSnapshot( session_id=lease.session_id, + affinity_kind=lease.affinity_kind, + affinity_key=lease.affinity_key, owner_instance_id=lease.owner_instance_id, api_key_scope=lease.api_key_scope, account_id=lease.account_id, @@ -1596,24 +1598,12 @@ async def _persist_http_bridge_lease(self, session: "_HTTPBridgeSession") -> Non ) async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: - async with self._repo_factory() as repos: - touched = await repos.http_bridge_leases.touch( - session.bridge_session_id, - lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), - account_id=session.account.id, - request_model=session.request_model, - codex_session=session.codex_session, - idle_ttl_seconds=session.idle_ttl_seconds, - upstream_turn_state=session.upstream_turn_state, - downstream_turn_state=session.downstream_turn_state, - ) - if not touched: - await repos.http_bridge_leases.upsert( - session_id=session.bridge_session_id, - affinity_kind=session.key.affinity_kind, - affinity_key=session.key.affinity_key, - api_key_scope=_http_bridge_api_key_scope(session.key.api_key_id), - owner_instance_id=session.owner_instance_id, + async with session.lease_lock: + if session.closed: + return + async with self._repo_factory() as repos: + touched = await repos.http_bridge_leases.touch( + session.bridge_session_id, lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), account_id=session.account.id, request_model=session.request_model, @@ -1622,6 +1612,23 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: upstream_turn_state=session.upstream_turn_state, downstream_turn_state=session.downstream_turn_state, ) + if not touched: + if session.closed: + return + await repos.http_bridge_leases.upsert( + session_id=session.bridge_session_id, + affinity_kind=session.key.affinity_kind, + affinity_key=session.key.affinity_key, + api_key_scope=_http_bridge_api_key_scope(session.key.api_key_id), + owner_instance_id=session.owner_instance_id, + lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), + account_id=session.account.id, + request_model=session.request_model, + codex_session=session.codex_session, + idle_ttl_seconds=session.idle_ttl_seconds, + upstream_turn_state=session.upstream_turn_state, + downstream_turn_state=session.downstream_turn_state, + ) def _has_http_bridge_turn_state_alias_conflict(self, turn_state: str, *, api_key_id: str | None) -> bool: requested_alias_key = _http_bridge_turn_state_alias_key(turn_state, api_key_id) @@ -1725,7 +1732,14 @@ async def _get_or_create_http_bridge_session( key = alias_session.key elif turn_state_token is not None: delete_stale_turn_state_lease = True - key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) + if active_turn_state_lease is not None: + key = _HTTPBridgeSessionKey( + active_turn_state_lease.affinity_kind, + active_turn_state_lease.affinity_key, + api_key_id, + ) + else: + key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) elif incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( incoming_turn_state, api_key_id=api_key_id, @@ -1980,22 +1994,23 @@ async def _close_http_bridge_session( *, turn_state_lock_held: bool = False, ) -> None: - session.closed = True - if turn_state_lock_held: - self._unregister_http_bridge_turn_states_locked(session) - else: - await self._unregister_http_bridge_turn_states(session) - if session.upstream_reader is not None: - session.upstream_reader.cancel() + async with session.lease_lock: + session.closed = True + if turn_state_lock_held: + self._unregister_http_bridge_turn_states_locked(session) + else: + await self._unregister_http_bridge_turn_states(session) + if session.upstream_reader is not None: + session.upstream_reader.cancel() + try: + await session.upstream_reader + except asyncio.CancelledError: + pass try: - await session.upstream_reader - except asyncio.CancelledError: - pass - try: - await session.upstream.close() - except Exception: - logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) - await self._delete_http_bridge_lease(session.bridge_session_id) + await session.upstream.close() + except Exception: + logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) + await self._delete_http_bridge_lease(session.bridge_session_id) _log_http_bridge_event( "close", session.key, @@ -2127,6 +2142,7 @@ async def _create_http_bridge_session( upstream_control=_WebSocketUpstreamControl(), pending_requests=deque(), pending_lock=anyio.Lock(), + lease_lock=anyio.Lock(), response_create_gate=asyncio.Semaphore(1), queued_request_count=0, last_used_at=time.monotonic(), @@ -2577,7 +2593,14 @@ async def _reconnect_http_bridge_session( session.upstream_control = _WebSocketUpstreamControl() session.closed = False session.upstream_turn_state = _upstream_turn_state_from_socket(upstream) or session.upstream_turn_state - await self._touch_http_bridge_lease(session) + try: + await self._touch_http_bridge_lease(session) + except Exception: + logger.warning( + "Failed to persist HTTP bridge lease after reconnect session_id=%s", + session.bridge_session_id, + exc_info=True, + ) if restart_reader: session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) _log_http_bridge_event( @@ -4570,6 +4593,8 @@ class _HTTPBridgeTurnStateToken: @dataclass(frozen=True, slots=True) class _HTTPBridgeLeaseSnapshot: session_id: str + affinity_kind: str + affinity_key: str owner_instance_id: str api_key_scope: str account_id: str | None @@ -4587,6 +4612,7 @@ class _HTTPBridgeSession: upstream_control: _WebSocketUpstreamControl pending_requests: deque[_WebSocketRequestState] pending_lock: anyio.Lock + lease_lock: anyio.Lock response_create_gate: asyncio.Semaphore queued_request_count: int last_used_at: float @@ -5219,7 +5245,7 @@ def _http_bridge_api_key_scope(api_key_id: str | None) -> str: def _http_bridge_lease_expires_at(idle_ttl_seconds: float) -> datetime: - return utcnow() + timedelta(seconds=max(30.0, idle_ttl_seconds + 30.0)) + return utcnow() + timedelta(seconds=max(0.0, idle_ttl_seconds)) def _resolve_prompt_cache_key( diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index e36d31a0..d4b33828 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -6,6 +6,7 @@ import time from collections import deque from collections.abc import AsyncGenerator +from datetime import timedelta from types import SimpleNamespace from typing import cast @@ -19,6 +20,7 @@ from app.db.models import Account, AccountStatus, HttpBridgeLease from app.db.session import SessionLocal from app.dependencies import get_proxy_service_for_app +from app.modules.proxy.bridge_repository import HttpBridgeLeasesRepository from app.modules.proxy.load_balancer import AccountSelection pytestmark = pytest.mark.integration From 2245e2dc939ac2e443e98864a9834c1a37216d2f Mon Sep 17 00:00:00 2001 From: aaiyer Date: Sun, 22 Mar 2026 19:08:24 +0000 Subject: [PATCH 04/34] fix(test): remove stale bridge inflight cleanup --- tests/integration/test_http_responses_bridge.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index d4b33828..c78700dd 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -36,16 +36,10 @@ async def _cleanup_http_bridge_sessions(app_instance): async with service._http_bridge_lock: sessions = list(service._http_bridge_sessions.values()) service._http_bridge_sessions.clear() - inflight_futures = list(getattr(service, "_http_bridge_inflight_sessions", {}).values()) - if hasattr(service, "_http_bridge_inflight_sessions"): - service._http_bridge_inflight_sessions.clear() service._http_bridge_turn_state_index.clear() for session in sessions: session.bridge_session_id = "" await service._close_http_bridge_session(session) - for inflight_future in inflight_futures: - if not inflight_future.done(): - inflight_future.cancel() async with SessionLocal() as session: await session.execute(delete(HttpBridgeLease)) await session.commit() From 58d1c92d2062729aed10a682db968a3abbf059c1 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Sun, 22 Mar 2026 19:46:23 +0000 Subject: [PATCH 05/34] fix(proxy): preserve recovered bridge ownership --- app/modules/proxy/service.py | 44 +++++- .../integration/test_http_responses_bridge.py | 125 +++++++++++++++++- 2 files changed, 160 insertions(+), 9 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index b737312a..874c9232 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1702,6 +1702,7 @@ async def _get_or_create_http_bridge_session( owns_creation = False continuity_error: ProxyResponseError | None = None delete_stale_turn_state_lease = False + matched_turn_state_alias = False async with self._http_bridge_lock: if incoming_turn_state is not None: @@ -1719,6 +1720,7 @@ async def _get_or_create_http_bridge_session( self._http_bridge_turn_state_index[alias_index_key] = candidate_key break if alias_session is not None: + matched_turn_state_alias = True key = alias_key self._promote_http_bridge_session_to_codex_affinity( alias_session, @@ -1739,7 +1741,15 @@ async def _get_or_create_http_bridge_session( api_key_id, ) else: - key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) + key = _HTTPBridgeSessionKey( + "turn_state_header", + self._encode_http_bridge_turn_state( + session_id=created_session_id, + owner_instance_id=current_instance, + api_key_id=api_key_id, + ), + api_key_id, + ) elif incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( incoming_turn_state, api_key_id=api_key_id, @@ -1754,6 +1764,7 @@ async def _get_or_create_http_bridge_session( owner_instance = _http_bridge_owner_instance(key, settings) if ( not is_bridge_turn_state_replay + and not matched_turn_state_alias and active_turn_state_lease is None and key.affinity_kind != "request" and owner_instance is not None @@ -2023,7 +2034,19 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", if session.closed: return session.downstream_turn_state_aliases.add(turn_state) - session.downstream_turn_state = turn_state + if self._http_bridge_turn_state_matches_session( + turn_state, + session=session, + api_key_id=session.key.api_key_id, + ): + if session.downstream_turn_state is None: + session.downstream_turn_state = turn_state + else: + self._promote_http_bridge_session_to_codex_affinity( + session, + turn_state=turn_state, + settings=get_settings(), + ) for alias in session.downstream_turn_state_aliases: self._http_bridge_turn_state_index[_http_bridge_turn_state_alias_key(alias, session.key.api_key_id)] = ( session.key @@ -2057,6 +2080,18 @@ def _promote_http_bridge_session_to_codex_affinity( turn_state: str, settings: object, ) -> None: + promoted_key = _HTTPBridgeSessionKey( + affinity_kind="turn_state_header", + affinity_key=turn_state, + api_key_id=session.key.api_key_id, + ) + current_key = session.key + if current_key != promoted_key: + current_session = self._http_bridge_sessions.get(current_key) + if current_session is session: + self._http_bridge_sessions.pop(current_key, None) + session.key = promoted_key + self._http_bridge_sessions[promoted_key] = session session.affinity = _AffinityPolicy(key=turn_state, kind=StickySessionKind.CODEX_SESSION) session.codex_session = True session.downstream_turn_state = turn_state @@ -5226,14 +5261,17 @@ def _headers_without_local_http_bridge_turn_state(headers: Mapping[str, str]) -> def _preferred_http_bridge_reconnect_turn_state(session: "_HTTPBridgeSession") -> str | None: + if session.upstream_turn_state is not None: + return session.upstream_turn_state if ( session.codex_session and session.downstream_turn_state is not None + and not session.downstream_turn_state.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX) and session.affinity.kind == StickySessionKind.CODEX_SESSION and session.affinity.key == session.downstream_turn_state ): return session.downstream_turn_state - return session.upstream_turn_state + return None def _http_bridge_turn_state_alias_key(turn_state: str, api_key_id: str | None) -> tuple[str, str | None]: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index c78700dd..4bfd8fab 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1023,7 +1023,7 @@ async def fake_connect_responses_websocket( service._http_bridge_turn_state_index[ proxy_module._http_bridge_turn_state_alias_key(replay_turn_state, session.key.api_key_id) ] - == key + == replay_key ) replayed = await service._get_or_create_http_bridge_session( @@ -1037,14 +1037,14 @@ async def fake_connect_responses_websocket( ) assert replayed is session - assert replayed.key == key - assert service._http_bridge_sessions[key] is session - assert replay_key not in service._http_bridge_sessions + assert replayed.key == replay_key + assert service._http_bridge_sessions[replay_key] is session + assert key not in service._http_bridge_sessions assert ( service._http_bridge_turn_state_index[ proxy_module._http_bridge_turn_state_alias_key(replay_turn_state, session.key.api_key_id) ] - == key + == replay_key ) assert replayed.codex_session is True assert replayed.affinity.kind == proxy_module.StickySessionKind.CODEX_SESSION @@ -1060,7 +1060,7 @@ async def fake_connect_responses_websocket( started_at=time.monotonic(), ) await service._reconnect_http_bridge_session(replayed, request_state=request_state) - assert connect_headers_seen[-1]["x-codex-turn-state"] == replay_turn_state + assert connect_headers_seen[-1]["x-codex-turn-state"] == "upstream_turn_state_stale" await service._close_http_bridge_session(session) @@ -1310,6 +1310,119 @@ async def fake_connect_responses_websocket( assert connect_headers_seen[-1].get("x-codex-turn-state") is None +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_owner_mismatch_rekeys_recovered_session( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_rekey_recovered_signed_alias", + "http-bridge-rekey-recovered-signed-alias@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + stale_session_id = "hbs_signed_missing_alias_other_owner" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=stale_session_id, + owner_instance_id="instance-b", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == stale_session_id)) + await db_session.commit() + + session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert session.bridge_session_id != stale_session_id + assert session.key.affinity_kind == "turn_state_header" + assert session.key.affinity_key != signed_turn_state + recovered_token = service._decode_http_bridge_turn_state(session.key.affinity_key, api_key_id=None) + assert recovered_token is not None + assert recovered_token.session_id == session.bridge_session_id + assert recovered_token.owner_instance_id == "instance-a" + + async with SessionLocal() as db_session: + lease = ( + await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id)) + ).scalar_one() + assert lease.owner_instance_id == "instance-a" + assert lease.affinity_kind == "turn_state_header" + assert lease.affinity_key == session.key.affinity_key + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_turn_state_alias_respects_api_key_isolation( async_client, From 9e45f8b0d190b08e5909aea4bac401544eaa1318 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Mon, 23 Mar 2026 03:39:13 +0000 Subject: [PATCH 06/34] fix(proxy): handle stale bridge owners and legacy replays --- app/modules/proxy/service.py | 33 ++- .../integration/test_http_responses_bridge.py | 222 ++++++++++++------ 2 files changed, 176 insertions(+), 79 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 874c9232..1fcd7f4e 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1669,7 +1669,11 @@ async def _get_or_create_http_bridge_session( active_turn_state_lease = await self._get_live_http_bridge_lease( turn_state_token.session_id if turn_state_token is not None else None ) - if active_turn_state_lease is not None and active_turn_state_lease.owner_instance_id != current_instance: + if ( + active_turn_state_lease is not None + and active_turn_state_lease.owner_instance_id in ring + and active_turn_state_lease.owner_instance_id != current_instance + ): _log_http_bridge_event( "owner_mismatch", key, @@ -1750,13 +1754,26 @@ async def _get_or_create_http_bridge_session( ), api_key_id, ) - elif incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( - incoming_turn_state, - api_key_id=api_key_id, - ): - raise self._invalid_http_bridge_turn_state() - elif incoming_turn_state.startswith("http_turn_"): - key = _HTTPBridgeSessionKey("turn_state_header", incoming_turn_state, api_key_id) + else: + if previous_response_id is not None: + raise ProxyResponseError( + 400, + _http_bridge_previous_response_error_envelope( + previous_response_id, + ( + "HTTP bridge continuity was lost. Replay x-codex-turn-state " + "or retry with a stable prompt_cache_key." + ), + ), + ) + raise ProxyResponseError( + 409, + openai_error( + "bridge_instance_mismatch", + "HTTP bridge turn-state reached an instance that does not own the live session", + error_type="server_error", + ), + ) await self._prune_http_bridge_sessions_locked() diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 4bfd8fab..fbec60ff 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1065,43 +1065,7 @@ async def fake_connect_responses_websocket( @pytest.mark.asyncio -async def test_v1_responses_http_bridge_waits_for_inflight_recreation_on_missing_turn_state_alias(app_instance): - service = get_proxy_service_for_app(app_instance) - service._http_bridge_sessions.clear() - service._http_bridge_turn_state_index.clear() - service._http_bridge_inflight_sessions.clear() - - replay_turn_state = "http_turn_inflight_replay" - replay_key = proxy_module._HTTPBridgeSessionKey("turn_state_header", replay_turn_state, None) - expected_session = _make_dummy_bridge_session(replay_key) - inflight_future: asyncio.Future[SimpleNamespace] = asyncio.get_running_loop().create_future() - service._http_bridge_inflight_sessions[replay_key] = inflight_future - - request_key = proxy_module._HTTPBridgeSessionKey("request", "derived-key", None) - try: - waiter = asyncio.create_task( - service._get_or_create_http_bridge_session( - request_key, - headers={"x-codex-turn-state": replay_turn_state}, - affinity=proxy_module._AffinityPolicy(key="derived-key"), - api_key=None, - request_model="gpt-5.4", - idle_ttl_seconds=120.0, - max_sessions=8, - ) - ) - await asyncio.sleep(0) - assert not waiter.done() - inflight_future.set_result(expected_session) - returned = await waiter - finally: - service._http_bridge_inflight_sessions.clear() - - assert returned is expected_session - - -@pytest.mark.asyncio -async def test_v1_responses_http_bridge_generated_turn_state_missing_local_alias_recovers_fresh_session( +async def test_v1_responses_http_bridge_generated_turn_state_fails_closed_without_local_alias( async_client, app_instance, monkeypatch, @@ -1117,9 +1081,8 @@ async def test_v1_responses_http_bridge_generated_turn_state_missing_local_alias "acc_http_bridge_missing_alias", "http-bridge-missing-alias@example.com", ) - account = await _get_account(account_id) service = get_proxy_service_for_app(app_instance) - fake_upstream = _FakeBridgeUpstreamWebSocket() + account = await _get_account(account_id) async def fake_select_account_with_budget( self, @@ -1154,41 +1117,25 @@ async def fake_select_account_with_budget( ) return AccountSelection(account=account, error_message=None, error_code=None) - async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): - del self, force, timeout_seconds - return target - - async def fake_connect_responses_websocket( - headers, - access_token, - account_id_header, - *, - base_url=None, - session=None, - ): - del headers, access_token, account_id_header, base_url, session - return fake_upstream - monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) - monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) - monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) - session = await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_missing_alias", None), - headers={"x-codex-turn-state": "http_turn_missing_alias"}, - affinity=proxy_module._AffinityPolicy( - key="http_turn_missing_alias", - kind=proxy_module.StickySessionKind.CODEX_SESSION, - ), - api_key=None, - request_model="gpt-5.1", - idle_ttl_seconds=120.0, - max_sessions=128, - ) + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_missing_alias", None), + headers={"x-codex-turn-state": "http_turn_missing_alias"}, + affinity=proxy_module._AffinityPolicy( + key="http_turn_missing_alias", + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) - assert session.key.affinity_kind == "turn_state_header" - assert session.key.affinity_key == "http_turn_missing_alias" - assert session.bridge_session_id.startswith("hbs_") + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"].get("code") == "bridge_instance_mismatch" @pytest.mark.asyncio @@ -1423,6 +1370,139 @@ async def fake_connect_responses_websocket( assert lease.affinity_key == session.key.affinity_key +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_stale_owner_outside_ring_recovers( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-new", + instance_ring=[], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_missing_signed_alias_stale_owner", + "http-bridge-missing-signed-alias-stale-owner@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + stale_session_id = "hbs_signed_missing_alias_stale_owner" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=stale_session_id, + owner_instance_id="instance-old", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == stale_session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=stale_session_id, + affinity_kind="prompt_cache", + affinity_key="stale-owner-thread", + api_key_scope="", + owner_instance_id="instance-old", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert recovered.key.affinity_kind == "prompt_cache" + assert recovered.key.affinity_key == "stale-owner-thread" + assert recovered.bridge_session_id != stale_session_id + assert recovered.owner_instance_id == "instance-new" + + async with SessionLocal() as db_session: + stale_lease = ( + await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == stale_session_id)) + ).scalar_one_or_none() + new_lease = ( + await db_session.execute( + select(HttpBridgeLease).where(HttpBridgeLease.session_id == recovered.bridge_session_id) + ) + ).scalar_one() + + assert stale_lease is None + assert new_lease.owner_instance_id == "instance-new" + assert new_lease.affinity_kind == "prompt_cache" + assert new_lease.affinity_key == "stale-owner-thread" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_turn_state_alias_respects_api_key_isolation( async_client, From fb3b9ca195e6c760f9e386a480dcc4486bbb1df4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Mon, 23 Mar 2026 06:43:05 +0000 Subject: [PATCH 07/34] fix(proxy): rebind recovered bridge affinity --- app/modules/proxy/service.py | 32 ++- .../integration/test_http_responses_bridge.py | 220 ++++++++++++++++++ 2 files changed, 251 insertions(+), 1 deletion(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 1fcd7f4e..d98871d8 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1666,6 +1666,7 @@ async def _get_or_create_http_bridge_session( incoming_turn_state = _sticky_key_from_turn_state_header(headers) turn_state_token = self._decode_http_bridge_turn_state(incoming_turn_state, api_key_id=api_key_id) is_bridge_turn_state_replay = bool(incoming_turn_state and incoming_turn_state.startswith("http_turn_")) + create_affinity = affinity active_turn_state_lease = await self._get_live_http_bridge_lease( turn_state_token.session_id if turn_state_token is not None else None ) @@ -1744,6 +1745,10 @@ async def _get_or_create_http_bridge_session( active_turn_state_lease.affinity_key, api_key_id, ) + create_affinity = _affinity_policy_from_http_bridge_session_key( + key, + openai_cache_affinity_max_age_seconds=settings.openai_cache_affinity_max_age_seconds, + ) else: key = _HTTPBridgeSessionKey( "turn_state_header", @@ -1956,7 +1961,7 @@ async def _get_or_create_http_bridge_session( if is_bridge_turn_state_replay or turn_state_token is not None else headers ), - affinity=affinity, + affinity=create_affinity, request_model=request_model, idle_ttl_seconds=effective_idle_ttl_seconds, bridge_session_id=created_session_id, @@ -5396,6 +5401,31 @@ def _make_http_bridge_session_key( ) +def _affinity_policy_from_http_bridge_session_key( + key: _HTTPBridgeSessionKey, + *, + openai_cache_affinity_max_age_seconds: int, +) -> _AffinityPolicy: + if key.affinity_kind in {"turn_state_header", "session_header"}: + return _AffinityPolicy( + key=key.affinity_key, + kind=StickySessionKind.CODEX_SESSION, + ) + if key.affinity_kind == StickySessionKind.PROMPT_CACHE.value: + return _AffinityPolicy( + key=key.affinity_key, + kind=StickySessionKind.PROMPT_CACHE, + max_age_seconds=openai_cache_affinity_max_age_seconds, + ) + if key.affinity_kind == StickySessionKind.STICKY_THREAD.value: + return _AffinityPolicy( + key=key.affinity_key, + kind=StickySessionKind.STICKY_THREAD, + reallocate_sticky=True, + ) + return _AffinityPolicy() + + def _effective_http_bridge_idle_ttl_seconds( *, affinity: _AffinityPolicy, diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index fbec60ff..cff4073b 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1503,6 +1503,226 @@ async def fake_connect_responses_websocket( assert new_lease.affinity_key == "stale-owner-thread" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_recovery_preserves_stable_affinity( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_missing_signed_alias_stable_affinity", + "http-bridge-missing-signed-alias-stable-affinity@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + upstreams = [_FakeBridgeUpstreamWebSocket(), _FakeBridgeUpstreamWebSocket()] + connect_count = 0 + sticky_selections: list[tuple[str | None, object | None, bool, int | None]] = [] + session_id = "hbs_signed_missing_alias_stable_affinity" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + sticky_selections.append((sticky_key, sticky_kind, reallocate_sticky, sticky_max_age_seconds)) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + nonlocal connect_count + connect_count += 1 + return upstreams.pop(0) + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="prompt_cache", + affinity_key="stable-affinity-thread", + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert recovered.key.affinity_kind == "prompt_cache" + assert recovered.key.affinity_key == "stable-affinity-thread" + assert recovered.affinity == proxy_module._AffinityPolicy( + key="stable-affinity-thread", + kind=proxy_module.StickySessionKind.PROMPT_CACHE, + max_age_seconds=300, + ) + + reused = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("prompt_cache", "stable-affinity-thread", None), + headers={}, + affinity=proxy_module._AffinityPolicy( + key="stable-affinity-thread", + kind=proxy_module.StickySessionKind.PROMPT_CACHE, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + request_state = proxy_module._WebSocketRequestState( + request_id="req-stable-affinity-reconnect", + model="gpt-5.1", + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + awaiting_response_created=True, + request_text=json.dumps({"type": "response.create", "model": "gpt-5.1", "input": []}), + ) + await service._reconnect_http_bridge_session(recovered, request_state=request_state) + + assert reused is recovered + assert connect_count == 2 + assert sticky_selections == [ + ("stable-affinity-thread", proxy_module.StickySessionKind.PROMPT_CACHE, False, 300), + ("stable-affinity-thread", proxy_module.StickySessionKind.PROMPT_CACHE, False, 300), + ] + + +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_missing_local_alias_with_previous_response_expires( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_missing_signed_alias_previous", + "http-bridge-missing-signed-alias-previous@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + session_id = "hbs_signed_missing_alias_previous" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a", + api_key_id=None, + ) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + previous_response_id="resp_previous", + ) + + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"]["code"] == "bridge_session_expired" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_turn_state_alias_respects_api_key_isolation( async_client, From 1d2c5ce12e8356086ff7adbcee1234b01af67f85 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 11:55:07 +0000 Subject: [PATCH 08/34] fix(proxy): restore turn-state creation compatibility --- app/modules/proxy/service.py | 61 +++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index d98871d8..fcb44b10 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1760,24 +1760,15 @@ async def _get_or_create_http_bridge_session( api_key_id, ) else: - if previous_response_id is not None: - raise ProxyResponseError( - 400, - _http_bridge_previous_response_error_envelope( - previous_response_id, - ( - "HTTP bridge continuity was lost. Replay x-codex-turn-state " - "or retry with a stable prompt_cache_key." - ), - ), - ) - raise ProxyResponseError( - 409, - openai_error( - "bridge_instance_mismatch", - "HTTP bridge turn-state reached an instance that does not own the live session", - error_type="server_error", - ), + if incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( + incoming_turn_state, + api_key_id=api_key_id, + ): + raise self._invalid_http_bridge_turn_state() + key = _HTTPBridgeSessionKey( + "turn_state_header", + incoming_turn_state, + api_key_id, ) await self._prune_http_bridge_sessions_locked() @@ -1954,18 +1945,30 @@ async def _get_or_create_http_bridge_session( session: _HTTPBridgeSession | None = None session_registered = False try: - session = await self._create_http_bridge_session( + create_headers = ( + _headers_without_local_http_bridge_turn_state(headers) + if is_bridge_turn_state_replay or turn_state_token is not None + else headers + ) + create_session = self._create_http_bridge_session + create_kwargs: dict[str, object] = { + "headers": create_headers, + "affinity": create_affinity, + "request_model": request_model, + "idle_ttl_seconds": effective_idle_ttl_seconds, + } + create_signature = inspect.signature(create_session) + accepts_extra_create_kwargs = any( + parameter.kind == inspect.Parameter.VAR_KEYWORD + for parameter in create_signature.parameters.values() + ) + if accepts_extra_create_kwargs or "bridge_session_id" in create_signature.parameters: + create_kwargs["bridge_session_id"] = created_session_id + if accepts_extra_create_kwargs or "owner_instance_id" in create_signature.parameters: + create_kwargs["owner_instance_id"] = current_instance + session = await create_session( key, - headers=( - _headers_without_local_http_bridge_turn_state(headers) - if is_bridge_turn_state_replay or turn_state_token is not None - else headers - ), - affinity=create_affinity, - request_model=request_model, - idle_ttl_seconds=effective_idle_ttl_seconds, - bridge_session_id=created_session_id, - owner_instance_id=current_instance, + **create_kwargs, ) async with self._http_bridge_lock: current_future = self._http_bridge_inflight_sessions.get(key) From a0624305feb46ef5b23644f403c3e8a00a3ed0db Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 12:07:32 +0000 Subject: [PATCH 09/34] fix(proxy): harden bridge turn-state recovery --- app/modules/proxy/service.py | 59 ++++++++++++------- .../integration/test_http_responses_bridge.py | 18 ++---- 2 files changed, 42 insertions(+), 35 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index fcb44b10..6ef9fc04 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1630,15 +1630,6 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: downstream_turn_state=session.downstream_turn_state, ) - def _has_http_bridge_turn_state_alias_conflict(self, turn_state: str, *, api_key_id: str | None) -> bool: - requested_alias_key = _http_bridge_turn_state_alias_key(turn_state, api_key_id) - for alias_key in self._http_bridge_turn_state_index: - if alias_key[0] != turn_state: - continue - if alias_key != requested_alias_key: - return True - return False - async def _get_or_create_http_bridge_session( self, key: "_HTTPBridgeSessionKey", @@ -1706,7 +1697,8 @@ async def _get_or_create_http_bridge_session( capacity_wait_future: asyncio.Future[_HTTPBridgeSession] | None = None owns_creation = False continuity_error: ProxyResponseError | None = None - delete_stale_turn_state_lease = False + recovered_turn_state_replay = False + stale_turn_state_lease_session_id: str | None = None matched_turn_state_alias = False async with self._http_bridge_lock: @@ -1738,7 +1730,10 @@ async def _get_or_create_http_bridge_session( ] = alias_session.key key = alias_session.key elif turn_state_token is not None: - delete_stale_turn_state_lease = True + recovered_turn_state_replay = True + stale_turn_state_lease_session_id = ( + active_turn_state_lease.session_id if active_turn_state_lease is not None else None + ) if active_turn_state_lease is not None: key = _HTTPBridgeSessionKey( active_turn_state_lease.affinity_kind, @@ -1760,10 +1755,7 @@ async def _get_or_create_http_bridge_session( api_key_id, ) else: - if incoming_turn_state.startswith("http_turn_") and self._has_http_bridge_turn_state_alias_conflict( - incoming_turn_state, - api_key_id=api_key_id, - ): + if incoming_turn_state.startswith("http_turn_"): raise self._invalid_http_bridge_turn_state() key = _HTTPBridgeSessionKey( "turn_state_header", @@ -1838,12 +1830,13 @@ async def _get_or_create_http_bridge_session( self._http_bridge_sessions.pop(key, None) sessions_to_close.append(existing) if turn_state_token is not None: - delete_stale_turn_state_lease = True + recovered_turn_state_replay = True + stale_turn_state_lease_session_id = ( + active_turn_state_lease.session_id if active_turn_state_lease is not None else None + ) - if delete_stale_turn_state_lease and turn_state_token is not None: - await self._delete_http_bridge_lease(turn_state_token.session_id) - if previous_response_id is not None: - continuity_error = self._expired_http_bridge_turn_state() + if recovered_turn_state_replay and previous_response_id is not None: + continuity_error = self._expired_http_bridge_turn_state() if continuity_error is None and previous_response_id is not None: continuity_error = ProxyResponseError( @@ -1998,6 +1991,21 @@ async def _get_or_create_http_bridge_session( account_id=session.account.id, model=session.request_model, ) + if ( + stale_turn_state_lease_session_id is not None + and stale_turn_state_lease_session_id != session.bridge_session_id + ): + try: + await self._delete_http_bridge_lease(stale_turn_state_lease_session_id) + except Exception: + logger.warning( + "Failed to delete stale HTTP bridge lease after replacement registration", + extra={ + "stale_bridge_session_id": stale_turn_state_lease_session_id, + "replacement_bridge_session_id": session.bridge_session_id, + }, + exc_info=True, + ) return session async def _prune_http_bridge_sessions_locked(self) -> None: @@ -2030,7 +2038,9 @@ async def _close_http_bridge_session( *, turn_state_lock_held: bool = False, ) -> None: - async with session.lease_lock: + lease_lock = getattr(session, "lease_lock", None) + + async def _close_session() -> None: session.closed = True if turn_state_lock_held: self._unregister_http_bridge_turn_states_locked(session) @@ -2046,7 +2056,12 @@ async def _close_http_bridge_session( await session.upstream.close() except Exception: logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) - await self._delete_http_bridge_lease(session.bridge_session_id) + await self._delete_http_bridge_lease(getattr(session, "bridge_session_id", None)) + if lease_lock is not None: + async with lease_lock: + await _close_session() + else: + await _close_session() _log_http_bridge_event( "close", session.key, diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index cff4073b..e94c0ca1 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -856,7 +856,7 @@ async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_se @pytest.mark.asyncio -async def test_v1_responses_http_bridge_missing_turn_state_alias_with_previous_response_id_fails_closed( +async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_with_previous_response_id_fails_closed( app_instance, monkeypatch, ): @@ -882,16 +882,8 @@ async def test_v1_responses_http_bridge_missing_turn_state_alias_with_previous_r ) exc = exc_info.value - assert exc.status_code == 400 - assert exc.payload["error"] == { - "message": ( - "Previous response with id 'resp_missing_alias' not found. " - "HTTP bridge continuity was lost. Replay x-codex-turn-state or retry with a stable prompt_cache_key." - ), - "type": "invalid_request_error", - "code": "previous_response_not_found", - "param": "previous_response_id", - } + assert exc.status_code == 409 + assert exc.payload["error"].get("code") == "bridge_token_invalid" @pytest.mark.asyncio @@ -1065,7 +1057,7 @@ async def fake_connect_responses_websocket( @pytest.mark.asyncio -async def test_v1_responses_http_bridge_generated_turn_state_fails_closed_without_local_alias( +async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_fails_closed_without_local_alias( async_client, app_instance, monkeypatch, @@ -1135,7 +1127,7 @@ async def fake_select_account_with_budget( exc = exc_info.value assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_instance_mismatch" + assert exc.payload["error"].get("code") == "bridge_token_invalid" @pytest.mark.asyncio From ac0ecc47db15add056651e3bd6eb8f9b33afb47d Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 12:21:43 +0000 Subject: [PATCH 10/34] fix(proxy): stabilize bridge ownership recovery --- app/modules/proxy/service.py | 83 +++++--- .../integration/test_http_responses_bridge.py | 188 ++++++++++++++++++ 2 files changed, 248 insertions(+), 23 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 6ef9fc04..1a4202be 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -359,6 +359,14 @@ async def _stream_via_http_bridge( with anyio.CancelScope(shield=True): await self._detach_http_bridge_request(session, request_state=request_state) session.last_used_at = time.monotonic() + try: + await self._touch_http_bridge_lease(session) + except Exception: + logger.warning( + "Failed to persist HTTP bridge lease after request detach session_id=%s", + session.bridge_session_id, + exc_info=True, + ) async def compact_responses( self, @@ -1698,8 +1706,11 @@ async def _get_or_create_http_bridge_session( owns_creation = False continuity_error: ProxyResponseError | None = None recovered_turn_state_replay = False + rekey_recovered_turn_state = False stale_turn_state_lease_session_id: str | None = None matched_turn_state_alias = False + lookup_key = key + session_key = key async with self._http_bridge_lock: if incoming_turn_state is not None: @@ -1719,6 +1730,8 @@ async def _get_or_create_http_bridge_session( if alias_session is not None: matched_turn_state_alias = True key = alias_key + lookup_key = key + session_key = key self._promote_http_bridge_session_to_codex_affinity( alias_session, turn_state=incoming_turn_state, @@ -1740,20 +1753,28 @@ async def _get_or_create_http_bridge_session( active_turn_state_lease.affinity_key, api_key_id, ) + lookup_key = key + session_key = key create_affinity = _affinity_policy_from_http_bridge_session_key( key, openai_cache_affinity_max_age_seconds=settings.openai_cache_affinity_max_age_seconds, ) else: - key = _HTTPBridgeSessionKey( + lookup_key = _HTTPBridgeSessionKey( "turn_state_header", - self._encode_http_bridge_turn_state( - session_id=created_session_id, - owner_instance_id=current_instance, - api_key_id=api_key_id, - ), + turn_state_token.session_id, api_key_id, ) + if turn_state_token.owner_instance_id == current_instance: + key = _HTTPBridgeSessionKey( + "turn_state_header", + incoming_turn_state, + api_key_id, + ) + else: + rekey_recovered_turn_state = True + key = lookup_key + session_key = key else: if incoming_turn_state.startswith("http_turn_"): raise self._invalid_http_bridge_turn_state() @@ -1762,23 +1783,25 @@ async def _get_or_create_http_bridge_session( incoming_turn_state, api_key_id, ) + lookup_key = key + session_key = key await self._prune_http_bridge_sessions_locked() current_instance, ring = _normalized_http_bridge_instance_ring(settings) - owner_instance = _http_bridge_owner_instance(key, settings) + owner_instance = _http_bridge_owner_instance(lookup_key, settings) if ( not is_bridge_turn_state_replay and not matched_turn_state_alias and active_turn_state_lease is None - and key.affinity_kind != "request" + and lookup_key.affinity_kind != "request" and owner_instance is not None and len(ring) > 1 and owner_instance != current_instance ): _log_http_bridge_event( "owner_mismatch", - key, + lookup_key, account_id=None, model=request_model, detail=f"expected_instance={owner_instance}, current_instance={current_instance}", @@ -1795,14 +1818,14 @@ async def _get_or_create_http_bridge_session( ), ) - existing = self._http_bridge_sessions.get(key) + existing = self._http_bridge_sessions.get(lookup_key) if existing is not None and not existing.closed and existing.account.status == AccountStatus.ACTIVE: if ( incoming_turn_state is not None and self._http_bridge_turn_state_index.get( _http_bridge_turn_state_alias_key(incoming_turn_state, api_key_id) ) - == key + == existing.key ): self._promote_http_bridge_session_to_codex_affinity( existing, @@ -1813,7 +1836,7 @@ async def _get_or_create_http_bridge_session( existing.last_used_at = time.monotonic() _log_http_bridge_event( "reuse", - key, + existing.key, account_id=existing.account.id, model=existing.request_model, pending_count=await self._http_bridge_pending_count(existing), @@ -1823,11 +1846,11 @@ async def _get_or_create_http_bridge_session( if existing is not None: _log_http_bridge_event( "discard_stale", - key, + existing.key, account_id=existing.account.id, model=existing.request_model, ) - self._http_bridge_sessions.pop(key, None) + self._http_bridge_sessions.pop(lookup_key, None) sessions_to_close.append(existing) if turn_state_token is not None: recovered_turn_state_replay = True @@ -1851,7 +1874,7 @@ async def _get_or_create_http_bridge_session( ) if continuity_error is None: - inflight_future = self._http_bridge_inflight_sessions.get(key) + inflight_future = self._http_bridge_inflight_sessions.get(lookup_key) if inflight_future is None: while ( len(self._http_bridge_sessions) + len(self._http_bridge_inflight_sessions) >= max_sessions @@ -1883,7 +1906,7 @@ async def _get_or_create_http_bridge_session( else: _log_http_bridge_event( "capacity_exhausted_active_sessions", - key, + lookup_key, account_id=None, model=request_model, pending_count=( @@ -1900,7 +1923,7 @@ async def _get_or_create_http_bridge_session( ) else: inflight_future = asyncio.get_running_loop().create_future() - self._http_bridge_inflight_sessions[key] = inflight_future + self._http_bridge_inflight_sessions[lookup_key] = inflight_future owns_creation = True for stale_session in sessions_to_close: @@ -1938,6 +1961,20 @@ async def _get_or_create_http_bridge_session( session: _HTTPBridgeSession | None = None session_registered = False try: + if rekey_recovered_turn_state: + session_key = _HTTPBridgeSessionKey( + "turn_state_header", + self._encode_http_bridge_turn_state( + session_id=created_session_id, + owner_instance_id=current_instance, + api_key_id=api_key_id, + ), + api_key_id, + ) + create_affinity = _AffinityPolicy( + key=session_key.affinity_key, + kind=StickySessionKind.CODEX_SESSION, + ) create_headers = ( _headers_without_local_http_bridge_turn_state(headers) if is_bridge_turn_state_replay or turn_state_token is not None @@ -1960,22 +1997,22 @@ async def _get_or_create_http_bridge_session( if accepts_extra_create_kwargs or "owner_instance_id" in create_signature.parameters: create_kwargs["owner_instance_id"] = current_instance session = await create_session( - key, + session_key, **create_kwargs, ) async with self._http_bridge_lock: - current_future = self._http_bridge_inflight_sessions.get(key) + current_future = self._http_bridge_inflight_sessions.get(lookup_key) if current_future is inflight_future: - self._http_bridge_inflight_sessions.pop(key, None) - self._http_bridge_sessions[key] = session + self._http_bridge_inflight_sessions.pop(lookup_key, None) + self._http_bridge_sessions[session_key] = session session_registered = True if inflight_future is not None and not inflight_future.done(): inflight_future.set_result(session) except BaseException as exc: async with self._http_bridge_lock: - current_future = self._http_bridge_inflight_sessions.get(key) + current_future = self._http_bridge_inflight_sessions.get(lookup_key) if current_future is inflight_future: - self._http_bridge_inflight_sessions.pop(key, None) + self._http_bridge_inflight_sessions.pop(lookup_key, None) if inflight_future is not None and not inflight_future.done(): if isinstance(exc, asyncio.CancelledError): inflight_future.cancel() diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index e94c0ca1..82f92966 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2064,6 +2064,107 @@ async def fake_connect_responses_websocket( assert alias_key not in service._http_bridge_turn_state_index +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_refreshes_lease_after_request_detach(app_instance, monkeypatch): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + service = get_proxy_service_for_app(app_instance) + + payload = proxy_module.ResponsesRequest.model_validate({"model": "gpt-5.1", "instructions": "hi", "input": []}) + session = cast( + proxy_module._HTTPBridgeSession, + _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-refresh", None)), + ) + session.bridge_session_id = "hbs_bridge_lease_refresh" + session.response_create_gate = asyncio.Semaphore(1) + + event_queue: asyncio.Queue[str | None] = asyncio.Queue() + await event_queue.put("data: {\"type\":\"response.completed\"}\n\n") + await event_queue.put(None) + request_state = proxy_module._WebSocketRequestState( + request_id="req_bridge_lease_refresh", + model=payload.model, + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + request_state.event_queue = event_queue + session.pending_requests.append(request_state) + session.queued_request_count = 1 + + touch_points: list[float] = [] + + def fake_prepare_http_bridge_request(self, payload, headers, *, api_key, api_key_reservation, request_id): + del self, payload, headers, api_key, api_key_reservation, request_id + return request_state, json.dumps({"type": "response.create", "model": "gpt-5.1", "input": []}) + + async def fake_get_or_create_http_bridge_session( + self, + key, + *, + headers, + affinity, + api_key, + request_model, + idle_ttl_seconds, + max_sessions, + previous_response_id=None, + ): + del self, key, headers, affinity, api_key, request_model, idle_ttl_seconds, max_sessions, previous_response_id + return session + + async def fake_submit_http_bridge_request(self, session, *, request_state, text_data, queue_limit): + del self, session, request_state, text_data, queue_limit + return None + + def fake_resolve_http_bridge_downstream_turn_state(self, session, *, requested_turn_state, api_key_id): + del self, session, requested_turn_state, api_key_id + return "http_turn_refresh_finished" + + async def fake_register_http_bridge_turn_state(self, session, turn_state): + del turn_state + await self._touch_http_bridge_lease(session) + + async def fake_touch_http_bridge_lease(self, session): + del self + touch_points.append(session.last_used_at) + + monkeypatch.setattr(proxy_module.ProxyService, "_prepare_http_bridge_request", fake_prepare_http_bridge_request) + monkeypatch.setattr(proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session) + monkeypatch.setattr(proxy_module.ProxyService, "_submit_http_bridge_request", fake_submit_http_bridge_request) + monkeypatch.setattr( + proxy_module.ProxyService, + "_resolve_http_bridge_downstream_turn_state", + fake_resolve_http_bridge_downstream_turn_state, + ) + monkeypatch.setattr(proxy_module.ProxyService, "_register_http_bridge_turn_state", fake_register_http_bridge_turn_state) + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + + events = [ + event + async for event in service._stream_via_http_bridge( + payload, + {}, + codex_session_affinity=False, + propagate_http_errors=False, + openai_cache_affinity=False, + api_key=None, + api_key_reservation=None, + suppress_text_done_events=False, + idle_ttl_seconds=120.0, + codex_idle_ttl_seconds=120.0, + max_sessions=8, + queue_limit=8, + ) + ] + + assert events == ['data: {"type":"response.completed"}\n\n'] + assert len(touch_points) == 2 + assert touch_points[1] >= touch_points[0] + assert session.last_used_at == touch_points[1] + assert not session.pending_requests + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_allows_unstable_request_key_even_on_non_owner_instance( async_client, @@ -5137,6 +5238,93 @@ async def fake_create_http_bridge_session( service._http_bridge_turn_state_index.clear() +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_singleflights_stale_signed_turn_state_recovery(app_instance, monkeypatch): + service = get_proxy_service_for_app(app_instance) + service._http_bridge_sessions.clear() + service._http_bridge_inflight_sessions.clear() + service._http_bridge_turn_state_index.clear() + + settings = SimpleNamespace( + http_responses_session_bridge_enabled=True, + http_responses_session_bridge_idle_ttl_seconds=120.0, + http_responses_session_bridge_codex_idle_ttl_seconds=120.0, + http_responses_session_bridge_max_sessions=8, + http_responses_session_bridge_instance_id="instance-a", + http_responses_session_bridge_instance_ring=["instance-a", "instance-b"], + ) + monkeypatch.setattr(proxy_module, "get_settings_cache", lambda: _SettingsCache(settings)) + monkeypatch.setattr(proxy_module, "get_settings", lambda: settings) + + create_started: list[str] = [] + signed_turn_state = service._encode_http_bridge_turn_state( + session_id="hbs_signed_stale_singleflight", + owner_instance_id="instance-b", + api_key_id=None, + ) + + async def fake_create_http_bridge_session( + self, + key, + *, + headers, + affinity, + request_model, + idle_ttl_seconds, + bridge_session_id=None, + owner_instance_id=None, + ): + del self, headers, affinity, request_model, idle_ttl_seconds + create_started.append(key.affinity_key) + await asyncio.sleep(0.2) + session = _make_dummy_bridge_session(key) + session.bridge_session_id = bridge_session_id or "" + session.owner_instance_id = owner_instance_id or "instance-a" + return session + + monkeypatch.setattr(proxy_module.ProxyService, "_create_http_bridge_session", fake_create_http_bridge_session) + + try: + first = asyncio.create_task( + service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=8, + ) + ) + second = asyncio.create_task( + service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=8, + ) + ) + session_one, session_two = await asyncio.gather(first, second) + + assert len(create_started) == 1 + assert session_one is session_two + assert session_one.key.affinity_kind == "turn_state_header" + assert session_one.key.affinity_key != signed_turn_state + finally: + service._http_bridge_sessions.clear() + service._http_bridge_inflight_sessions.clear() + service._http_bridge_turn_state_index.clear() + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_cleans_up_cancelled_singleflight_creator(app_instance, monkeypatch): service = get_proxy_service_for_app(app_instance) From ba1a8463c84c5ea6ae233b6e3e72a25771d1f364 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 12:33:48 +0000 Subject: [PATCH 11/34] fix(proxy): preserve bridge turn-state rollout compatibility --- app/modules/proxy/service.py | 29 ++- .../integration/test_http_responses_bridge.py | 165 ++++++++++++++++-- 2 files changed, 167 insertions(+), 27 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 1a4202be..02ad577c 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1697,8 +1697,6 @@ async def _get_or_create_http_bridge_session( ), ) created_session_id = self._new_http_bridge_session_id() - if turn_state_token is not None and turn_state_token.owner_instance_id == current_instance: - created_session_id = turn_state_token.session_id while True: sessions_to_close: list[_HTTPBridgeSession] = [] inflight_future: asyncio.Future[_HTTPBridgeSession] | None = None @@ -1759,25 +1757,17 @@ async def _get_or_create_http_bridge_session( key, openai_cache_affinity_max_age_seconds=settings.openai_cache_affinity_max_age_seconds, ) + rekey_recovered_turn_state = key.affinity_kind == "turn_state_header" else: lookup_key = _HTTPBridgeSessionKey( "turn_state_header", turn_state_token.session_id, api_key_id, ) - if turn_state_token.owner_instance_id == current_instance: - key = _HTTPBridgeSessionKey( - "turn_state_header", - incoming_turn_state, - api_key_id, - ) - else: - rekey_recovered_turn_state = True - key = lookup_key + rekey_recovered_turn_state = True + key = lookup_key session_key = key else: - if incoming_turn_state.startswith("http_turn_"): - raise self._invalid_http_bridge_turn_state() key = _HTTPBridgeSessionKey( "turn_state_header", incoming_turn_state, @@ -1791,8 +1781,7 @@ async def _get_or_create_http_bridge_session( current_instance, ring = _normalized_http_bridge_instance_ring(settings) owner_instance = _http_bridge_owner_instance(lookup_key, settings) if ( - not is_bridge_turn_state_replay - and not matched_turn_state_alias + not matched_turn_state_alias and active_turn_state_lease is None and lookup_key.affinity_kind != "request" and owner_instance is not None @@ -2266,7 +2255,15 @@ async def _create_http_bridge_session( upstream_turn_state=_upstream_turn_state_from_socket(upstream), downstream_turn_state=None, ) - await self._persist_http_bridge_lease(session) + try: + await self._persist_http_bridge_lease(session) + except BaseException: + session.closed = True + try: + await upstream.close() + except Exception: + logger.debug("Failed to close HTTP bridge upstream websocket after lease persistence error", exc_info=True) + raise session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) return session diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 82f92966..6f2aff52 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -856,7 +856,7 @@ async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_se @pytest.mark.asyncio -async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_with_previous_response_id_fails_closed( +async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_preserves_previous_response_compatibility( app_instance, monkeypatch, ): @@ -882,8 +882,16 @@ async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_with_previous ) exc = exc_info.value - assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_token_invalid" + assert exc.status_code == 400 + assert exc.payload["error"] == { + "message": ( + "Previous response with id 'resp_missing_alias' not found. " + "HTTP bridge continuity was lost. Replay x-codex-turn-state or retry with a stable prompt_cache_key." + ), + "type": "invalid_request_error", + "code": "previous_response_not_found", + "param": "previous_response_id", + } @pytest.mark.asyncio @@ -1057,7 +1065,7 @@ async def fake_connect_responses_websocket( @pytest.mark.asyncio -async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_fails_closed_without_local_alias( +async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_uses_owner_routing_without_local_alias( async_client, app_instance, monkeypatch, @@ -1111,12 +1119,27 @@ async def fake_select_account_with_budget( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + legacy_turn_state = next( + candidate + for candidate in ( + "http_turn_missing_alias_a", + "http_turn_missing_alias_b", + "http_turn_missing_alias_c", + "http_turn_missing_alias_d", + ) + if proxy_module._http_bridge_owner_instance( + proxy_module._HTTPBridgeSessionKey("turn_state_header", candidate, None), + proxy_module.get_settings(), + ) + == "instance-b" + ) + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", "http_turn_missing_alias", None), - headers={"x-codex-turn-state": "http_turn_missing_alias"}, + proxy_module._HTTPBridgeSessionKey("turn_state_header", legacy_turn_state, None), + headers={"x-codex-turn-state": legacy_turn_state}, affinity=proxy_module._AffinityPolicy( - key="http_turn_missing_alias", + key=legacy_turn_state, kind=proxy_module.StickySessionKind.CODEX_SESSION, ), api_key=None, @@ -1127,7 +1150,7 @@ async def fake_select_account_with_budget( exc = exc_info.value assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_token_invalid" + assert exc.payload["error"].get("code") == "bridge_wrong_instance" @pytest.mark.asyncio @@ -1245,9 +1268,28 @@ async def fake_connect_responses_websocket( ) assert session.key.affinity_kind == "turn_state_header" - assert session.bridge_session_id == session_id + assert session.bridge_session_id != session_id + assert session.key.affinity_key != signed_turn_state + recovered_token = service._decode_http_bridge_turn_state(session.key.affinity_key, api_key_id=None) + assert recovered_token is not None + assert recovered_token.session_id == session.bridge_session_id + assert recovered_token.owner_instance_id == "instance-a" assert connect_headers_seen[-1].get("x-codex-turn-state") is None + async with SessionLocal() as db_session: + stale_lease = ( + await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + ).scalar_one_or_none() + new_lease = ( + await db_session.execute( + select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id) + ) + ).scalar_one() + + assert stale_lease is None + assert new_lease.affinity_kind == "turn_state_header" + assert new_lease.affinity_key == session.key.affinity_key + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_owner_mismatch_rekeys_recovered_session( @@ -2165,6 +2207,93 @@ async def fake_touch_http_bridge_lease(self, session): assert not session.pending_requests +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_creation_closes_upstream_when_lease_persist_fails( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_lease_persist_failure", + "http-bridge-lease-persist-failure@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + async def fake_persist_http_bridge_lease(self, session): + del self, session + raise RuntimeError("lease persistence failed") + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module.ProxyService, "_persist_http_bridge_lease", fake_persist_http_bridge_lease) + + with pytest.raises(RuntimeError, match="lease persistence failed"): + await service._create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("request", "lease-persist-failure", None), + headers={}, + affinity=proxy_module._AffinityPolicy(), + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + bridge_session_id="hbs_lease_persist_failure", + owner_instance_id="instance-a", + ) + + assert fake_upstream.closed is True + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_allows_unstable_request_key_even_on_non_owner_instance( async_client, @@ -5257,9 +5386,23 @@ async def test_v1_responses_http_bridge_singleflights_stale_signed_turn_state_re monkeypatch.setattr(proxy_module, "get_settings", lambda: settings) create_started: list[str] = [] + session_id = next( + candidate + for candidate in ( + "hbs_signed_stale_singleflight_a", + "hbs_signed_stale_singleflight_b", + "hbs_signed_stale_singleflight_c", + "hbs_signed_stale_singleflight_d", + ) + if proxy_module._http_bridge_owner_instance( + proxy_module._HTTPBridgeSessionKey("turn_state_header", candidate, None), + settings, + ) + == "instance-a" + ) signed_turn_state = service._encode_http_bridge_turn_state( - session_id="hbs_signed_stale_singleflight", - owner_instance_id="instance-b", + session_id=session_id, + owner_instance_id="instance-a", api_key_id=None, ) From f5230aa87eb2aba00a15b57cef7df51c2c30c011 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 12:45:11 +0000 Subject: [PATCH 12/34] fix(proxy): preserve bridge lease handoff on replay --- app/modules/proxy/service.py | 122 ++++++++++-------- .../integration/test_http_responses_bridge.py | 117 +++++++++++++++++ 2 files changed, 188 insertions(+), 51 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 02ad577c..81c9f4ea 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1728,8 +1728,6 @@ async def _get_or_create_http_bridge_session( if alias_session is not None: matched_turn_state_alias = True key = alias_key - lookup_key = key - session_key = key self._promote_http_bridge_session_to_codex_affinity( alias_session, turn_state=incoming_turn_state, @@ -1740,6 +1738,8 @@ async def _get_or_create_http_bridge_session( _http_bridge_turn_state_alias_key(alias, alias_session.key.api_key_id) ] = alias_session.key key = alias_session.key + lookup_key = key + session_key = key elif turn_state_token is not None: recovered_turn_state_replay = True stale_turn_state_lease_session_id = ( @@ -2558,7 +2558,8 @@ async def _relay_http_bridge_upstream_messages( break finally: session.closed = True - await self._delete_http_bridge_lease(session.bridge_session_id) + if not session.preserve_lease_during_reconnect: + await self._delete_http_bridge_lease(session.bridge_session_id) async def _retry_http_bridge_request_on_fresh_upstream( self, @@ -2650,7 +2651,11 @@ async def _reconnect_http_bridge_session( old_account_id = session.account.id old_upstream = session.upstream old_reader = session.upstream_reader if restart_reader else None + new_upstream: UpstreamResponsesWebSocket | None = None + preserve_lease_during_reconnect = False if old_reader is not None: + session.preserve_lease_during_reconnect = True + preserve_lease_during_reconnect = True old_reader.cancel() if old_reader is not asyncio.current_task(): try: @@ -2662,56 +2667,70 @@ async def _reconnect_http_bridge_session( except Exception: logger.debug("Failed to close HTTP bridge upstream websocket before reconnect", exc_info=True) - deadline = _websocket_connect_deadline(request_state, get_settings().proxy_request_budget_seconds) - settings = await get_settings_cache().get() - selection = await self._select_account_with_budget( - deadline, - request_id=request_state.request_log_id or request_state.request_id, - kind="http_bridge", - sticky_key=session.affinity.key, - sticky_kind=session.affinity.kind, - reallocate_sticky=session.affinity.reallocate_sticky, - sticky_max_age_seconds=session.affinity.max_age_seconds, - prefer_earlier_reset_accounts=settings.prefer_earlier_reset_accounts, - routing_strategy=_routing_strategy(settings), - model=session.request_model, - ) - account = selection.account - if account is None: - raise ProxyResponseError( - 503, - openai_error( - selection.error_code or "no_accounts", - selection.error_message or "No active accounts available", - error_type="server_error", - ), - ) - account = await self._ensure_fresh_with_budget(account, timeout_seconds=_remaining_budget_seconds(deadline)) - connect_headers = _headers_with_turn_state( - session.headers, - _preferred_http_bridge_reconnect_turn_state(session), - ) - upstream = await self._open_upstream_websocket_with_budget( - account, - connect_headers, - timeout_seconds=_remaining_budget_seconds(deadline), - ) - session.account = account - session.headers = connect_headers - session.upstream = upstream - session.upstream_control = _WebSocketUpstreamControl() - session.closed = False - session.upstream_turn_state = _upstream_turn_state_from_socket(upstream) or session.upstream_turn_state try: - await self._touch_http_bridge_lease(session) - except Exception: - logger.warning( - "Failed to persist HTTP bridge lease after reconnect session_id=%s", - session.bridge_session_id, - exc_info=True, + deadline = _websocket_connect_deadline(request_state, get_settings().proxy_request_budget_seconds) + settings = await get_settings_cache().get() + selection = await self._select_account_with_budget( + deadline, + request_id=request_state.request_log_id or request_state.request_id, + kind="http_bridge", + sticky_key=session.affinity.key, + sticky_kind=session.affinity.kind, + reallocate_sticky=session.affinity.reallocate_sticky, + sticky_max_age_seconds=session.affinity.max_age_seconds, + prefer_earlier_reset_accounts=settings.prefer_earlier_reset_accounts, + routing_strategy=_routing_strategy(settings), + model=session.request_model, ) - if restart_reader: - session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) + account = selection.account + if account is None: + raise ProxyResponseError( + 503, + openai_error( + selection.error_code or "no_accounts", + selection.error_message or "No active accounts available", + error_type="server_error", + ), + ) + account = await self._ensure_fresh_with_budget(account, timeout_seconds=_remaining_budget_seconds(deadline)) + connect_headers = _headers_with_turn_state( + session.headers, + _preferred_http_bridge_reconnect_turn_state(session), + ) + new_upstream = await self._open_upstream_websocket_with_budget( + account, + connect_headers, + timeout_seconds=_remaining_budget_seconds(deadline), + ) + session.account = account + session.headers = connect_headers + session.upstream = new_upstream + session.upstream_control = _WebSocketUpstreamControl() + session.closed = False + session.upstream_turn_state = _upstream_turn_state_from_socket(new_upstream) or session.upstream_turn_state + try: + await self._touch_http_bridge_lease(session) + except Exception: + logger.warning( + "Failed to persist HTTP bridge lease after reconnect session_id=%s", + session.bridge_session_id, + exc_info=True, + ) + if restart_reader: + session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) + except BaseException: + session.closed = True + if new_upstream is not None: + try: + await new_upstream.close() + except Exception: + logger.debug("Failed to close replacement HTTP bridge websocket after reconnect error", exc_info=True) + if preserve_lease_during_reconnect: + session.preserve_lease_during_reconnect = False + await self._delete_http_bridge_lease(session.bridge_session_id) + raise + if preserve_lease_during_reconnect: + session.preserve_lease_during_reconnect = False _log_http_bridge_event( "reconnect", session.key, @@ -4735,6 +4754,7 @@ class _HTTPBridgeSession: downstream_turn_state: str | None = None downstream_turn_state_aliases: set[str] = field(default_factory=set) upstream_reader: asyncio.Task[None] | None = None + preserve_lease_during_reconnect: bool = False closed: bool = False diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 6f2aff52..91d09156 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2515,6 +2515,123 @@ async def fake_connect_responses_websocket( assert bridge_session.upstream_turn_state == "upstream_turn_state_2" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_reconnect_restart_reader_preserves_lease_until_touch( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_reconnect_lease_handoff", + "http-bridge-reconnect-lease-handoff@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + first_upstream = _FakeBridgeUpstreamWebSocket() + second_upstream = _FakeBridgeUpstreamWebSocket() + upstreams = [first_upstream, second_upstream] + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return upstreams.pop(0) + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + payload = proxy_module.ResponsesRequest.model_validate({"model": "gpt-5.1", "instructions": "hi", "input": []}) + affinity = proxy_module._AffinityPolicy( + key="reconnect-lease-handoff", + kind=proxy_module.StickySessionKind.PROMPT_CACHE, + ) + session = await service._get_or_create_http_bridge_session( + proxy_module._make_http_bridge_session_key( + payload, + headers={}, + affinity=affinity, + api_key=None, + request_id="req_reconnect_lease_handoff", + ), + headers={}, + affinity=affinity, + api_key=None, + request_model=payload.model, + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + call_order: list[str] = [] + + async def fake_delete_http_bridge_lease(self, session_id): + del self, session_id + call_order.append("delete") + + async def fake_touch_http_bridge_lease(self, session): + del self, session + call_order.append("touch") + + monkeypatch.setattr(proxy_module.ProxyService, "_delete_http_bridge_lease", fake_delete_http_bridge_lease) + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + + request_state = proxy_module._WebSocketRequestState( + request_id="req_reconnect_lease_restart", + model=payload.model, + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + await service._reconnect_http_bridge_session(session, request_state=request_state, restart_reader=True) + + assert call_order == ["touch"] + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_session_id_reconnect_keeps_upstream_turn_state( async_client, From e5ca1931b5172dd9e06f9737ce404a578e108bfc Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 13:01:54 +0000 Subject: [PATCH 13/34] fix(proxy): keep bridge leases alive per worker --- app/modules/proxy/service.py | 98 +++++++++++++-- .../integration/test_http_responses_bridge.py | 114 +++++++++++++++++- 2 files changed, 199 insertions(+), 13 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 81c9f4ea..af82c684 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -4,6 +4,7 @@ import inspect import json import logging +import os import time from collections import deque from collections.abc import Sequence @@ -1638,6 +1639,49 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: downstream_turn_state=session.downstream_turn_state, ) + async def _ensure_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") -> None: + task = getattr(session, "lease_keepalive_task", None) + if task is not None and not task.done(): + return + + async def _keepalive() -> None: + interval_seconds = max(1.0, min(session.idle_ttl_seconds / 2.0, 60.0)) + try: + while True: + await asyncio.sleep(interval_seconds) + if session.closed: + return + pending_count = await self._http_bridge_pending_count(session) + if pending_count <= 0: + return + try: + await self._touch_http_bridge_lease(session) + except Exception: + logger.warning( + "Failed to refresh HTTP bridge lease during active stream session_id=%s", + session.bridge_session_id, + exc_info=True, + ) + except asyncio.CancelledError: + raise + + session.lease_keepalive_task = asyncio.create_task(_keepalive()) + + async def _stop_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") -> None: + task = getattr(session, "lease_keepalive_task", None) + if task is None: + return + session.lease_keepalive_task = None + if task.done(): + return + task.cancel() + if task is asyncio.current_task(): + return + try: + await task + except asyncio.CancelledError: + pass + async def _get_or_create_http_bridge_session( self, key: "_HTTPBridgeSessionKey", @@ -1653,6 +1697,7 @@ async def _get_or_create_http_bridge_session( settings = get_settings() api_key_id = api_key.id if api_key is not None else None current_instance, ring = _normalized_http_bridge_instance_ring(settings) + current_owner = _http_bridge_current_owner_id(settings) effective_idle_ttl_seconds = _effective_http_bridge_idle_ttl_seconds( affinity=affinity, idle_ttl_seconds=idle_ttl_seconds, @@ -1671,8 +1716,12 @@ async def _get_or_create_http_bridge_session( ) if ( active_turn_state_lease is not None - and active_turn_state_lease.owner_instance_id in ring - and active_turn_state_lease.owner_instance_id != current_instance + and not _http_bridge_owner_matches_current( + active_turn_state_lease.owner_instance_id, + current_owner_id=current_owner, + current_instance_id=current_instance, + ) + and _http_bridge_owner_instance_group(active_turn_state_lease.owner_instance_id) in ring ): _log_http_bridge_event( "owner_mismatch", @@ -1691,7 +1740,7 @@ async def _get_or_create_http_bridge_session( "bridge_wrong_instance", ( "HTTP responses session bridge turn-state is owned by another live instance " - f"(expected {active_turn_state_lease.owner_instance_id}, got {current_instance})" + f"(expected {active_turn_state_lease.owner_instance_id}, got {current_owner})" ), error_type="server_error", ), @@ -1779,6 +1828,7 @@ async def _get_or_create_http_bridge_session( await self._prune_http_bridge_sessions_locked() current_instance, ring = _normalized_http_bridge_instance_ring(settings) + current_owner = _http_bridge_current_owner_id(settings) owner_instance = _http_bridge_owner_instance(lookup_key, settings) if ( not matched_turn_state_alias @@ -1955,10 +2005,10 @@ async def _get_or_create_http_bridge_session( "turn_state_header", self._encode_http_bridge_turn_state( session_id=created_session_id, - owner_instance_id=current_instance, - api_key_id=api_key_id, - ), - api_key_id, + owner_instance_id=current_owner, + api_key_id=api_key_id, + ), + api_key_id, ) create_affinity = _AffinityPolicy( key=session_key.affinity_key, @@ -1984,7 +2034,7 @@ async def _get_or_create_http_bridge_session( if accepts_extra_create_kwargs or "bridge_session_id" in create_signature.parameters: create_kwargs["bridge_session_id"] = created_session_id if accepts_extra_create_kwargs or "owner_instance_id" in create_signature.parameters: - create_kwargs["owner_instance_id"] = current_instance + create_kwargs["owner_instance_id"] = current_owner session = await create_session( session_key, **create_kwargs, @@ -2068,6 +2118,7 @@ async def _close_http_bridge_session( async def _close_session() -> None: session.closed = True + await self._stop_http_bridge_lease_keepalive(session) if turn_state_lock_held: self._unregister_http_bridge_turn_states_locked(session) else: @@ -2317,6 +2368,7 @@ async def _submit_http_bridge_request( async with session.pending_lock: session.pending_requests.append(request_state) request_enqueued = True + await self._ensure_http_bridge_lease_keepalive(session) await session.upstream.send_text(text_data) session.last_used_at = time.monotonic() except asyncio.CancelledError: @@ -2431,6 +2483,7 @@ async def _maybe_prewarm_http_bridge_session( async with session.pending_lock: session.pending_requests.append(warmup_state) request_enqueued = True + await self._ensure_http_bridge_lease_keepalive(session) await session.upstream.send_text(warmup_text) while True: event_block = await event_queue.get() @@ -2470,6 +2523,9 @@ async def _cleanup_http_bridge_submit_interruption( if request_enqueued and request_state in session.pending_requests: session.pending_requests.remove(request_state) session.queued_request_count = max(0, session.queued_request_count - 1) + has_pending_requests = bool(session.pending_requests) + if not has_pending_requests: + await self._stop_http_bridge_lease_keepalive(session) if gate_acquired: _release_websocket_response_create_gate(request_state, session.response_create_gate) @@ -2485,9 +2541,12 @@ async def _detach_http_bridge_request( session.pending_requests.remove(request_state) session.queued_request_count = max(0, session.queued_request_count - 1) removed = True + has_pending_requests = bool(session.pending_requests) request_state.event_queue = None if not removed: return False + if not has_pending_requests: + await self._stop_http_bridge_lease_keepalive(session) _release_websocket_response_create_gate(request_state, session.response_create_gate) await self._release_websocket_reservation(request_state.api_key_reservation) request_state.api_key_reservation = None @@ -2558,6 +2617,7 @@ async def _relay_http_bridge_upstream_messages( break finally: session.closed = True + await self._stop_http_bridge_lease_keepalive(session) if not session.preserve_lease_during_reconnect: await self._delete_http_bridge_lease(session.bridge_session_id) @@ -2794,6 +2854,9 @@ async def _process_http_bridge_upstream_text( if terminal_request_state is None: return + if await self._http_bridge_pending_count(session) <= 0: + await self._stop_http_bridge_lease_keepalive(session) + if terminal_request_state is not matched_request_state and terminal_request_state.event_queue is not None: await terminal_request_state.event_queue.put(event_block) if terminal_request_state.event_queue is not None: @@ -4754,6 +4817,7 @@ class _HTTPBridgeSession: downstream_turn_state: str | None = None downstream_turn_state_aliases: set[str] = field(default_factory=set) upstream_reader: asyncio.Task[None] | None = None + lease_keepalive_task: asyncio.Task[None] | None = None preserve_lease_during_reconnect: bool = False closed: bool = False @@ -5576,6 +5640,24 @@ def _normalized_http_bridge_instance_ring(settings: object) -> tuple[str, tuple[ return instance_id, tuple(sorted(set(ring_entries))) +def _http_bridge_current_owner_id(settings: object) -> str: + instance_id, _ = _normalized_http_bridge_instance_ring(settings) + return f"{instance_id}@{os.getpid()}" + + +def _http_bridge_owner_instance_group(owner_id: str) -> str: + return owner_id.split("@", 1)[0] + + +def _http_bridge_owner_matches_current( + owner_id: str, + *, + current_owner_id: str, + current_instance_id: str, +) -> bool: + return owner_id == current_owner_id or owner_id == current_instance_id + + def _http_bridge_owner_instance(key: _HTTPBridgeSessionKey, settings: object) -> str | None: instance_id, ring = _normalized_http_bridge_instance_ring(settings) if len(ring) <= 1: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 91d09156..10c1790d 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1273,7 +1273,7 @@ async def fake_connect_responses_websocket( recovered_token = service._decode_http_bridge_turn_state(session.key.affinity_key, api_key_id=None) assert recovered_token is not None assert recovered_token.session_id == session.bridge_session_id - assert recovered_token.owner_instance_id == "instance-a" + assert proxy_module._http_bridge_owner_instance_group(recovered_token.owner_instance_id) == "instance-a" assert connect_headers_seen[-1].get("x-codex-turn-state") is None async with SessionLocal() as db_session: @@ -1291,6 +1291,73 @@ async def fake_connect_responses_websocket( assert new_lease.affinity_key == session.key.affinity_key +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_worker_owner_mismatch", + "http-bridge-worker-owner-mismatch@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + session_id = "hbs_signed_worker_owner_mismatch" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a@worker-1", + api_key_id=None, + ) + + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@worker-2") + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a@worker-1", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"].get("code") == "bridge_wrong_instance" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_owner_mismatch_rekeys_recovered_session( async_client, @@ -1393,13 +1460,13 @@ async def fake_connect_responses_websocket( recovered_token = service._decode_http_bridge_turn_state(session.key.affinity_key, api_key_id=None) assert recovered_token is not None assert recovered_token.session_id == session.bridge_session_id - assert recovered_token.owner_instance_id == "instance-a" + assert proxy_module._http_bridge_owner_instance_group(recovered_token.owner_instance_id) == "instance-a" async with SessionLocal() as db_session: lease = ( await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id)) ).scalar_one() - assert lease.owner_instance_id == "instance-a" + assert proxy_module._http_bridge_owner_instance_group(lease.owner_instance_id) == "instance-a" assert lease.affinity_kind == "turn_state_header" assert lease.affinity_key == session.key.affinity_key @@ -1519,7 +1586,7 @@ async def fake_connect_responses_websocket( assert recovered.key.affinity_kind == "prompt_cache" assert recovered.key.affinity_key == "stale-owner-thread" assert recovered.bridge_session_id != stale_session_id - assert recovered.owner_instance_id == "instance-new" + assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-new" async with SessionLocal() as db_session: stale_lease = ( @@ -1532,7 +1599,7 @@ async def fake_connect_responses_websocket( ).scalar_one() assert stale_lease is None - assert new_lease.owner_instance_id == "instance-new" + assert proxy_module._http_bridge_owner_instance_group(new_lease.owner_instance_id) == "instance-new" assert new_lease.affinity_kind == "prompt_cache" assert new_lease.affinity_key == "stale-owner-thread" @@ -2207,6 +2274,43 @@ async def fake_touch_http_bridge_lease(self, session): assert not session.pending_requests +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_keeps_lease_alive_while_request_is_active(app_instance, monkeypatch): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + service = get_proxy_service_for_app(app_instance) + session = cast( + proxy_module._HTTPBridgeSession, + _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-keepalive", None)), + ) + session.bridge_session_id = "hbs_bridge_lease_keepalive" + session.idle_ttl_seconds = 2.0 + session.response_create_gate = asyncio.Semaphore(1) + request_state = proxy_module._WebSocketRequestState( + request_id="req_bridge_lease_keepalive", + model="gpt-5.1", + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + session.pending_requests.append(request_state) + session.queued_request_count = 1 + + touch_points: list[float] = [] + + async def fake_touch_http_bridge_lease(self, session): + del self + touch_points.append(session.last_used_at) + + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + + await service._ensure_http_bridge_lease_keepalive(session) + await asyncio.sleep(1.1) + await service._stop_http_bridge_lease_keepalive(session) + + assert touch_points + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_creation_closes_upstream_when_lease_persist_fails( async_client, From e02297d61947ab70c4198be1bfcd1f1d2b186ca3 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 13:13:04 +0000 Subject: [PATCH 14/34] fix(proxy): recover signed bridge replays after restart --- app/modules/proxy/service.py | 40 +++++- .../integration/test_http_responses_bridge.py | 129 +++++++++++++++++- 2 files changed, 166 insertions(+), 3 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index af82c684..9f61c137 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import errno import inspect import json import logging @@ -1833,6 +1834,7 @@ async def _get_or_create_http_bridge_session( if ( not matched_turn_state_alias and active_turn_state_lease is None + and turn_state_token is None and lookup_key.affinity_kind != "request" and owner_instance is not None and len(ring) > 1 @@ -5649,13 +5651,49 @@ def _http_bridge_owner_instance_group(owner_id: str) -> str: return owner_id.split("@", 1)[0] +def _http_bridge_owner_pid(owner_id: str) -> int | None: + owner_parts = owner_id.split("@", 1) + if len(owner_parts) != 2: + return None + try: + pid = int(owner_parts[1]) + except ValueError: + return None + return pid if pid > 0 else None + + +def _http_bridge_process_exists(pid: int) -> bool: + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + return True + except OSError as exc: + if exc.errno == errno.ESRCH: + return False + if exc.errno == errno.EPERM: + return True + return True + return True + + def _http_bridge_owner_matches_current( owner_id: str, *, current_owner_id: str, current_instance_id: str, ) -> bool: - return owner_id == current_owner_id or owner_id == current_instance_id + if owner_id == current_owner_id or owner_id == current_instance_id: + return True + if _http_bridge_owner_instance_group(owner_id) != current_instance_id: + return False + owner_pid = _http_bridge_owner_pid(owner_id) + if owner_pid is None: + return False + if owner_pid == os.getpid(): + return True + return not _http_bridge_process_exists(owner_pid) def _http_bridge_owner_instance(key: _HTTPBridgeSessionKey, settings: object) -> str | None: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 10c1790d..2dc1dfbb 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1291,6 +1291,124 @@ async def fake_connect_responses_websocket( assert new_lease.affinity_key == session.key.affinity_key +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_live_lease_from_restarted_worker_recovers_on_same_instance( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_restarted_worker_recovery", + "http-bridge-restarted-worker-recovery@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + session_id = "hbs_signed_restarted_worker" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a@111", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a@111", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert recovered.bridge_session_id != session_id + assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, @@ -5619,11 +5737,11 @@ async def test_v1_responses_http_bridge_singleflights_stale_signed_turn_state_re proxy_module._HTTPBridgeSessionKey("turn_state_header", candidate, None), settings, ) - == "instance-a" + == "instance-b" ) signed_turn_state = service._encode_http_bridge_turn_state( session_id=session_id, - owner_instance_id="instance-a", + owner_instance_id="instance-b", api_key_id=None, ) @@ -5681,6 +5799,13 @@ async def fake_create_http_bridge_session( assert len(create_started) == 1 assert session_one is session_two + assert ( + proxy_module._http_bridge_owner_instance( + proxy_module._HTTPBridgeSessionKey("turn_state_header", session_id, None), + settings, + ) + == "instance-b" + ) assert session_one.key.affinity_kind == "turn_state_header" assert session_one.key.affinity_key != signed_turn_state finally: From d4844aa42c5e4ee4be15ed2f26624f7d1c17ad9f Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 13:35:57 +0000 Subject: [PATCH 15/34] fix(proxy): preserve recovered bridge aliases --- app/modules/proxy/service.py | 50 +++++++++++++++---- .../integration/test_http_responses_bridge.py | 22 +++++++- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 9f61c137..56b00477 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1646,7 +1646,9 @@ async def _ensure_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession return async def _keepalive() -> None: - interval_seconds = max(1.0, min(session.idle_ttl_seconds / 2.0, 60.0)) + interval_seconds = min(session.idle_ttl_seconds / 2.0, 60.0) + if interval_seconds <= 0: + interval_seconds = 0.001 try: while True: await asyncio.sleep(interval_seconds) @@ -1778,11 +1780,21 @@ async def _get_or_create_http_bridge_session( if alias_session is not None: matched_turn_state_alias = True key = alias_key - self._promote_http_bridge_session_to_codex_affinity( - alias_session, - turn_state=incoming_turn_state, - settings=settings, - ) + if incoming_turn_state is not None and ( + turn_state_token is None + or self._http_bridge_turn_state_matches_session( + incoming_turn_state, + session=alias_session, + api_key_id=alias_session.key.api_key_id, + ) + ): + self._promote_http_bridge_session_to_codex_affinity( + alias_session, + turn_state=incoming_turn_state, + settings=settings, + ) + else: + alias_session.downstream_turn_state_aliases.add(incoming_turn_state) for alias in alias_session.downstream_turn_state_aliases: self._http_bridge_turn_state_index[ _http_bridge_turn_state_alias_key(alias, alias_session.key.api_key_id) @@ -1868,11 +1880,18 @@ async def _get_or_create_http_bridge_session( ) == existing.key ): - self._promote_http_bridge_session_to_codex_affinity( - existing, - turn_state=incoming_turn_state, - settings=settings, - ) + if turn_state_token is None or self._http_bridge_turn_state_matches_session( + incoming_turn_state, + session=existing, + api_key_id=existing.key.api_key_id, + ): + self._promote_http_bridge_session_to_codex_affinity( + existing, + turn_state=incoming_turn_state, + settings=settings, + ) + else: + existing.downstream_turn_state_aliases.add(incoming_turn_state) existing.request_model = request_model existing.last_used_at = time.monotonic() _log_http_bridge_event( @@ -2046,6 +2065,15 @@ async def _get_or_create_http_bridge_session( if current_future is inflight_future: self._http_bridge_inflight_sessions.pop(lookup_key, None) self._http_bridge_sessions[session_key] = session + if ( + rekey_recovered_turn_state + and incoming_turn_state is not None + and incoming_turn_state != session.key.affinity_key + ): + session.downstream_turn_state_aliases.add(incoming_turn_state) + self._http_bridge_turn_state_index[ + _http_bridge_turn_state_alias_key(incoming_turn_state, session.key.api_key_id) + ] = session.key session_registered = True if inflight_future is not None and not inflight_future.done(): inflight_future.set_result(session) diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 2dc1dfbb..24a7dd37 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1276,6 +1276,19 @@ async def fake_connect_responses_websocket( assert proxy_module._http_bridge_owner_instance_group(recovered_token.owner_instance_id) == "instance-a" assert connect_headers_seen[-1].get("x-codex-turn-state") is None + replayed = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + async with SessionLocal() as db_session: stale_lease = ( await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1286,6 +1299,9 @@ async def fake_connect_responses_websocket( ) ).scalar_one() + assert replayed is session + assert signed_turn_state in session.downstream_turn_state_aliases + assert connect_headers_seen and len(connect_headers_seen) == 1 assert stale_lease is None assert new_lease.affinity_kind == "turn_state_header" assert new_lease.affinity_key == session.key.affinity_key @@ -2401,7 +2417,7 @@ async def test_v1_responses_http_bridge_keeps_lease_alive_while_request_is_activ _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-keepalive", None)), ) session.bridge_session_id = "hbs_bridge_lease_keepalive" - session.idle_ttl_seconds = 2.0 + session.idle_ttl_seconds = 0.5 session.response_create_gate = asyncio.Semaphore(1) request_state = proxy_module._WebSocketRequestState( request_id="req_bridge_lease_keepalive", @@ -2415,15 +2431,17 @@ async def test_v1_responses_http_bridge_keeps_lease_alive_while_request_is_activ session.queued_request_count = 1 touch_points: list[float] = [] + touched = asyncio.Event() async def fake_touch_http_bridge_lease(self, session): del self touch_points.append(session.last_used_at) + touched.set() monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) await service._ensure_http_bridge_lease_keepalive(session) - await asyncio.sleep(1.1) + await asyncio.wait_for(touched.wait(), timeout=0.4) await service._stop_http_bridge_lease_keepalive(session) assert touch_points From 527fa75a041eea76131bba2ce1348a62c7baec1c Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 13:48:59 +0000 Subject: [PATCH 16/34] fix(proxy): invalidate bridge sessions on lease loss --- app/modules/proxy/service.py | 95 ++++++++-- .../integration/test_http_responses_bridge.py | 164 ++++++++++++++++++ 2 files changed, 241 insertions(+), 18 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 56b00477..297da3fe 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -364,10 +364,9 @@ async def _stream_via_http_bridge( try: await self._touch_http_bridge_lease(session) except Exception: - logger.warning( - "Failed to persist HTTP bridge lease after request detach session_id=%s", - session.bridge_session_id, - exc_info=True, + await self._invalidate_http_bridge_session_after_lease_failure( + session, + failure_message="Failed to persist HTTP bridge lease after request detach session_id=%s", ) async def compact_responses( @@ -1640,6 +1639,26 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: downstream_turn_state=session.downstream_turn_state, ) + async def _invalidate_http_bridge_session_after_lease_failure( + self, + session: "_HTTPBridgeSession", + *, + failure_message: str, + ) -> None: + logger.warning( + failure_message, + session.bridge_session_id, + exc_info=True, + ) + try: + await self._close_http_bridge_session(session) + except Exception: + logger.warning( + "Failed to invalidate HTTP bridge session after lease persistence failure session_id=%s", + session.bridge_session_id, + exc_info=True, + ) + async def _ensure_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") -> None: task = getattr(session, "lease_keepalive_task", None) if task is not None and not task.done(): @@ -1660,11 +1679,13 @@ async def _keepalive() -> None: try: await self._touch_http_bridge_lease(session) except Exception: - logger.warning( + await self._invalidate_http_bridge_session_after_lease_failure( + session, + failure_message=( "Failed to refresh HTTP bridge lease during active stream session_id=%s", - session.bridge_session_id, - exc_info=True, + ), ) + return except asyncio.CancelledError: raise @@ -2201,10 +2222,9 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", try: await self._touch_http_bridge_lease(session) except Exception: - logger.warning( - "Failed to persist HTTP bridge lease after turn-state registration session_id=%s", - session.bridge_session_id, - exc_info=True, + await self._invalidate_http_bridge_session_after_lease_failure( + session, + failure_message="Failed to persist HTTP bridge lease after turn-state registration session_id=%s", ) async def _unregister_http_bridge_turn_states(self, session: "_HTTPBridgeSession") -> None: @@ -2801,10 +2821,9 @@ async def _reconnect_http_bridge_session( try: await self._touch_http_bridge_lease(session) except Exception: - logger.warning( - "Failed to persist HTTP bridge lease after reconnect session_id=%s", - session.bridge_session_id, - exc_info=True, + await self._invalidate_http_bridge_session_after_lease_failure( + session, + failure_message="Failed to persist HTTP bridge lease after reconnect session_id=%s", ) if restart_reader: session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) @@ -5672,7 +5691,11 @@ def _normalized_http_bridge_instance_ring(settings: object) -> tuple[str, tuple[ def _http_bridge_current_owner_id(settings: object) -> str: instance_id, _ = _normalized_http_bridge_instance_ring(settings) - return f"{instance_id}@{os.getpid()}" + pid = os.getpid() + process_marker = _http_bridge_process_start_marker(pid) + if process_marker is None: + return f"{instance_id}@{pid}" + return f"{instance_id}@{pid}:{process_marker}" def _http_bridge_owner_instance_group(owner_id: str) -> str: @@ -5683,13 +5706,41 @@ def _http_bridge_owner_pid(owner_id: str) -> int | None: owner_parts = owner_id.split("@", 1) if len(owner_parts) != 2: return None + pid_text, _, _ = owner_parts[1].partition(":") try: - pid = int(owner_parts[1]) + pid = int(pid_text) except ValueError: return None return pid if pid > 0 else None +def _http_bridge_owner_process_marker(owner_id: str) -> str | None: + owner_parts = owner_id.split("@", 1) + if len(owner_parts) != 2: + return None + _, separator, process_marker = owner_parts[1].partition(":") + if not separator or not process_marker: + return None + return process_marker + + +def _http_bridge_process_start_marker(pid: int) -> str | None: + try: + with open(f"/proc/{pid}/stat", encoding="utf-8") as process_stat: + payload = process_stat.read().strip() + except OSError: + return None + try: + _, stat_tail = payload.rsplit(") ", 1) + except ValueError: + return None + stat_fields = stat_tail.split() + if len(stat_fields) <= 19: + return None + process_marker = stat_fields[19].strip() + return process_marker or None + + def _http_bridge_process_exists(pid: int) -> bool: try: os.kill(pid, 0) @@ -5721,7 +5772,15 @@ def _http_bridge_owner_matches_current( return False if owner_pid == os.getpid(): return True - return not _http_bridge_process_exists(owner_pid) + owner_process_marker = _http_bridge_owner_process_marker(owner_id) + if owner_process_marker is None: + return not _http_bridge_process_exists(owner_pid) + live_process_marker = _http_bridge_process_start_marker(owner_pid) + if live_process_marker is None: + return True + if live_process_marker != owner_process_marker: + return True + return False def _http_bridge_owner_instance(key: _HTTPBridgeSessionKey, settings: object) -> str | None: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 24a7dd37..a45607cb 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1425,6 +1425,128 @@ async def fake_connect_responses_websocket( assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_live_lease_from_reused_pid_recovers_on_same_instance( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_reused_pid_recovery", + "http-bridge-reused-pid-recovery@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + session_id = "hbs_signed_reused_pid_worker" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a@111:old-start", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222:current-start") + monkeypatch.setattr( + proxy_module, + "_http_bridge_process_start_marker", + lambda pid: "reused-start" if pid == 111 else None, + ) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a@111:old-start", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert recovered.bridge_session_id != session_id + assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, @@ -2447,6 +2569,48 @@ async def fake_touch_http_bridge_lease(self, session): assert touch_points +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_keepalive_refresh_failure_closes_session(app_instance, monkeypatch): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + service = get_proxy_service_for_app(app_instance) + session = cast( + proxy_module._HTTPBridgeSession, + _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-keepalive-failure", None)), + ) + session.bridge_session_id = "hbs_bridge_lease_keepalive_failure" + session.idle_ttl_seconds = 0.5 + session.response_create_gate = asyncio.Semaphore(1) + request_state = proxy_module._WebSocketRequestState( + request_id="req_bridge_lease_keepalive_failure", + model="gpt-5.1", + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + session.pending_requests.append(request_state) + session.queued_request_count = 1 + + closed = asyncio.Event() + + async def fake_touch_http_bridge_lease(self, session): + del self, session + raise RuntimeError("lease touch failed") + + async def fake_close_http_bridge_session(self, session): + del self + session.closed = True + closed.set() + + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + monkeypatch.setattr(proxy_module.ProxyService, "_close_http_bridge_session", fake_close_http_bridge_session) + + await service._ensure_http_bridge_lease_keepalive(session) + await asyncio.wait_for(closed.wait(), timeout=0.4) + + assert session.closed is True + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_creation_closes_upstream_when_lease_persist_fails( async_client, From 437dda2d66bd937fb20389d00536a2d9bcfeb118 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 14:06:08 +0000 Subject: [PATCH 17/34] fix(proxy): align bridge recovery and lease state --- app/modules/proxy/bridge_repository.py | 8 ++ app/modules/proxy/service.py | 6 +- .../integration/test_http_responses_bridge.py | 97 +++++++++++++++++++ 3 files changed, 110 insertions(+), 1 deletion(-) diff --git a/app/modules/proxy/bridge_repository.py b/app/modules/proxy/bridge_repository.py index 28aaa75a..b0fcd7e6 100644 --- a/app/modules/proxy/bridge_repository.py +++ b/app/modules/proxy/bridge_repository.py @@ -73,6 +73,10 @@ async def touch( self, session_id: str, *, + affinity_kind: str, + affinity_key: str, + api_key_scope: str, + owner_instance_id: str, lease_expires_at: datetime, account_id: str | None, request_model: str | None, @@ -87,6 +91,10 @@ async def touch( update(HttpBridgeLease) .where(HttpBridgeLease.session_id == session_id) .values( + affinity_kind=affinity_kind, + affinity_key=affinity_key, + api_key_scope=api_key_scope, + owner_instance_id=owner_instance_id, lease_expires_at=to_utc_naive(lease_expires_at), account_id=account_id, request_model=request_model, diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 297da3fe..9d92ca42 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1613,6 +1613,10 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: async with self._repo_factory() as repos: touched = await repos.http_bridge_leases.touch( session.bridge_session_id, + affinity_kind=session.key.affinity_kind, + affinity_key=session.key.affinity_key, + api_key_scope=_http_bridge_api_key_scope(session.key.api_key_id), + owner_instance_id=session.owner_instance_id, lease_expires_at=_http_bridge_lease_expires_at(session.idle_ttl_seconds), account_id=session.account.id, request_model=session.request_model, @@ -5461,7 +5465,7 @@ def _headers_without_local_http_bridge_turn_state(headers: Mapping[str, str]) -> continue if isinstance(value, str): stripped = value.strip() - if stripped.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX) or stripped.startswith("http_turn_"): + if stripped.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX): forwarded.pop(key, None) break return forwarded diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index a45607cb..95885761 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -894,6 +894,95 @@ async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_preserves_pre } +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_unsigned_legacy_turn_state_recovery_forwards_upstream_token( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_legacy_rebuild_forwarding", + "http-bridge-legacy-rebuild-forwarding@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + connect_headers_seen: list[dict[str, str]] = [] + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del access_token, account_id_header, base_url, session + connect_headers_seen.append(dict(headers)) + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + legacy_turn_state = "http_turn_legacy_rebuild_forwarding" + session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", legacy_turn_state, None), + headers={"x-codex-turn-state": legacy_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=legacy_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + assert session.key.affinity_kind == "turn_state_header" + assert session.key.affinity_key == legacy_turn_state + assert connect_headers_seen[-1]["x-codex-turn-state"] == legacy_turn_state + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_replayed_turn_state_alias_preserves_owner_and_promotes_session( async_client, @@ -1019,6 +1108,14 @@ async def fake_connect_responses_websocket( ) await service._register_http_bridge_turn_state(session, replay_turn_state) replay_key = proxy_module._HTTPBridgeSessionKey("turn_state_header", replay_turn_state, None) + async with SessionLocal() as db_session: + lease = ( + await db_session.execute( + select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id) + ) + ).scalar_one() + assert lease.affinity_kind == "turn_state_header" + assert lease.affinity_key == replay_turn_state assert ( service._http_bridge_turn_state_index[ proxy_module._http_bridge_turn_state_alias_key(replay_turn_state, session.key.api_key_id) From 8a29fca94196e5bc5fcff26d6e176afb5c70bd1c Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 14:49:03 +0000 Subject: [PATCH 18/34] fix(proxy): unify bridge invalidation lifecycle --- app/modules/proxy/service.py | 103 +++++++++++--- .../integration/test_http_responses_bridge.py | 131 ++++++++++++++++-- 2 files changed, 204 insertions(+), 30 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 9d92ca42..235446b3 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1655,7 +1655,12 @@ async def _invalidate_http_bridge_session_after_lease_failure( exc_info=True, ) try: - await self._close_http_bridge_session(session) + await self._close_http_bridge_session( + session, + fail_pending_requests=True, + error_code="upstream_unavailable", + error_message="HTTP bridge session became unavailable", + ) except Exception: logger.warning( "Failed to invalidate HTTP bridge session after lease persistence failure session_id=%s", @@ -1663,6 +1668,53 @@ async def _invalidate_http_bridge_session_after_lease_failure( exc_info=True, ) + async def _fail_pending_http_bridge_requests( + self, + session: "_HTTPBridgeSession", + *, + error_code: str, + error_message: str, + error_type: str = "server_error", + ) -> None: + async with session.pending_lock: + remaining = list(session.pending_requests) + session.pending_requests.clear() + session.queued_request_count = 0 + + for request_state in remaining: + _release_websocket_response_create_gate(request_state, session.response_create_gate) + if request_state.event_queue is not None: + await request_state.event_queue.put( + format_sse_event( + response_failed_event( + request_state.error_code_override or error_code, + request_state.error_message_override or error_message, + error_type=request_state.error_type_override or error_type, + response_id=request_state.response_id or request_state.request_id, + error_param=request_state.error_param_override, + ) + ) + ) + await request_state.event_queue.put(None) + await self._release_websocket_reservation(request_state.api_key_reservation) + request_state.api_key_reservation = None + if session.account.id and not request_state.skip_request_log: + await self._write_request_log( + account_id=session.account.id, + api_key=request_state.api_key, + request_id=request_state.response_id or request_state.request_log_id or request_state.request_id, + model=request_state.model or "", + latency_ms=int((time.monotonic() - request_state.started_at) * 1000), + status="error", + error_code=request_state.error_code_override or error_code, + error_message=request_state.error_message_override or error_message, + reasoning_effort=request_state.reasoning_effort, + transport=request_state.transport, + service_tier=request_state.service_tier, + requested_service_tier=request_state.requested_service_tier, + actual_service_tier=request_state.actual_service_tier, + ) + async def _ensure_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") -> None: task = getattr(session, "lease_keepalive_task", None) if task is not None and not task.done(): @@ -1685,9 +1737,7 @@ async def _keepalive() -> None: except Exception: await self._invalidate_http_bridge_session_after_lease_failure( session, - failure_message=( - "Failed to refresh HTTP bridge lease during active stream session_id=%s", - ), + failure_message="Failed to refresh HTTP bridge lease during active stream session_id=%s", ) return except asyncio.CancelledError: @@ -1702,9 +1752,9 @@ async def _stop_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") session.lease_keepalive_task = None if task.done(): return - task.cancel() if task is asyncio.current_task(): return + task.cancel() try: await task except asyncio.CancelledError: @@ -2168,22 +2218,32 @@ async def _close_http_bridge_session( session: "_HTTPBridgeSession", *, turn_state_lock_held: bool = False, + fail_pending_requests: bool = False, + error_code: str = "upstream_unavailable", + error_message: str = "HTTP bridge session became unavailable", ) -> None: lease_lock = getattr(session, "lease_lock", None) async def _close_session() -> None: session.closed = True await self._stop_http_bridge_lease_keepalive(session) + if fail_pending_requests: + await self._fail_pending_http_bridge_requests( + session, + error_code=error_code, + error_message=error_message, + ) if turn_state_lock_held: self._unregister_http_bridge_turn_states_locked(session) else: await self._unregister_http_bridge_turn_states(session) if session.upstream_reader is not None: session.upstream_reader.cancel() - try: - await session.upstream_reader - except asyncio.CancelledError: - pass + if session.upstream_reader is not asyncio.current_task(): + try: + await session.upstream_reader + except asyncio.CancelledError: + pass try: await session.upstream.close() except Exception: @@ -2205,6 +2265,7 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", async with self._http_bridge_lock: if session.closed: return + session.reconnect_turn_state = turn_state session.downstream_turn_state_aliases.add(turn_state) if self._http_bridge_turn_state_matches_session( turn_state, @@ -2266,6 +2327,7 @@ def _promote_http_bridge_session_to_codex_affinity( session.affinity = _AffinityPolicy(key=turn_state, kind=StickySessionKind.CODEX_SESSION) session.codex_session = True session.downstream_turn_state = turn_state + session.reconnect_turn_state = turn_state session.downstream_turn_state_aliases.add(turn_state) session.idle_ttl_seconds = max( session.idle_ttl_seconds, @@ -2338,6 +2400,8 @@ async def _create_http_bridge_session( _raise_proxy_unavailable(exc.message or "Temporary upstream refresh failure") except (aiohttp.ClientError, asyncio.TimeoutError) as exc: _raise_proxy_unavailable(str(exc) or "Request to upstream timed out") + echoed_turn_state = _upstream_turn_state_from_socket(upstream) + reconnect_turn_state = echoed_turn_state or _sticky_key_from_turn_state_header(connect_headers) session = _HTTPBridgeSession( key=key, headers=connect_headers, @@ -2357,7 +2421,8 @@ async def _create_http_bridge_session( owner_instance_id=owner_instance_id, codex_session=affinity.kind == StickySessionKind.CODEX_SESSION, prewarm_lock=anyio.Lock(), - upstream_turn_state=_upstream_turn_state_from_socket(upstream), + upstream_turn_state=echoed_turn_state, + reconnect_turn_state=reconnect_turn_state, downstream_turn_state=None, ) try: @@ -2807,9 +2872,10 @@ async def _reconnect_http_bridge_session( ), ) account = await self._ensure_fresh_with_budget(account, timeout_seconds=_remaining_budget_seconds(deadline)) + preferred_turn_state = _preferred_http_bridge_reconnect_turn_state(session) connect_headers = _headers_with_turn_state( session.headers, - _preferred_http_bridge_reconnect_turn_state(session), + preferred_turn_state, ) new_upstream = await self._open_upstream_websocket_with_budget( account, @@ -2821,7 +2887,9 @@ async def _reconnect_http_bridge_session( session.upstream = new_upstream session.upstream_control = _WebSocketUpstreamControl() session.closed = False - session.upstream_turn_state = _upstream_turn_state_from_socket(new_upstream) or session.upstream_turn_state + echoed_turn_state = _upstream_turn_state_from_socket(new_upstream) + session.upstream_turn_state = echoed_turn_state or session.upstream_turn_state + session.reconnect_turn_state = echoed_turn_state or preferred_turn_state try: await self._touch_http_bridge_lease(session) except Exception: @@ -4867,6 +4935,7 @@ class _HTTPBridgeSession: prewarmed: bool = False prewarm_lock: anyio.Lock | None = None upstream_turn_state: str | None = None + reconnect_turn_state: str | None = None downstream_turn_state: str | None = None downstream_turn_state_aliases: set[str] = field(default_factory=set) upstream_reader: asyncio.Task[None] | None = None @@ -5474,15 +5543,7 @@ def _headers_without_local_http_bridge_turn_state(headers: Mapping[str, str]) -> def _preferred_http_bridge_reconnect_turn_state(session: "_HTTPBridgeSession") -> str | None: if session.upstream_turn_state is not None: return session.upstream_turn_state - if ( - session.codex_session - and session.downstream_turn_state is not None - and not session.downstream_turn_state.startswith(_HTTP_BRIDGE_TURN_STATE_PREFIX) - and session.affinity.kind == StickySessionKind.CODEX_SESSION - and session.affinity.key == session.downstream_turn_state - ): - return session.downstream_turn_state - return None + return session.reconnect_turn_state def _http_bridge_turn_state_alias_key(turn_state: str, api_key_id: str | None) -> tuple[str, str | None]: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 95885761..7484ab21 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2684,28 +2684,34 @@ async def test_v1_responses_http_bridge_keepalive_refresh_failure_closes_session reasoning_effort=None, api_key_reservation=None, started_at=time.monotonic(), + event_queue=asyncio.Queue(), ) session.pending_requests.append(request_state) session.queued_request_count = 1 - closed = asyncio.Event() - async def fake_touch_http_bridge_lease(self, session): del self, session raise RuntimeError("lease touch failed") - async def fake_close_http_bridge_session(self, session): - del self - session.closed = True - closed.set() + async def fake_write_request_log(self, **kwargs): + del self, kwargs + return None + + async def fake_release_websocket_reservation(self, reservation): + del self, reservation + return None monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) - monkeypatch.setattr(proxy_module.ProxyService, "_close_http_bridge_session", fake_close_http_bridge_session) + monkeypatch.setattr(proxy_module.ProxyService, "_write_request_log", fake_write_request_log) + monkeypatch.setattr(proxy_module.ProxyService, "_release_websocket_reservation", fake_release_websocket_reservation) await service._ensure_http_bridge_lease_keepalive(session) - await asyncio.wait_for(closed.wait(), timeout=0.4) - + failed_event = await asyncio.wait_for(request_state.event_queue.get(), timeout=1.0) + assert proxy_module.parse_sse_data_json(failed_event)["type"] == "response.failed" + assert await asyncio.wait_for(request_state.event_queue.get(), timeout=1.0) is None + await asyncio.sleep(0) assert session.closed is True + assert not session.pending_requests @pytest.mark.asyncio @@ -3016,6 +3022,113 @@ async def fake_connect_responses_websocket( assert bridge_session.upstream_turn_state == "upstream_turn_state_2" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_reconnect_preserves_signed_turn_state_when_handshake_is_silent( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_signed_reconnect_fallback", + "http-bridge-signed-reconnect-fallback@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + connect_headers_seen: list[dict[str, str]] = [] + upstreams = [_FakeBridgeUpstreamWebSocket(), _FakeBridgeUpstreamWebSocket()] + stale_signed_turn_state = service._encode_http_bridge_turn_state( + session_id="hbs_signed_reconnect_fallback_stale", + owner_instance_id="instance-a", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del access_token, account_id_header, base_url, session + connect_headers_seen.append(dict(headers)) + return upstreams.pop(0) + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + bridge_session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", stale_signed_turn_state, None), + headers={"x-codex-turn-state": stale_signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=stale_signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=8, + ) + await service._register_http_bridge_turn_state(bridge_session, bridge_session.key.affinity_key) + + request_state = proxy_module._WebSocketRequestState( + request_id="req-signed-turn-state-reconnect", + model="gpt-5.1", + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + awaiting_response_created=True, + request_text=json.dumps({"type": "response.create", "model": "gpt-5.1", "input": []}), + ) + await service._reconnect_http_bridge_session(bridge_session, request_state=request_state) + + assert "x-codex-turn-state" not in connect_headers_seen[0] + assert connect_headers_seen[1]["x-codex-turn-state"] == bridge_session.key.affinity_key + assert bridge_session.upstream_turn_state is None + assert bridge_session.reconnect_turn_state == bridge_session.key.affinity_key + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_reconnect_restart_reader_preserves_lease_until_touch( async_client, From 107b5e88fd39994321d03070ea534f450a7880d1 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 14:54:56 +0000 Subject: [PATCH 19/34] fix(proxy): restore repo bundle compatibility --- app/modules/proxy/repo_bundle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/modules/proxy/repo_bundle.py b/app/modules/proxy/repo_bundle.py index b7fe552a..a30406e9 100644 --- a/app/modules/proxy/repo_bundle.py +++ b/app/modules/proxy/repo_bundle.py @@ -18,9 +18,9 @@ class ProxyRepositories: usage: UsageRepository request_logs: RequestLogsRepository sticky_sessions: StickySessionsRepository - http_bridge_leases: HttpBridgeLeasesRepository api_keys: ApiKeysRepository additional_usage: AdditionalUsageRepository + http_bridge_leases: HttpBridgeLeasesRepository | None = None ProxyRepoFactory = Callable[[], AsyncContextManager[ProxyRepositories]] From 7c40c44d025fe0ecc658e652d063d6adb7e0cfe5 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 15:08:41 +0000 Subject: [PATCH 20/34] fix(proxy): guard bridge lease expiry and reconnect failure --- app/modules/proxy/bridge_repository.py | 11 ++ app/modules/proxy/service.py | 9 +- .../integration/test_http_responses_bridge.py | 179 ++++++++++++++++++ 3 files changed, 198 insertions(+), 1 deletion(-) diff --git a/app/modules/proxy/bridge_repository.py b/app/modules/proxy/bridge_repository.py index b0fcd7e6..762a713c 100644 --- a/app/modules/proxy/bridge_repository.py +++ b/app/modules/proxy/bridge_repository.py @@ -69,6 +69,17 @@ async def delete(self, session_id: str) -> bool: await self._session.commit() return result.scalar_one_or_none() is not None + async def delete_if_expires_at(self, session_id: str, *, lease_expires_at: datetime) -> bool: + if not session_id: + return False + statement = delete(HttpBridgeLease).where( + HttpBridgeLease.session_id == session_id, + HttpBridgeLease.lease_expires_at == to_utc_naive(lease_expires_at), + ) + result = await self._session.execute(statement.returning(HttpBridgeLease.session_id)) + await self._session.commit() + return result.scalar_one_or_none() is not None + async def touch( self, session_id: str, diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 235446b3..b6d30881 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1571,7 +1571,10 @@ async def _get_live_http_bridge_lease( if lease is None: return None if to_utc_naive(lease.lease_expires_at) < utcnow(): - await repos.http_bridge_leases.delete(session_id) + await repos.http_bridge_leases.delete_if_expires_at( + session_id, + lease_expires_at=lease.lease_expires_at, + ) return None return _HTTPBridgeLeaseSnapshot( session_id=lease.session_id, @@ -2897,6 +2900,10 @@ async def _reconnect_http_bridge_session( session, failure_message="Failed to persist HTTP bridge lease after reconnect session_id=%s", ) + raise ProxyResponseError( + 502, + openai_error("upstream_unavailable", "HTTP bridge session became unavailable"), + ) if restart_reader: session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) except BaseException: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 7484ab21..4d82e0f4 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1644,6 +1644,70 @@ async def fake_connect_responses_websocket( assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_live_lease_lookup_does_not_delete_concurrently_refreshed_row( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_live_lease_race", + "http-bridge-live-lease-race@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + session_id = "hbs_bridge_live_lease_race" + original_expiry = proxy_module.utcnow() - timedelta(seconds=1) + refreshed_expiry = proxy_module.utcnow() + timedelta(seconds=120) + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key="signed-state", + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=original_expiry, + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state="signed-state", + ) + lease = await repos.http_bridge_leases.get_by_session_id(session_id) + assert lease is not None + stale_expiry = lease.lease_expires_at + await repos.http_bridge_leases.touch( + session_id, + affinity_kind="turn_state_header", + affinity_key="signed-state", + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=refreshed_expiry, + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state="signed-state", + ) + deleted = await repos.http_bridge_leases.delete_if_expires_at( + session_id, + lease_expires_at=stale_expiry, + ) + async with SessionLocal() as db_session: + remaining = ( + await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + ).scalar_one_or_none() + + assert deleted is False + assert remaining is not None + assert proxy_module.to_utc_naive(remaining.lease_expires_at) == proxy_module.to_utc_naive(refreshed_expiry) + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, @@ -3246,6 +3310,121 @@ async def fake_touch_http_bridge_lease(self, session): assert call_order == ["touch"] +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_reconnect_aborts_after_lease_refresh_failure( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_reconnect_lease_failure", + "http-bridge-reconnect-lease-failure@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + first_upstream = _FakeBridgeUpstreamWebSocket() + second_upstream = _FakeBridgeUpstreamWebSocket() + upstreams = [first_upstream, second_upstream] + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return upstreams.pop(0) + + async def failing_touch_http_bridge_lease(self, session): + del self, session + raise RuntimeError("lease touch failed") + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", failing_touch_http_bridge_lease) + + payload = proxy_module.ResponsesRequest.model_validate({"model": "gpt-5.1", "instructions": "hi", "input": []}) + affinity = proxy_module._AffinityPolicy( + key="reconnect-lease-failure", + kind=proxy_module.StickySessionKind.PROMPT_CACHE, + ) + session = await service._get_or_create_http_bridge_session( + proxy_module._make_http_bridge_session_key( + payload, + headers={}, + affinity=affinity, + api_key=None, + request_id="req_reconnect_lease_failure", + ), + headers={}, + affinity=affinity, + api_key=None, + request_model=payload.model, + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + request_state = proxy_module._WebSocketRequestState( + request_id="req_reconnect_lease_failure_retry", + model=payload.model, + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._reconnect_http_bridge_session(session, request_state=request_state, restart_reader=True) + + exc = exc_info.value + assert exc.status_code == 502 + assert exc.payload["error"].get("code") == "upstream_unavailable" + assert session.closed is True + assert session.upstream_reader is None or session.upstream_reader.done() + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_session_id_reconnect_keeps_upstream_turn_state( async_client, From 7a4198381c692e781455a1b353e9723fb49530c9 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 15:29:30 +0000 Subject: [PATCH 21/34] fix(proxy): scope bridge ownership to replicas --- app/modules/proxy/service.py | 80 +--------------- .../integration/test_http_responses_bridge.py | 93 +++++++++++++------ 2 files changed, 69 insertions(+), 104 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index b6d30881..141de17e 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1,11 +1,9 @@ from __future__ import annotations import asyncio -import errno import inspect import json import logging -import os import time from collections import deque from collections.abc import Sequence @@ -5763,72 +5761,13 @@ def _normalized_http_bridge_instance_ring(settings: object) -> tuple[str, tuple[ def _http_bridge_current_owner_id(settings: object) -> str: instance_id, _ = _normalized_http_bridge_instance_ring(settings) - pid = os.getpid() - process_marker = _http_bridge_process_start_marker(pid) - if process_marker is None: - return f"{instance_id}@{pid}" - return f"{instance_id}@{pid}:{process_marker}" + return instance_id def _http_bridge_owner_instance_group(owner_id: str) -> str: return owner_id.split("@", 1)[0] -def _http_bridge_owner_pid(owner_id: str) -> int | None: - owner_parts = owner_id.split("@", 1) - if len(owner_parts) != 2: - return None - pid_text, _, _ = owner_parts[1].partition(":") - try: - pid = int(pid_text) - except ValueError: - return None - return pid if pid > 0 else None - - -def _http_bridge_owner_process_marker(owner_id: str) -> str | None: - owner_parts = owner_id.split("@", 1) - if len(owner_parts) != 2: - return None - _, separator, process_marker = owner_parts[1].partition(":") - if not separator or not process_marker: - return None - return process_marker - - -def _http_bridge_process_start_marker(pid: int) -> str | None: - try: - with open(f"/proc/{pid}/stat", encoding="utf-8") as process_stat: - payload = process_stat.read().strip() - except OSError: - return None - try: - _, stat_tail = payload.rsplit(") ", 1) - except ValueError: - return None - stat_fields = stat_tail.split() - if len(stat_fields) <= 19: - return None - process_marker = stat_fields[19].strip() - return process_marker or None - - -def _http_bridge_process_exists(pid: int) -> bool: - try: - os.kill(pid, 0) - except ProcessLookupError: - return False - except PermissionError: - return True - except OSError as exc: - if exc.errno == errno.ESRCH: - return False - if exc.errno == errno.EPERM: - return True - return True - return True - - def _http_bridge_owner_matches_current( owner_id: str, *, @@ -5837,22 +5776,7 @@ def _http_bridge_owner_matches_current( ) -> bool: if owner_id == current_owner_id or owner_id == current_instance_id: return True - if _http_bridge_owner_instance_group(owner_id) != current_instance_id: - return False - owner_pid = _http_bridge_owner_pid(owner_id) - if owner_pid is None: - return False - if owner_pid == os.getpid(): - return True - owner_process_marker = _http_bridge_owner_process_marker(owner_id) - if owner_process_marker is None: - return not _http_bridge_process_exists(owner_pid) - live_process_marker = _http_bridge_process_start_marker(owner_pid) - if live_process_marker is None: - return True - if live_process_marker != owner_process_marker: - return True - return False + return _http_bridge_owner_instance_group(owner_id) == current_instance_id def _http_bridge_owner_instance(key: _HTTPBridgeSessionKey, settings: object) -> str | None: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 4d82e0f4..195b3e15 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1482,8 +1482,6 @@ async def fake_connect_responses_websocket( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) - monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") - monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1600,12 +1598,6 @@ async def fake_connect_responses_websocket( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) - monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222:current-start") - monkeypatch.setattr( - proxy_module, - "_http_bridge_process_start_marker", - lambda pid: "reused-start" if pid == 111 else None, - ) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1709,7 +1701,7 @@ async def test_v1_responses_http_bridge_live_lease_lookup_does_not_delete_concur @pytest.mark.asyncio -async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( +async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_recovers_within_same_instance( async_client, app_instance, monkeypatch, @@ -1727,6 +1719,7 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo ) account = await _get_account(account_id) service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() session_id = "hbs_signed_worker_owner_mismatch" signed_turn_state = service._encode_http_bridge_turn_state( session_id=session_id, @@ -1734,7 +1727,57 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo api_key_id=None, ) - monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@worker-2") + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1756,23 +1799,21 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo downstream_turn_state=signed_turn_state, ) - with pytest.raises(proxy_module.ProxyResponseError) as exc_info: - await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), - headers={"x-codex-turn-state": signed_turn_state}, - affinity=proxy_module._AffinityPolicy( - key=signed_turn_state, - kind=proxy_module.StickySessionKind.CODEX_SESSION, - ), - api_key=None, - request_model="gpt-5.1", - idle_ttl_seconds=120.0, - max_sessions=128, - ) + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) - exc = exc_info.value - assert exc.status_code == 409 - assert exc.payload["error"].get("code") == "bridge_wrong_instance" + assert recovered.bridge_session_id != session_id + assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" @pytest.mark.asyncio From d819475f9776572e0ef7e36611c41767cb4d22d6 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 15:42:45 +0000 Subject: [PATCH 22/34] fix(proxy): split bridge replay and worker ownership --- app/modules/proxy/service.py | 142 +++++++--- .../integration/test_http_responses_bridge.py | 243 +++++++++++++----- 2 files changed, 284 insertions(+), 101 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 141de17e..a60c7944 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1,9 +1,11 @@ from __future__ import annotations import asyncio +import errno import inspect import json import logging +import os import time from collections import deque from collections.abc import Sequence @@ -1793,37 +1795,6 @@ async def _get_or_create_http_bridge_session( active_turn_state_lease = await self._get_live_http_bridge_lease( turn_state_token.session_id if turn_state_token is not None else None ) - if ( - active_turn_state_lease is not None - and not _http_bridge_owner_matches_current( - active_turn_state_lease.owner_instance_id, - current_owner_id=current_owner, - current_instance_id=current_instance, - ) - and _http_bridge_owner_instance_group(active_turn_state_lease.owner_instance_id) in ring - ): - _log_http_bridge_event( - "owner_mismatch", - key, - account_id=active_turn_state_lease.account_id, - model=request_model, - detail=( - f"lease_session_id={active_turn_state_lease.session_id}, " - f"expected_instance={active_turn_state_lease.owner_instance_id}, " - f"current_instance={current_instance}" - ), - ) - raise ProxyResponseError( - 409, - openai_error( - "bridge_wrong_instance", - ( - "HTTP responses session bridge turn-state is owned by another live instance " - f"(expected {active_turn_state_lease.owner_instance_id}, got {current_owner})" - ), - error_type="server_error", - ), - ) created_session_id = self._new_http_bridge_session_id() while True: sessions_to_close: list[_HTTPBridgeSession] = [] @@ -1878,6 +1849,37 @@ async def _get_or_create_http_bridge_session( key = alias_session.key lookup_key = key session_key = key + elif ( + active_turn_state_lease is not None + and not _http_bridge_owner_matches_current( + active_turn_state_lease.owner_instance_id, + current_owner_id=current_owner, + current_instance_id=current_instance, + ) + and _http_bridge_owner_instance_group(active_turn_state_lease.owner_instance_id) in ring + ): + _log_http_bridge_event( + "owner_mismatch", + key, + account_id=active_turn_state_lease.account_id, + model=request_model, + detail=( + f"lease_session_id={active_turn_state_lease.session_id}, " + f"expected_instance={active_turn_state_lease.owner_instance_id}, " + f"current_instance={current_instance}" + ), + ) + raise ProxyResponseError( + 409, + openai_error( + "bridge_wrong_instance", + ( + "HTTP responses session bridge turn-state is owned by another live instance " + f"(expected {active_turn_state_lease.owner_instance_id}, got {current_owner})" + ), + error_type="server_error", + ), + ) elif turn_state_token is not None: recovered_turn_state_replay = True stale_turn_state_lease_session_id = ( @@ -5761,13 +5763,72 @@ def _normalized_http_bridge_instance_ring(settings: object) -> tuple[str, tuple[ def _http_bridge_current_owner_id(settings: object) -> str: instance_id, _ = _normalized_http_bridge_instance_ring(settings) - return instance_id + pid = os.getpid() + process_marker = _http_bridge_process_start_marker(pid) + if process_marker is None: + return f"{instance_id}@{pid}" + return f"{instance_id}@{pid}:{process_marker}" def _http_bridge_owner_instance_group(owner_id: str) -> str: return owner_id.split("@", 1)[0] +def _http_bridge_owner_pid(owner_id: str) -> int | None: + owner_parts = owner_id.split("@", 1) + if len(owner_parts) != 2: + return None + pid_text, _, _ = owner_parts[1].partition(":") + try: + pid = int(pid_text) + except ValueError: + return None + return pid if pid > 0 else None + + +def _http_bridge_owner_process_marker(owner_id: str) -> str | None: + owner_parts = owner_id.split("@", 1) + if len(owner_parts) != 2: + return None + _, separator, process_marker = owner_parts[1].partition(":") + if not separator or not process_marker: + return None + return process_marker + + +def _http_bridge_process_start_marker(pid: int) -> str | None: + try: + with open(f"/proc/{pid}/stat", encoding="utf-8") as process_stat: + payload = process_stat.read().strip() + except OSError: + return None + try: + _, stat_tail = payload.rsplit(") ", 1) + except ValueError: + return None + stat_fields = stat_tail.split() + if len(stat_fields) <= 19: + return None + process_marker = stat_fields[19].strip() + return process_marker or None + + +def _http_bridge_process_exists(pid: int) -> bool: + try: + os.kill(pid, 0) + except ProcessLookupError: + return False + except PermissionError: + return True + except OSError as exc: + if exc.errno == errno.ESRCH: + return False + if exc.errno == errno.EPERM: + return True + return True + return True + + def _http_bridge_owner_matches_current( owner_id: str, *, @@ -5776,7 +5837,22 @@ def _http_bridge_owner_matches_current( ) -> bool: if owner_id == current_owner_id or owner_id == current_instance_id: return True - return _http_bridge_owner_instance_group(owner_id) == current_instance_id + if _http_bridge_owner_instance_group(owner_id) != current_instance_id: + return False + owner_pid = _http_bridge_owner_pid(owner_id) + if owner_pid is None: + return False + if owner_pid == os.getpid(): + return True + owner_process_marker = _http_bridge_owner_process_marker(owner_id) + if owner_process_marker is None: + return not _http_bridge_process_exists(owner_pid) + live_process_marker = _http_bridge_process_start_marker(owner_pid) + if live_process_marker is None: + return True + if live_process_marker != owner_process_marker: + return True + return False def _http_bridge_owner_instance(key: _HTTPBridgeSessionKey, settings: object) -> str | None: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 195b3e15..3b02c37c 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1330,6 +1330,8 @@ async def fake_connect_responses_websocket( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1404,6 +1406,152 @@ async def fake_connect_responses_websocket( assert new_lease.affinity_key == session.key.affinity_key +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_replay_prefers_local_alias_over_stale_lease_owner( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_stale_alias_preferred", + "http-bridge-stale-alias-preferred@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + connect_headers_seen: list[dict[str, str]] = [] + session_id = "hbs_signed_stale_alias_preferred" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a@111", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del access_token, account_id_header, base_url, session + connect_headers_seen.append(dict(headers)) + return fake_upstream + + async def flaky_delete_http_bridge_lease(self, stale_session_id): + del self + if stale_session_id == session_id: + raise RuntimeError("stale delete failed") + return None + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module.ProxyService, "_delete_http_bridge_lease", flaky_delete_http_bridge_lease) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a@111", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + session = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + replayed = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + async with SessionLocal() as db_session: + stale_lease = ( + await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + ).scalar_one_or_none() + + assert replayed is session + assert stale_lease is not None + assert signed_turn_state in session.downstream_turn_state_aliases + assert len(connect_headers_seen) == 1 + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_from_restarted_worker_recovers_on_same_instance( async_client, @@ -1482,6 +1630,8 @@ async def fake_connect_responses_websocket( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1598,6 +1748,12 @@ async def fake_connect_responses_websocket( monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222:current-start") + monkeypatch.setattr( + proxy_module, + "_http_bridge_process_start_marker", + lambda pid: "reused-start" if pid == 111 else None, + ) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1701,7 +1857,7 @@ async def test_v1_responses_http_bridge_live_lease_lookup_does_not_delete_concur @pytest.mark.asyncio -async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_recovers_within_same_instance( +async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, app_instance, monkeypatch, @@ -1719,65 +1875,14 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo ) account = await _get_account(account_id) service = get_proxy_service_for_app(app_instance) - fake_upstream = _FakeBridgeUpstreamWebSocket() session_id = "hbs_signed_worker_owner_mismatch" signed_turn_state = service._encode_http_bridge_turn_state( session_id=session_id, owner_instance_id="instance-a@worker-1", api_key_id=None, ) - - async def fake_select_account_with_budget( - self, - deadline, - *, - request_id, - kind, - sticky_key, - sticky_kind, - reallocate_sticky, - sticky_max_age_seconds, - prefer_earlier_reset_accounts, - routing_strategy, - model, - exclude_account_ids=None, - additional_limit_name=None, - ): - del ( - self, - deadline, - request_id, - kind, - sticky_key, - sticky_kind, - reallocate_sticky, - sticky_max_age_seconds, - prefer_earlier_reset_accounts, - routing_strategy, - model, - exclude_account_ids, - additional_limit_name, - ) - return AccountSelection(account=account, error_message=None, error_code=None) - - async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): - del self, force, timeout_seconds - return target - - async def fake_connect_responses_websocket( - headers, - access_token, - account_id_header, - *, - base_url=None, - session=None, - ): - del headers, access_token, account_id_header, base_url, session - return fake_upstream - - monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) - monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) - monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@worker-2") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: True) async with SessionLocal() as db_session: await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) @@ -1799,21 +1904,23 @@ async def fake_connect_responses_websocket( downstream_turn_state=signed_turn_state, ) - recovered = await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), - headers={"x-codex-turn-state": signed_turn_state}, - affinity=proxy_module._AffinityPolicy( - key=signed_turn_state, - kind=proxy_module.StickySessionKind.CODEX_SESSION, - ), - api_key=None, - request_model="gpt-5.1", - idle_ttl_seconds=120.0, - max_sessions=128, - ) + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) - assert recovered.bridge_session_id != session_id - assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-a" + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"].get("code") == "bridge_wrong_instance" @pytest.mark.asyncio From fb293ad15100b041a23c0713f4318fed1bd0c9e4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 15:58:57 +0000 Subject: [PATCH 23/34] fix(proxy): fail closed on unreadable bridge owners --- app/modules/proxy/service.py | 2 +- .../integration/test_http_responses_bridge.py | 68 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index a60c7944..88679de9 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -5849,7 +5849,7 @@ def _http_bridge_owner_matches_current( return not _http_bridge_process_exists(owner_pid) live_process_marker = _http_bridge_process_start_marker(owner_pid) if live_process_marker is None: - return True + return not _http_bridge_process_exists(owner_pid) if live_process_marker != owner_process_marker: return True return False diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 3b02c37c..b4003dfe 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1923,6 +1923,74 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo assert exc.payload["error"].get("code") == "bridge_wrong_instance" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_signed_turn_state_live_peer_with_unreadable_marker_is_wrong_instance( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_worker_owner_unreadable_marker", + "http-bridge-worker-owner-unreadable-marker@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + session_id = "hbs_signed_worker_owner_unreadable_marker" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a@111:old-start", + api_key_id=None, + ) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222:current-start") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: True) + monkeypatch.setattr(proxy_module, "_http_bridge_process_start_marker", lambda pid: None) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a@111:old-start", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"].get("code") == "bridge_wrong_instance" + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_owner_mismatch_rekeys_recovered_session( async_client, From c46418abdbe578aae835f306632f1b88f364ee3e Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 16:08:09 +0000 Subject: [PATCH 24/34] fix(proxy): preserve bridge continuity across replay races --- app/modules/proxy/service.py | 10 ++- .../integration/test_http_responses_bridge.py | 80 +++++++++++++++++++ 2 files changed, 87 insertions(+), 3 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 88679de9..f34c9531 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1571,11 +1571,15 @@ async def _get_live_http_bridge_lease( if lease is None: return None if to_utc_naive(lease.lease_expires_at) < utcnow(): - await repos.http_bridge_leases.delete_if_expires_at( + deleted = await repos.http_bridge_leases.delete_if_expires_at( session_id, lease_expires_at=lease.lease_expires_at, ) - return None + if deleted: + return None + lease = await repos.http_bridge_leases.get_by_session_id(session_id) + if lease is None or to_utc_naive(lease.lease_expires_at) < utcnow(): + return None return _HTTPBridgeLeaseSnapshot( session_id=lease.session_id, affinity_kind=lease.affinity_kind, @@ -2144,7 +2148,7 @@ async def _get_or_create_http_bridge_session( self._http_bridge_inflight_sessions.pop(lookup_key, None) self._http_bridge_sessions[session_key] = session if ( - rekey_recovered_turn_state + recovered_turn_state_replay and incoming_turn_state is not None and incoming_turn_state != session.key.affinity_key ): diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index b4003dfe..a7757ba4 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1856,6 +1856,71 @@ async def test_v1_responses_http_bridge_live_lease_lookup_does_not_delete_concur assert proxy_module.to_utc_naive(remaining.lease_expires_at) == proxy_module.to_utc_naive(refreshed_expiry) +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_live_lease_lookup_rereads_after_refresh_wins_delete_race( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_live_lease_reread", + "http-bridge-live-lease-reread@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + session_id = "hbs_bridge_live_lease_reread" + original_expiry = proxy_module.utcnow() - timedelta(seconds=1) + refreshed_expiry = proxy_module.utcnow() + timedelta(seconds=120) + original_delete_if_expires_at = HttpBridgeLeasesRepository.delete_if_expires_at + + async def fake_delete_if_expires_at(self, session_id_arg, *, lease_expires_at): + row = await self.get_by_session_id(session_id_arg) + assert row is not None + await self.touch( + session_id_arg, + affinity_kind=row.affinity_kind, + affinity_key=row.affinity_key, + api_key_scope=row.api_key_scope, + owner_instance_id=row.owner_instance_id, + lease_expires_at=refreshed_expiry, + account_id=row.account_id, + request_model=row.request_model, + codex_session=row.codex_session, + idle_ttl_seconds=row.idle_ttl_seconds, + upstream_turn_state=row.upstream_turn_state, + downstream_turn_state=row.downstream_turn_state, + ) + return False + + monkeypatch.setattr(HttpBridgeLeasesRepository, "delete_if_expires_at", fake_delete_if_expires_at) + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key="signed-state", + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=original_expiry, + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state="signed-state", + ) + + snapshot = await service._get_live_http_bridge_lease(session_id) + + monkeypatch.setattr(HttpBridgeLeasesRepository, "delete_if_expires_at", original_delete_if_expires_at) + + assert snapshot is not None + assert snapshot.session_id == session_id + assert proxy_module.to_utc_naive(snapshot.lease_expires_at) == proxy_module.to_utc_naive(refreshed_expiry) + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, @@ -2357,6 +2422,20 @@ async def fake_connect_responses_websocket( kind=proxy_module.StickySessionKind.PROMPT_CACHE, max_age_seconds=300, ) + assert signed_turn_state in recovered.downstream_turn_state_aliases + + replayed = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) reused = await service._get_or_create_http_bridge_session( proxy_module._HTTPBridgeSessionKey("prompt_cache", "stable-affinity-thread", None), @@ -2383,6 +2462,7 @@ async def fake_connect_responses_websocket( ) await service._reconnect_http_bridge_session(recovered, request_state=request_state) + assert replayed is recovered assert reused is recovered assert connect_count == 2 assert sticky_selections == [ From ddcd1e948a602610e26bddde22da27975e4c6eb6 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 16:20:01 +0000 Subject: [PATCH 25/34] fix(proxy): fail turn-state registration on lease loss --- app/modules/proxy/service.py | 4 + .../integration/test_http_responses_bridge.py | 84 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index f34c9531..5726dc28 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -2298,6 +2298,10 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", session, failure_message="Failed to persist HTTP bridge lease after turn-state registration session_id=%s", ) + raise ProxyResponseError( + 502, + openai_error("upstream_unavailable", "HTTP bridge session became unavailable"), + ) async def _unregister_http_bridge_turn_states(self, session: "_HTTPBridgeSession") -> None: async with self._http_bridge_lock: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index a7757ba4..680b9884 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2987,6 +2987,90 @@ async def fake_touch_http_bridge_lease(self, session): assert not session.pending_requests +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_turn_state_registration_failure_does_not_emit_dead_header( + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + service = get_proxy_service_for_app(app_instance) + payload = proxy_module.ResponsesRequest.model_validate({"model": "gpt-5.1", "instructions": "hi", "input": []}) + session = cast( + proxy_module._HTTPBridgeSession, + _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "register-turn-state-failure", None)), + ) + session.bridge_session_id = "hbs_register_turn_state_failure" + session.response_create_gate = asyncio.Semaphore(1) + session.account = SimpleNamespace(id="acc_register_turn_state_failure", status=AccountStatus.ACTIVE) # type: ignore[assignment] + response_headers_out: dict[str, str] = {} + + def fake_prepare_http_bridge_request(self, *args, **kwargs): + del self, args, kwargs + return ( + proxy_module._WebSocketRequestState( + request_id="req_register_turn_state_failure", + model="gpt-5.1", + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + event_queue=asyncio.Queue(), + ), + json.dumps({"type": "response.create", "model": "gpt-5.1", "input": []}), + ) + + async def fake_get_or_create_http_bridge_session(self, *args, **kwargs): + del self, args, kwargs + return session + + async def fake_submit_http_bridge_request(self, session_arg, *, request_state, text_data, queue_limit): + del session_arg, text_data, queue_limit + await request_state.event_queue.put('data: {"type":"response.completed"}\n\n') + await request_state.event_queue.put(None) + + def fake_resolve_http_bridge_downstream_turn_state(self, session_arg, *, requested_turn_state, api_key_id): + del self, session_arg, requested_turn_state, api_key_id + return "http_turn_dead_header" + + async def failing_touch_http_bridge_lease(self, session_arg): + del self, session_arg + raise RuntimeError("lease touch failed") + + monkeypatch.setattr(proxy_module.ProxyService, "_prepare_http_bridge_request", fake_prepare_http_bridge_request) + monkeypatch.setattr(proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session) + monkeypatch.setattr(proxy_module.ProxyService, "_submit_http_bridge_request", fake_submit_http_bridge_request) + monkeypatch.setattr( + proxy_module.ProxyService, + "_resolve_http_bridge_downstream_turn_state", + fake_resolve_http_bridge_downstream_turn_state, + ) + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", failing_touch_http_bridge_lease) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + async for _ in service._stream_via_http_bridge( + payload, + {}, + codex_session_affinity=False, + propagate_http_errors=False, + openai_cache_affinity=False, + api_key=None, + api_key_reservation=None, + suppress_text_done_events=False, + idle_ttl_seconds=120.0, + codex_idle_ttl_seconds=120.0, + max_sessions=8, + queue_limit=8, + response_headers_out=response_headers_out, + ): + pass + + exc = exc_info.value + assert exc.status_code == 502 + assert exc.payload["error"].get("code") == "upstream_unavailable" + assert "x-codex-turn-state" not in response_headers_out + assert session.closed is True + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_keeps_lease_alive_while_request_is_active(app_instance, monkeypatch): _install_bridge_settings_with_limits(monkeypatch, enabled=True) From c4eeff1103dd92eecdcf43a6a4df411ae98896fb Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 16:34:24 +0000 Subject: [PATCH 26/34] fix(proxy): canonicalize signed bridge turn states --- app/modules/proxy/service.py | 6 ++- .../integration/test_http_responses_bridge.py | 49 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 5726dc28..76ff0efe 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -2279,7 +2279,11 @@ async def _register_http_bridge_turn_state(self, session: "_HTTPBridgeSession", session=session, api_key_id=session.key.api_key_id, ): - if session.downstream_turn_state is None: + if session.downstream_turn_state is None or not self._http_bridge_turn_state_matches_session( + session.downstream_turn_state, + session=session, + api_key_id=session.key.api_key_id, + ): session.downstream_turn_state = turn_state else: self._promote_http_bridge_session_to_codex_affinity( diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 680b9884..33bbeeff 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2785,6 +2785,55 @@ async def fake_connect_responses_websocket( await service._close_http_bridge_session(session) +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_legacy_replay_converges_to_signed_canonical_turn_state( + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + service = get_proxy_service_for_app(app_instance) + session = cast( + proxy_module._HTTPBridgeSession, + _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "legacy-canonical-convergence", None)), + ) + session.bridge_session_id = "hbs_legacy_canonical_convergence" + session.owner_instance_id = "instance-a" + + async def fake_touch_http_bridge_lease(self, session_arg): + del self, session_arg + return None + + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + + await service._register_http_bridge_turn_state(session, "http_turn_legacy_client") + + signed_turn_state = service._resolve_http_bridge_downstream_turn_state( + session, + requested_turn_state="http_turn_legacy_client", + api_key_id=None, + ) + await service._register_http_bridge_turn_state(session, signed_turn_state) + + signed_turn_state_repeat = service._resolve_http_bridge_downstream_turn_state( + session, + requested_turn_state="http_turn_legacy_client", + api_key_id=None, + ) + await service._register_http_bridge_turn_state(session, signed_turn_state_repeat) + + assert signed_turn_state_repeat == signed_turn_state + assert session.downstream_turn_state == signed_turn_state + assert session.downstream_turn_state_aliases == {"http_turn_legacy_client", signed_turn_state} + assert service._http_bridge_turn_state_index[ + proxy_module._http_bridge_turn_state_alias_key("http_turn_legacy_client", session.key.api_key_id) + ] == session.key + assert service._http_bridge_turn_state_index[ + proxy_module._http_bridge_turn_state_alias_key(signed_turn_state, session.key.api_key_id) + ] == session.key + + await service._close_http_bridge_session(session) + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_close_waits_for_turn_state_index_lock( async_client, From add96526adcf8232ea1ae0903ac0363bf70ec6cb Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 16:50:58 +0000 Subject: [PATCH 27/34] fix(proxy): claim bridge leases by stable affinity --- ...e_http_bridge_lease_affinity_uniqueness.py | 73 ++++++++++++ app/db/models.py | 7 ++ app/modules/proxy/bridge_repository.py | 110 +++++++++++++++++- app/modules/proxy/service.py | 49 ++++++-- .../integration/test_http_responses_bridge.py | 76 ++++++++++++ 5 files changed, 307 insertions(+), 8 deletions(-) create mode 100644 app/db/alembic/versions/20260325_120000_enforce_http_bridge_lease_affinity_uniqueness.py diff --git a/app/db/alembic/versions/20260325_120000_enforce_http_bridge_lease_affinity_uniqueness.py b/app/db/alembic/versions/20260325_120000_enforce_http_bridge_lease_affinity_uniqueness.py new file mode 100644 index 00000000..b552d3d6 --- /dev/null +++ b/app/db/alembic/versions/20260325_120000_enforce_http_bridge_lease_affinity_uniqueness.py @@ -0,0 +1,73 @@ +"""enforce http bridge lease affinity uniqueness + +Revision ID: 20260325_120000_enforce_http_bridge_lease_affinity_uniqueness +Revises: 20260322_000000_merge_http_bridge_lease_head +Create Date: 2026-03-25 +""" + +from __future__ import annotations + +import sqlalchemy as sa +from alembic import op +from sqlalchemy.engine import Connection + +# revision identifiers, used by Alembic. +revision = "20260325_120000_enforce_http_bridge_lease_affinity_uniqueness" +down_revision = "20260322_000000_merge_http_bridge_lease_head" +branch_labels = None +depends_on = None + + +def _table_exists(connection: Connection, table_name: str) -> bool: + inspector = sa.inspect(connection) + return inspector.has_table(table_name) + + +def _index_exists(connection: Connection, index_name: str, table_name: str) -> bool: + inspector = sa.inspect(connection) + if not inspector.has_table(table_name): + return False + return any(index["name"] == index_name for index in inspector.get_indexes(table_name)) + + +def upgrade() -> None: + bind = op.get_bind() + if not _table_exists(bind, "http_bridge_leases"): + return + op.execute( + sa.text( + """ + DELETE FROM http_bridge_leases + WHERE session_id IN ( + SELECT session_id + FROM ( + SELECT + session_id, + ROW_NUMBER() OVER ( + PARTITION BY affinity_kind, affinity_key, api_key_scope + ORDER BY lease_expires_at DESC, updated_at DESC, created_at DESC, session_id DESC + ) AS row_num + FROM http_bridge_leases + ) ranked_leases + WHERE ranked_leases.row_num > 1 + ) + """ + ) + ) + if not _index_exists(bind, "ux_http_bridge_leases_affinity_scope", "http_bridge_leases"): + op.create_index( + "ux_http_bridge_leases_affinity_scope", + "http_bridge_leases", + ["affinity_kind", "affinity_key", "api_key_scope"], + unique=True, + ) + + +def downgrade() -> None: + bind = op.get_bind() + if _table_exists(bind, "http_bridge_leases") and _index_exists( + bind, + "ux_http_bridge_leases_affinity_scope", + "http_bridge_leases", + ): + op.drop_index("ux_http_bridge_leases_affinity_scope", table_name="http_bridge_leases") diff --git a/app/db/models.py b/app/db/models.py index af48475c..63263679 100644 --- a/app/db/models.py +++ b/app/db/models.py @@ -160,6 +160,13 @@ class StickySession(Base): class HttpBridgeLease(Base): __tablename__ = "http_bridge_leases" __table_args__ = ( + Index( + "ux_http_bridge_leases_affinity_scope", + "affinity_kind", + "affinity_key", + "api_key_scope", + unique=True, + ), Index("ix_http_bridge_leases_owner_expires", "owner_instance_id", "lease_expires_at"), Index("ix_http_bridge_leases_expires", "lease_expires_at"), ) diff --git a/app/modules/proxy/bridge_repository.py b/app/modules/proxy/bridge_repository.py index 762a713c..4c67b5e7 100644 --- a/app/modules/proxy/bridge_repository.py +++ b/app/modules/proxy/bridge_repository.py @@ -2,7 +2,7 @@ from datetime import datetime -from sqlalchemy import delete, select, update +from sqlalchemy import delete, or_, select, update from sqlalchemy.dialects.postgresql import insert as pg_insert from sqlalchemy.dialects.sqlite import insert as sqlite_insert from sqlalchemy.ext.asyncio import AsyncSession @@ -61,6 +61,51 @@ async def upsert( await self._session.refresh(row) return row + async def claim( + self, + *, + session_id: str, + affinity_kind: str, + affinity_key: str, + api_key_scope: str, + owner_instance_id: str, + lease_expires_at: datetime, + account_id: str | None, + request_model: str | None, + codex_session: bool, + idle_ttl_seconds: float, + upstream_turn_state: str | None, + downstream_turn_state: str | None, + replace_session_id: str | None, + expires_before: datetime, + ) -> HttpBridgeLease | None: + statement = self._build_claim_statement( + session_id=session_id, + affinity_kind=affinity_kind, + affinity_key=affinity_key, + api_key_scope=api_key_scope, + owner_instance_id=owner_instance_id, + lease_expires_at=lease_expires_at, + account_id=account_id, + request_model=request_model, + codex_session=codex_session, + idle_ttl_seconds=idle_ttl_seconds, + upstream_turn_state=upstream_turn_state, + downstream_turn_state=downstream_turn_state, + replace_session_id=replace_session_id, + expires_before=expires_before, + ) + result = await self._session.execute(statement.returning(HttpBridgeLease.session_id)) + await self._session.commit() + claimed_session_id = result.scalar_one_or_none() + if claimed_session_id != session_id: + return None + row = await self.get_by_session_id(session_id) + if row is None: + raise RuntimeError(f"HttpBridgeLease claim failed for session_id={session_id!r}") + await self._session.refresh(row) + return row + async def delete(self, session_id: str) -> bool: if not session_id: return False @@ -182,3 +227,66 @@ def _build_upsert_statement( "updated_at": func.now(), }, ) + + def _build_claim_statement( + self, + *, + session_id: str, + affinity_kind: str, + affinity_key: str, + api_key_scope: str, + owner_instance_id: str, + lease_expires_at: datetime, + account_id: str | None, + request_model: str | None, + codex_session: bool, + idle_ttl_seconds: float, + upstream_turn_state: str | None, + downstream_turn_state: str | None, + replace_session_id: str | None, + expires_before: datetime, + ) -> Insert: + dialect = self._session.get_bind().dialect.name + if dialect == "postgresql": + insert_fn = pg_insert + elif dialect == "sqlite": + insert_fn = sqlite_insert + else: + raise RuntimeError(f"HttpBridgeLease claim unsupported for dialect={dialect!r}") + statement = insert_fn(HttpBridgeLease).values( + session_id=session_id, + affinity_kind=affinity_kind, + affinity_key=affinity_key, + api_key_scope=api_key_scope, + owner_instance_id=owner_instance_id, + lease_expires_at=to_utc_naive(lease_expires_at), + account_id=account_id, + request_model=request_model, + codex_session=codex_session, + idle_ttl_seconds=idle_ttl_seconds, + upstream_turn_state=upstream_turn_state, + downstream_turn_state=downstream_turn_state, + ) + replace_condition = HttpBridgeLease.lease_expires_at < to_utc_naive(expires_before) + if replace_session_id is not None: + replace_condition = or_(replace_condition, HttpBridgeLease.session_id == replace_session_id) + return statement.on_conflict_do_update( + index_elements=[ + HttpBridgeLease.affinity_kind, + HttpBridgeLease.affinity_key, + HttpBridgeLease.api_key_scope, + ], + set_={ + "session_id": session_id, + "owner_instance_id": owner_instance_id, + "lease_expires_at": to_utc_naive(lease_expires_at), + "account_id": account_id, + "request_model": request_model, + "codex_session": codex_session, + "idle_ttl_seconds": idle_ttl_seconds, + "upstream_turn_state": upstream_turn_state, + "downstream_turn_state": downstream_turn_state, + "updated_at": func.now(), + }, + where=replace_condition, + ) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 76ff0efe..31bd5cb2 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -108,6 +108,10 @@ logger = logging.getLogger(__name__) + +class _HTTPBridgeLeaseClaimLost(RuntimeError): + pass + _TEXT_DELTA_EVENT_TYPES = frozenset({"response.output_text.delta", "response.refusal.delta"}) _TEXT_DONE_CONTENT_PART_TYPES = frozenset({"output_text", "refusal"}) _REQUEST_TRANSPORT_HTTP = "http" @@ -1597,8 +1601,16 @@ async def _delete_http_bridge_lease(self, session_id: str | None) -> None: await repos.http_bridge_leases.delete(session_id) async def _persist_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: + await self._claim_http_bridge_lease(session, replace_session_id=None) + + async def _claim_http_bridge_lease( + self, + session: "_HTTPBridgeSession", + *, + replace_session_id: str | None, + ) -> None: async with self._repo_factory() as repos: - await repos.http_bridge_leases.upsert( + claimed = await repos.http_bridge_leases.claim( session_id=session.bridge_session_id, affinity_kind=session.key.affinity_kind, affinity_key=session.key.affinity_key, @@ -1611,6 +1623,12 @@ async def _persist_http_bridge_lease(self, session: "_HTTPBridgeSession") -> Non idle_ttl_seconds=session.idle_ttl_seconds, upstream_turn_state=session.upstream_turn_state, downstream_turn_state=session.downstream_turn_state, + replace_session_id=replace_session_id, + expires_before=utcnow(), + ) + if claimed is None: + raise _HTTPBridgeLeaseClaimLost( + f"HTTP bridge lease claim lost for affinity={session.key.affinity_kind}:{session.key.affinity_key}" ) async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: @@ -1635,7 +1653,7 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: if not touched: if session.closed: return - await repos.http_bridge_leases.upsert( + claimed = await repos.http_bridge_leases.claim( session_id=session.bridge_session_id, affinity_kind=session.key.affinity_kind, affinity_key=session.key.affinity_key, @@ -1648,7 +1666,13 @@ async def _touch_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: idle_ttl_seconds=session.idle_ttl_seconds, upstream_turn_state=session.upstream_turn_state, downstream_turn_state=session.downstream_turn_state, + replace_session_id=session.bridge_session_id, + expires_before=utcnow(), ) + if claimed is None: + raise _HTTPBridgeLeaseClaimLost( + "HTTP bridge lease claim lost while recreating missing lease row" + ) async def _invalidate_http_bridge_session_after_lease_failure( self, @@ -1877,10 +1901,7 @@ async def _get_or_create_http_bridge_session( 409, openai_error( "bridge_wrong_instance", - ( - "HTTP responses session bridge turn-state is owned by another live instance " - f"(expected {active_turn_state_lease.owner_instance_id}, got {current_owner})" - ), + "HTTP responses session bridge turn-state is owned by another live instance", error_type="server_error", ), ) @@ -2138,6 +2159,8 @@ async def _get_or_create_http_bridge_session( create_kwargs["bridge_session_id"] = created_session_id if accepts_extra_create_kwargs or "owner_instance_id" in create_signature.parameters: create_kwargs["owner_instance_id"] = current_owner + if accepts_extra_create_kwargs or "replaced_bridge_session_id" in create_signature.parameters: + create_kwargs["replaced_bridge_session_id"] = stale_turn_state_lease_session_id session = await create_session( session_key, **create_kwargs, @@ -2172,6 +2195,17 @@ async def _get_or_create_http_bridge_session( inflight_future.exception() if session is not None and not session_registered: await self._close_http_bridge_session(session) + if isinstance(exc, _HTTPBridgeLeaseClaimLost): + return await self._get_or_create_http_bridge_session( + key, + headers=headers, + affinity=affinity, + api_key=api_key, + request_model=request_model, + idle_ttl_seconds=idle_ttl_seconds, + max_sessions=max_sessions, + previous_response_id=previous_response_id, + ) raise _log_http_bridge_event( "create", @@ -2359,6 +2393,7 @@ async def _create_http_bridge_session( idle_ttl_seconds: float, bridge_session_id: str, owner_instance_id: str, + replaced_bridge_session_id: str | None = None, ) -> "_HTTPBridgeSession": request_state = _WebSocketRequestState( request_id=f"http_bridge_connect_{uuid4().hex}", @@ -2441,7 +2476,7 @@ async def _create_http_bridge_session( downstream_turn_state=None, ) try: - await self._persist_http_bridge_lease(session) + await self._claim_http_bridge_lease(session, replace_session_id=replaced_bridge_session_id) except BaseException: session.closed = True try: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 33bbeeff..098b9cf9 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1921,6 +1921,81 @@ async def fake_delete_if_expires_at(self, session_id_arg, *, lease_expires_at): assert proxy_module.to_utc_naive(snapshot.lease_expires_at) == proxy_module.to_utc_naive(refreshed_expiry) +@pytest.mark.asyncio +async def test_http_bridge_leases_claim_allows_only_one_stale_replacement(): + stale_expiry = proxy_module.utcnow() + timedelta(seconds=120) + + async with SessionLocal() as session: + repo = HttpBridgeLeasesRepository(session) + await repo.upsert( + session_id="hbs_stale_original", + affinity_kind="prompt_cache", + affinity_key="stable-claim-key", + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=stale_expiry, + account_id=None, + request_model="gpt-5.1", + codex_session=False, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=None, + ) + + async with SessionLocal() as session_one: + repo_one = HttpBridgeLeasesRepository(session_one) + claimed_one = await repo_one.claim( + session_id="hbs_claim_one", + affinity_kind="prompt_cache", + affinity_key="stable-claim-key", + api_key_scope="", + owner_instance_id="instance-a@worker-1", + lease_expires_at=stale_expiry, + account_id=None, + request_model="gpt-5.1", + codex_session=False, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=None, + replace_session_id="hbs_stale_original", + expires_before=proxy_module.utcnow(), + ) + + async with SessionLocal() as session_two: + repo_two = HttpBridgeLeasesRepository(session_two) + claimed_two = await repo_two.claim( + session_id="hbs_claim_two", + affinity_kind="prompt_cache", + affinity_key="stable-claim-key", + api_key_scope="", + owner_instance_id="instance-a@worker-2", + lease_expires_at=stale_expiry, + account_id=None, + request_model="gpt-5.1", + codex_session=False, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=None, + replace_session_id="hbs_stale_original", + expires_before=proxy_module.utcnow(), + ) + + assert (claimed_one is None) != (claimed_two is None) + + async with SessionLocal() as session: + lease = ( + await session.execute( + select(HttpBridgeLease).where( + HttpBridgeLease.affinity_kind == "prompt_cache", + HttpBridgeLease.affinity_key == "stable-claim-key", + HttpBridgeLease.api_key_scope == "", + ) + ) + ).scalar_one() + + assert lease.session_id in {"hbs_claim_one", "hbs_claim_two"} + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_worker_is_wrong_instance( async_client, @@ -1986,6 +2061,7 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo exc = exc_info.value assert exc.status_code == 409 assert exc.payload["error"].get("code") == "bridge_wrong_instance" + assert exc.payload["error"].get("message") == "HTTP responses session bridge turn-state is owned by another live instance" @pytest.mark.asyncio From cf6b72688e4f1ab3427f7f6587bd1ca16c45adb5 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 17:28:18 +0000 Subject: [PATCH 28/34] fix(proxy): preserve bridge reconnect lease handoff --- app/modules/proxy/service.py | 7 +- .../integration/test_http_responses_bridge.py | 166 ++++++++++++++++-- 2 files changed, 154 insertions(+), 19 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 31bd5cb2..fba58516 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1922,7 +1922,7 @@ async def _get_or_create_http_bridge_session( key, openai_cache_affinity_max_age_seconds=settings.openai_cache_affinity_max_age_seconds, ) - rekey_recovered_turn_state = key.affinity_kind == "turn_state_header" + rekey_recovered_turn_state = True else: lookup_key = _HTTPBridgeSessionKey( "turn_state_header", @@ -2881,10 +2881,9 @@ async def _reconnect_http_bridge_session( old_upstream = session.upstream old_reader = session.upstream_reader if restart_reader else None new_upstream: UpstreamResponsesWebSocket | None = None - preserve_lease_during_reconnect = False + preserve_lease_during_reconnect = True + session.preserve_lease_during_reconnect = True if old_reader is not None: - session.preserve_lease_during_reconnect = True - preserve_lease_during_reconnect = True old_reader.cancel() if old_reader is not asyncio.current_task(): try: diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 098b9cf9..728e7f47 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2357,9 +2357,15 @@ async def fake_connect_responses_websocket( max_sessions=128, ) - assert recovered.key.affinity_kind == "prompt_cache" - assert recovered.key.affinity_key == "stale-owner-thread" + assert recovered.key.affinity_kind == "turn_state_header" + assert recovered.key.affinity_key != signed_turn_state assert recovered.bridge_session_id != stale_session_id + assert recovered.affinity == proxy_module._AffinityPolicy( + key=recovered.key.affinity_key, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ) + assert recovered.codex_session is True + assert signed_turn_state in recovered.downstream_turn_state_aliases assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-new" async with SessionLocal() as db_session: @@ -2374,12 +2380,13 @@ async def fake_connect_responses_websocket( assert stale_lease is None assert proxy_module._http_bridge_owner_instance_group(new_lease.owner_instance_id) == "instance-new" - assert new_lease.affinity_kind == "prompt_cache" - assert new_lease.affinity_key == "stale-owner-thread" + assert new_lease.affinity_kind == "turn_state_header" + assert new_lease.affinity_key == recovered.key.affinity_key + await service._close_http_bridge_session(recovered) @pytest.mark.asyncio -async def test_v1_responses_http_bridge_signed_turn_state_recovery_preserves_stable_affinity( +async def test_v1_responses_http_bridge_signed_turn_state_recovery_rekeys_to_codex_affinity( async_client, app_instance, monkeypatch, @@ -2397,7 +2404,11 @@ async def test_v1_responses_http_bridge_signed_turn_state_recovery_preserves_sta ) account = await _get_account(account_id) service = get_proxy_service_for_app(app_instance) - upstreams = [_FakeBridgeUpstreamWebSocket(), _FakeBridgeUpstreamWebSocket()] + upstreams = [ + _FakeBridgeUpstreamWebSocket(), + _FakeBridgeUpstreamWebSocket(), + _FakeBridgeUpstreamWebSocket(), + ] connect_count = 0 sticky_selections: list[tuple[str | None, object | None, bool, int | None]] = [] session_id = "hbs_signed_missing_alias_stable_affinity" @@ -2491,13 +2502,14 @@ async def fake_connect_responses_websocket( max_sessions=128, ) - assert recovered.key.affinity_kind == "prompt_cache" - assert recovered.key.affinity_key == "stable-affinity-thread" + assert recovered.key.affinity_kind == "turn_state_header" + assert recovered.key.affinity_key != signed_turn_state assert recovered.affinity == proxy_module._AffinityPolicy( - key="stable-affinity-thread", - kind=proxy_module.StickySessionKind.PROMPT_CACHE, - max_age_seconds=300, + key=recovered.key.affinity_key, + kind=proxy_module.StickySessionKind.CODEX_SESSION, ) + assert recovered.codex_session is True + assert recovered.idle_ttl_seconds == pytest.approx(900.0) assert signed_turn_state in recovered.downstream_turn_state_aliases replayed = await service._get_or_create_http_bridge_session( @@ -2539,12 +2551,16 @@ async def fake_connect_responses_websocket( await service._reconnect_http_bridge_session(recovered, request_state=request_state) assert replayed is recovered - assert reused is recovered - assert connect_count == 2 + assert reused is not recovered + assert reused.key == proxy_module._HTTPBridgeSessionKey("prompt_cache", "stable-affinity-thread", None) + assert connect_count == 3 assert sticky_selections == [ - ("stable-affinity-thread", proxy_module.StickySessionKind.PROMPT_CACHE, False, 300), - ("stable-affinity-thread", proxy_module.StickySessionKind.PROMPT_CACHE, False, 300), + (recovered.key.affinity_key, proxy_module.StickySessionKind.CODEX_SESSION, False, None), + ("stable-affinity-thread", proxy_module.StickySessionKind.PROMPT_CACHE, False, None), + (recovered.key.affinity_key, proxy_module.StickySessionKind.CODEX_SESSION, False, None), ] + await service._close_http_bridge_session(recovered) + await service._close_http_bridge_session(reused) @pytest.mark.asyncio @@ -3813,6 +3829,126 @@ async def fake_touch_http_bridge_lease(self, session): await service._reconnect_http_bridge_session(session, request_state=request_state, restart_reader=True) assert call_order == ["touch"] + await service._close_http_bridge_session(session) + + +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_reconnect_without_reader_restart_preserves_lease_until_touch( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_reconnect_lease_handoff_no_restart", + "http-bridge-reconnect-lease-handoff-no-restart@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + first_upstream = _FakeBridgeUpstreamWebSocket() + second_upstream = _FakeBridgeUpstreamWebSocket() + upstreams = [first_upstream, second_upstream] + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return upstreams.pop(0) + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + + payload = proxy_module.ResponsesRequest.model_validate({"model": "gpt-5.1", "instructions": "hi", "input": []}) + affinity = proxy_module._AffinityPolicy( + key="reconnect-lease-handoff-no-restart", + kind=proxy_module.StickySessionKind.PROMPT_CACHE, + ) + session = await service._get_or_create_http_bridge_session( + proxy_module._make_http_bridge_session_key( + payload, + headers={}, + affinity=affinity, + api_key=None, + request_id="req_reconnect_lease_handoff_no_restart", + ), + headers={}, + affinity=affinity, + api_key=None, + request_model=payload.model, + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + call_order: list[str] = [] + + async def fake_delete_http_bridge_lease(self, session_id): + del self, session_id + call_order.append("delete") + + async def fake_touch_http_bridge_lease(self, session): + del self, session + call_order.append("touch") + + monkeypatch.setattr(proxy_module.ProxyService, "_delete_http_bridge_lease", fake_delete_http_bridge_lease) + monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) + + request_state = proxy_module._WebSocketRequestState( + request_id="req_reconnect_lease_no_restart", + model=payload.model, + service_tier=None, + reasoning_effort=None, + api_key_reservation=None, + started_at=time.monotonic(), + ) + await service._reconnect_http_bridge_session(session, request_state=request_state) + await asyncio.sleep(0) + + assert call_order == ["touch"] + await service._close_http_bridge_session(session) @pytest.mark.asyncio From 9624c4ddb13f22b1acb8a52eecd4d770709f2ef4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 17:40:41 +0000 Subject: [PATCH 29/34] fix(proxy): restore bridge lease persist hook --- app/modules/proxy/service.py | 10 +- .../integration/test_http_responses_bridge.py | 94 +++++++++++++++++++ 2 files changed, 102 insertions(+), 2 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index fba58516..98832fd1 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1601,7 +1601,11 @@ async def _delete_http_bridge_lease(self, session_id: str | None) -> None: await repos.http_bridge_leases.delete(session_id) async def _persist_http_bridge_lease(self, session: "_HTTPBridgeSession") -> None: - await self._claim_http_bridge_lease(session, replace_session_id=None) + replace_session_id = session.pending_replaced_bridge_session_id + try: + await self._claim_http_bridge_lease(session, replace_session_id=replace_session_id) + finally: + session.pending_replaced_bridge_session_id = None async def _claim_http_bridge_lease( self, @@ -2474,9 +2478,10 @@ async def _create_http_bridge_session( upstream_turn_state=echoed_turn_state, reconnect_turn_state=reconnect_turn_state, downstream_turn_state=None, + pending_replaced_bridge_session_id=replaced_bridge_session_id, ) try: - await self._claim_http_bridge_lease(session, replace_session_id=replaced_bridge_session_id) + await self._persist_http_bridge_lease(session) except BaseException: session.closed = True try: @@ -4991,6 +4996,7 @@ class _HTTPBridgeSession: reconnect_turn_state: str | None = None downstream_turn_state: str | None = None downstream_turn_state_aliases: set[str] = field(default_factory=set) + pending_replaced_bridge_session_id: str | None = None upstream_reader: asyncio.Task[None] | None = None lease_keepalive_task: asyncio.Task[None] | None = None preserve_lease_during_reconnect: bool = False diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 728e7f47..0e7146ff 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -3386,6 +3386,100 @@ async def fake_persist_http_bridge_lease(self, session): assert fake_upstream.closed is True +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_creation_with_replacement_uses_persist_hook( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits(monkeypatch, enabled=True) + account_id = await _import_account( + async_client, + "acc_http_bridge_lease_persist_failure_replace", + "http-bridge-lease-persist-failure-replace@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + persisted_replace_session_ids: list[str | None] = [] + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del headers, access_token, account_id_header, base_url, session + return fake_upstream + + async def fake_persist_http_bridge_lease(self, session): + del self + persisted_replace_session_ids.append(session.pending_replaced_bridge_session_id) + raise RuntimeError("lease persistence failed") + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module.ProxyService, "_persist_http_bridge_lease", fake_persist_http_bridge_lease) + + with pytest.raises(RuntimeError, match="lease persistence failed"): + await service._create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", "replacement-hook", None), + headers={}, + affinity=proxy_module._AffinityPolicy( + key="replacement-hook", + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + bridge_session_id="hbs_lease_persist_failure_replace", + owner_instance_id="instance-a", + replaced_bridge_session_id="hbs_stale_replaced", + ) + + assert persisted_replace_session_ids == ["hbs_stale_replaced"] + assert fake_upstream.closed is True + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_allows_unstable_request_key_even_on_non_owner_instance( async_client, From fc2b81991e61d537d434536c9a31ce3ac2b8c5c4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 17:51:47 +0000 Subject: [PATCH 30/34] fix(proxy): expire recovered stale turn states --- app/modules/proxy/service.py | 9 - .../integration/test_http_responses_bridge.py | 162 ++++++++++++++++-- 2 files changed, 149 insertions(+), 22 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 98832fd1..dda63d35 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -2174,15 +2174,6 @@ async def _get_or_create_http_bridge_session( if current_future is inflight_future: self._http_bridge_inflight_sessions.pop(lookup_key, None) self._http_bridge_sessions[session_key] = session - if ( - recovered_turn_state_replay - and incoming_turn_state is not None - and incoming_turn_state != session.key.affinity_key - ): - session.downstream_turn_state_aliases.add(incoming_turn_state) - self._http_bridge_turn_state_index[ - _http_bridge_turn_state_alias_key(incoming_turn_state, session.key.api_key_id) - ] = session.key session_registered = True if inflight_future is not None and not inflight_future.done(): inflight_future.set_result(session) diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 0e7146ff..682774c3 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -1376,10 +1376,10 @@ async def fake_connect_responses_websocket( assert connect_headers_seen[-1].get("x-codex-turn-state") is None replayed = await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), - headers={"x-codex-turn-state": signed_turn_state}, + proxy_module._HTTPBridgeSessionKey("turn_state_header", session.key.affinity_key, None), + headers={"x-codex-turn-state": session.key.affinity_key}, affinity=proxy_module._AffinityPolicy( - key=signed_turn_state, + key=session.key.affinity_key, kind=proxy_module.StickySessionKind.CODEX_SESSION, ), api_key=None, @@ -1399,7 +1399,6 @@ async def fake_connect_responses_websocket( ).scalar_one() assert replayed is session - assert signed_turn_state in session.downstream_turn_state_aliases assert connect_headers_seen and len(connect_headers_seen) == 1 assert stale_lease is None assert new_lease.affinity_kind == "turn_state_header" @@ -1407,7 +1406,7 @@ async def fake_connect_responses_websocket( @pytest.mark.asyncio -async def test_v1_responses_http_bridge_signed_turn_state_replay_prefers_local_alias_over_stale_lease_owner( +async def test_v1_responses_http_bridge_signed_turn_state_recovery_does_not_alias_stale_token_when_delete_fails( async_client, app_instance, monkeypatch, @@ -1546,10 +1545,11 @@ async def flaky_delete_http_bridge_lease(self, stale_session_id): await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) ).scalar_one_or_none() - assert replayed is session + assert replayed is not session assert stale_lease is not None - assert signed_turn_state in session.downstream_turn_state_aliases - assert len(connect_headers_seen) == 1 + assert len(connect_headers_seen) == 2 + await service._close_http_bridge_session(session) + await service._close_http_bridge_session(replayed) @pytest.mark.asyncio @@ -2365,7 +2365,6 @@ async def fake_connect_responses_websocket( kind=proxy_module.StickySessionKind.CODEX_SESSION, ) assert recovered.codex_session is True - assert signed_turn_state in recovered.downstream_turn_state_aliases assert proxy_module._http_bridge_owner_instance_group(recovered.owner_instance_id) == "instance-new" async with SessionLocal() as db_session: @@ -2510,13 +2509,12 @@ async def fake_connect_responses_websocket( ) assert recovered.codex_session is True assert recovered.idle_ttl_seconds == pytest.approx(900.0) - assert signed_turn_state in recovered.downstream_turn_state_aliases replayed = await service._get_or_create_http_bridge_session( - proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), - headers={"x-codex-turn-state": signed_turn_state}, + proxy_module._HTTPBridgeSessionKey("turn_state_header", recovered.key.affinity_key, None), + headers={"x-codex-turn-state": recovered.key.affinity_key}, affinity=proxy_module._AffinityPolicy( - key=signed_turn_state, + key=recovered.key.affinity_key, kind=proxy_module.StickySessionKind.CODEX_SESSION, ), api_key=None, @@ -2629,6 +2627,144 @@ async def test_v1_responses_http_bridge_signed_turn_state_missing_local_alias_wi assert exc.payload["error"]["code"] == "bridge_session_expired" +@pytest.mark.asyncio +async def test_v1_responses_http_bridge_recovered_stale_turn_state_with_previous_response_expires( + async_client, + app_instance, + monkeypatch, +): + _install_bridge_settings_with_limits( + monkeypatch, + enabled=True, + instance_id="instance-a", + instance_ring=["instance-a", "instance-b"], + ) + account_id = await _import_account( + async_client, + "acc_http_bridge_recovered_signed_alias_previous", + "http-bridge-recovered-signed-alias-previous@example.com", + ) + account = await _get_account(account_id) + service = get_proxy_service_for_app(app_instance) + fake_upstream = _FakeBridgeUpstreamWebSocket() + connect_headers_seen: list[dict[str, str]] = [] + session_id = "hbs_recovered_signed_alias_previous" + signed_turn_state = service._encode_http_bridge_turn_state( + session_id=session_id, + owner_instance_id="instance-a", + api_key_id=None, + ) + + async def fake_select_account_with_budget( + self, + deadline, + *, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids=None, + additional_limit_name=None, + ): + del ( + self, + deadline, + request_id, + kind, + sticky_key, + sticky_kind, + reallocate_sticky, + sticky_max_age_seconds, + prefer_earlier_reset_accounts, + routing_strategy, + model, + exclude_account_ids, + additional_limit_name, + ) + return AccountSelection(account=account, error_message=None, error_code=None) + + async def fake_ensure_fresh_with_budget(self, target, *, force=False, timeout_seconds): + del self, force, timeout_seconds + return target + + async def fake_connect_responses_websocket( + headers, + access_token, + account_id_header, + *, + base_url=None, + session=None, + ): + del access_token, account_id_header, base_url, session + connect_headers_seen.append(dict(headers)) + return fake_upstream + + monkeypatch.setattr(proxy_module.ProxyService, "_select_account_with_budget", fake_select_account_with_budget) + monkeypatch.setattr(proxy_module.ProxyService, "_ensure_fresh_with_budget", fake_ensure_fresh_with_budget) + monkeypatch.setattr(proxy_module, "connect_responses_websocket", fake_connect_responses_websocket) + monkeypatch.setattr(proxy_module, "_http_bridge_current_owner_id", lambda settings: "instance-a@222") + monkeypatch.setattr(proxy_module, "_http_bridge_process_exists", lambda pid: False) + + async with SessionLocal() as db_session: + await db_session.execute(delete(HttpBridgeLease).where(HttpBridgeLease.session_id == session_id)) + await db_session.commit() + + async with service._repo_factory() as repos: + await repos.http_bridge_leases.upsert( + session_id=session_id, + affinity_kind="turn_state_header", + affinity_key=signed_turn_state, + api_key_scope="", + owner_instance_id="instance-a", + lease_expires_at=proxy_module._http_bridge_lease_expires_at(120.0), + account_id=account.id, + request_model="gpt-5.1", + codex_session=True, + idle_ttl_seconds=120.0, + upstream_turn_state=None, + downstream_turn_state=signed_turn_state, + ) + + recovered = await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + ) + + with pytest.raises(proxy_module.ProxyResponseError) as exc_info: + await service._get_or_create_http_bridge_session( + proxy_module._HTTPBridgeSessionKey("turn_state_header", signed_turn_state, None), + headers={"x-codex-turn-state": signed_turn_state}, + affinity=proxy_module._AffinityPolicy( + key=signed_turn_state, + kind=proxy_module.StickySessionKind.CODEX_SESSION, + ), + api_key=None, + request_model="gpt-5.1", + idle_ttl_seconds=120.0, + max_sessions=128, + previous_response_id="resp_previous", + ) + + exc = exc_info.value + assert exc.status_code == 409 + assert exc.payload["error"]["code"] == "bridge_session_expired" + assert len(connect_headers_seen) == 1 + await service._close_http_bridge_session(recovered) + + @pytest.mark.asyncio async def test_v1_responses_http_bridge_turn_state_alias_respects_api_key_isolation( async_client, From 8fb2e2c20cf3e726a49e21d3a95a04a4bce06eb4 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 18:01:55 +0000 Subject: [PATCH 31/34] style(proxy): format bridge service and tests --- app/modules/proxy/service.py | 18 +++++--- .../integration/test_http_responses_bridge.py | 45 +++++++++++++------ 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index dda63d35..881da0cb 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -112,6 +112,7 @@ class _HTTPBridgeLeaseClaimLost(RuntimeError): pass + _TEXT_DELTA_EVENT_TYPES = frozenset({"response.output_text.delta", "response.refusal.delta"}) _TEXT_DONE_CONTENT_PART_TYPES = frozenset({"output_text", "refusal"}) _REQUEST_TRANSPORT_HTTP = "http" @@ -2133,10 +2134,10 @@ async def _get_or_create_http_bridge_session( "turn_state_header", self._encode_http_bridge_turn_state( session_id=created_session_id, - owner_instance_id=current_owner, - api_key_id=api_key_id, - ), - api_key_id, + owner_instance_id=current_owner, + api_key_id=api_key_id, + ), + api_key_id, ) create_affinity = _AffinityPolicy( key=session_key.affinity_key, @@ -2285,6 +2286,7 @@ async def _close_session() -> None: except Exception: logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) await self._delete_http_bridge_lease(getattr(session, "bridge_session_id", None)) + if lease_lock is not None: async with lease_lock: await _close_session() @@ -2478,7 +2480,9 @@ async def _create_http_bridge_session( try: await upstream.close() except Exception: - logger.debug("Failed to close HTTP bridge upstream websocket after lease persistence error", exc_info=True) + logger.debug( + "Failed to close HTTP bridge upstream websocket after lease persistence error", exc_info=True + ) raise session.upstream_reader = asyncio.create_task(self._relay_http_bridge_upstream_messages(session)) return session @@ -2954,7 +2958,9 @@ async def _reconnect_http_bridge_session( try: await new_upstream.close() except Exception: - logger.debug("Failed to close replacement HTTP bridge websocket after reconnect error", exc_info=True) + logger.debug( + "Failed to close replacement HTTP bridge websocket after reconnect error", exc_info=True + ) if preserve_lease_during_reconnect: session.preserve_lease_during_reconnect = False await self._delete_http_bridge_lease(session.bridge_session_id) diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 682774c3..3a7673bc 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -2061,7 +2061,10 @@ async def test_v1_responses_http_bridge_signed_turn_state_live_lease_on_other_wo exc = exc_info.value assert exc.status_code == 409 assert exc.payload["error"].get("code") == "bridge_wrong_instance" - assert exc.payload["error"].get("message") == "HTTP responses session bridge turn-state is owned by another live instance" + assert ( + exc.payload["error"].get("message") + == "HTTP responses session bridge turn-state is owned by another live instance" + ) @pytest.mark.asyncio @@ -2238,7 +2241,9 @@ async def fake_connect_responses_websocket( async with SessionLocal() as db_session: lease = ( - await db_session.execute(select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id)) + await db_session.execute( + select(HttpBridgeLease).where(HttpBridgeLease.session_id == session.bridge_session_id) + ) ).scalar_one() assert proxy_module._http_bridge_owner_instance_group(lease.owner_instance_id) == "instance-a" assert lease.affinity_kind == "turn_state_header" @@ -3052,12 +3057,18 @@ async def fake_touch_http_bridge_lease(self, session_arg): assert signed_turn_state_repeat == signed_turn_state assert session.downstream_turn_state == signed_turn_state assert session.downstream_turn_state_aliases == {"http_turn_legacy_client", signed_turn_state} - assert service._http_bridge_turn_state_index[ - proxy_module._http_bridge_turn_state_alias_key("http_turn_legacy_client", session.key.api_key_id) - ] == session.key - assert service._http_bridge_turn_state_index[ - proxy_module._http_bridge_turn_state_alias_key(signed_turn_state, session.key.api_key_id) - ] == session.key + assert ( + service._http_bridge_turn_state_index[ + proxy_module._http_bridge_turn_state_alias_key("http_turn_legacy_client", session.key.api_key_id) + ] + == session.key + ) + assert ( + service._http_bridge_turn_state_index[ + proxy_module._http_bridge_turn_state_alias_key(signed_turn_state, session.key.api_key_id) + ] + == session.key + ) await service._close_http_bridge_session(session) @@ -3177,7 +3188,7 @@ async def test_v1_responses_http_bridge_refreshes_lease_after_request_detach(app session.response_create_gate = asyncio.Semaphore(1) event_queue: asyncio.Queue[str | None] = asyncio.Queue() - await event_queue.put("data: {\"type\":\"response.completed\"}\n\n") + await event_queue.put('data: {"type":"response.completed"}\n\n') await event_queue.put(None) request_state = proxy_module._WebSocketRequestState( request_id="req_bridge_lease_refresh", @@ -3229,14 +3240,18 @@ async def fake_touch_http_bridge_lease(self, session): touch_points.append(session.last_used_at) monkeypatch.setattr(proxy_module.ProxyService, "_prepare_http_bridge_request", fake_prepare_http_bridge_request) - monkeypatch.setattr(proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session) + monkeypatch.setattr( + proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session + ) monkeypatch.setattr(proxy_module.ProxyService, "_submit_http_bridge_request", fake_submit_http_bridge_request) monkeypatch.setattr( proxy_module.ProxyService, "_resolve_http_bridge_downstream_turn_state", fake_resolve_http_bridge_downstream_turn_state, ) - monkeypatch.setattr(proxy_module.ProxyService, "_register_http_bridge_turn_state", fake_register_http_bridge_turn_state) + monkeypatch.setattr( + proxy_module.ProxyService, "_register_http_bridge_turn_state", fake_register_http_bridge_turn_state + ) monkeypatch.setattr(proxy_module.ProxyService, "_touch_http_bridge_lease", fake_touch_http_bridge_lease) events = [ @@ -3314,7 +3329,9 @@ async def failing_touch_http_bridge_lease(self, session_arg): raise RuntimeError("lease touch failed") monkeypatch.setattr(proxy_module.ProxyService, "_prepare_http_bridge_request", fake_prepare_http_bridge_request) - monkeypatch.setattr(proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session) + monkeypatch.setattr( + proxy_module.ProxyService, "_get_or_create_http_bridge_session", fake_get_or_create_http_bridge_session + ) monkeypatch.setattr(proxy_module.ProxyService, "_submit_http_bridge_request", fake_submit_http_bridge_request) monkeypatch.setattr( proxy_module.ProxyService, @@ -3393,7 +3410,9 @@ async def test_v1_responses_http_bridge_keepalive_refresh_failure_closes_session service = get_proxy_service_for_app(app_instance) session = cast( proxy_module._HTTPBridgeSession, - _make_dummy_bridge_session(proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-keepalive-failure", None)), + _make_dummy_bridge_session( + proxy_module._HTTPBridgeSessionKey("request", "bridge-lease-keepalive-failure", None) + ), ) session.bridge_session_id = "hbs_bridge_lease_keepalive_failure" session.idle_ttl_seconds = 0.5 From b901fa5264b9ec5c0b34a81427ee4e4470286515 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 18:06:34 +0000 Subject: [PATCH 32/34] fix(ci): satisfy bridge type checks --- app/modules/proxy/service.py | 15 +++++++++++++-- tests/integration/test_http_responses_bridge.py | 6 +++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 881da0cb..dbcf6695 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -12,7 +12,7 @@ from dataclasses import dataclass, field from datetime import datetime, timedelta from hashlib import sha256 -from typing import AsyncIterator, Mapping, NoReturn, cast +from typing import AsyncIterator, Mapping, NoReturn, TypedDict, cast from uuid import uuid4 import aiohttp @@ -1859,6 +1859,7 @@ async def _get_or_create_http_bridge_session( break if alias_session is not None: matched_turn_state_alias = True + assert alias_key is not None key = alias_key if incoming_turn_state is not None and ( turn_state_token is None @@ -2149,7 +2150,7 @@ async def _get_or_create_http_bridge_session( else headers ) create_session = self._create_http_bridge_session - create_kwargs: dict[str, object] = { + create_kwargs: _HTTPBridgeCreateSessionKwargs = { "headers": create_headers, "affinity": create_affinity, "request_model": request_model, @@ -5020,6 +5021,16 @@ class _WebSocketReceiveTimeout: fail_all_pending: bool = False +class _HTTPBridgeCreateSessionKwargs(TypedDict, total=False): + headers: dict[str, str] + affinity: "_AffinityPolicy" + request_model: str | None + idle_ttl_seconds: float + bridge_session_id: str + owner_instance_id: str + replaced_bridge_session_id: str | None + + def _event_type_from_payload(event: OpenAIEvent | None, payload: dict[str, JsonValue] | None) -> str | None: if event is not None: return event.type diff --git a/tests/integration/test_http_responses_bridge.py b/tests/integration/test_http_responses_bridge.py index 3a7673bc..4e080073 100644 --- a/tests/integration/test_http_responses_bridge.py +++ b/tests/integration/test_http_responses_bridge.py @@ -3445,9 +3445,13 @@ async def fake_release_websocket_reservation(self, reservation): monkeypatch.setattr(proxy_module.ProxyService, "_write_request_log", fake_write_request_log) monkeypatch.setattr(proxy_module.ProxyService, "_release_websocket_reservation", fake_release_websocket_reservation) + assert request_state.event_queue is not None await service._ensure_http_bridge_lease_keepalive(session) failed_event = await asyncio.wait_for(request_state.event_queue.get(), timeout=1.0) - assert proxy_module.parse_sse_data_json(failed_event)["type"] == "response.failed" + assert failed_event is not None + failed_payload = proxy_module.parse_sse_data_json(failed_event) + assert failed_payload is not None + assert failed_payload["type"] == "response.failed" assert await asyncio.wait_for(request_state.event_queue.get(), timeout=1.0) is None await asyncio.sleep(0) assert session.closed is True From 7412ba65d872cfc6d62c20bc03093844e9581b6e Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 19:28:07 +0000 Subject: [PATCH 33/34] fix(proxy): serialize http bridge lease cleanup --- app/modules/proxy/service.py | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index dbcf6695..27a1823c 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -1796,6 +1796,29 @@ async def _stop_http_bridge_lease_keepalive(self, session: "_HTTPBridgeSession") except asyncio.CancelledError: pass + async def _delete_http_bridge_lease_after_reader_exit(self, session: "_HTTPBridgeSession") -> None: + if session.preserve_lease_during_reconnect or session.lease_cleanup_owned_by_close: + return + lease_lock = getattr(session, "lease_lock", None) + if lease_lock is None: + await self._delete_http_bridge_lease(session.bridge_session_id) + return + while True: + if session.preserve_lease_during_reconnect or session.lease_cleanup_owned_by_close: + return + try: + lease_lock.acquire_nowait() + except anyio.WouldBlock: + await asyncio.sleep(0) + continue + try: + if session.preserve_lease_during_reconnect or session.lease_cleanup_owned_by_close: + return + await self._delete_http_bridge_lease(session.bridge_session_id) + return + finally: + lease_lock.release() + async def _get_or_create_http_bridge_session( self, key: "_HTTPBridgeSessionKey", @@ -2264,6 +2287,7 @@ async def _close_http_bridge_session( async def _close_session() -> None: session.closed = True + session.lease_cleanup_owned_by_close = True await self._stop_http_bridge_lease_keepalive(session) if fail_pending_requests: await self._fail_pending_http_bridge_requests( @@ -2788,8 +2812,7 @@ async def _relay_http_bridge_upstream_messages( finally: session.closed = True await self._stop_http_bridge_lease_keepalive(session) - if not session.preserve_lease_during_reconnect: - await self._delete_http_bridge_lease(session.bridge_session_id) + await self._delete_http_bridge_lease_after_reader_exit(session) async def _retry_http_bridge_request_on_fresh_upstream( self, @@ -4998,6 +5021,7 @@ class _HTTPBridgeSession: upstream_reader: asyncio.Task[None] | None = None lease_keepalive_task: asyncio.Task[None] | None = None preserve_lease_during_reconnect: bool = False + lease_cleanup_owned_by_close: bool = False closed: bool = False From 2def1f800ddc41975a9321b44d612ecfbe170e26 Mon Sep 17 00:00:00 2001 From: aaiyer Date: Wed, 25 Mar 2026 19:40:51 +0000 Subject: [PATCH 34/34] fix(proxy): avoid http bridge cleanup deadlock --- app/modules/proxy/service.py | 57 ++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/app/modules/proxy/service.py b/app/modules/proxy/service.py index 27a1823c..52a17891 100644 --- a/app/modules/proxy/service.py +++ b/app/modules/proxy/service.py @@ -2285,38 +2285,43 @@ async def _close_http_bridge_session( ) -> None: lease_lock = getattr(session, "lease_lock", None) - async def _close_session() -> None: + async def _claim_close_cleanup() -> None: session.closed = True session.lease_cleanup_owned_by_close = True - await self._stop_http_bridge_lease_keepalive(session) - if fail_pending_requests: - await self._fail_pending_http_bridge_requests( - session, - error_code=error_code, - error_message=error_message, - ) - if turn_state_lock_held: - self._unregister_http_bridge_turn_states_locked(session) - else: - await self._unregister_http_bridge_turn_states(session) - if session.upstream_reader is not None: - session.upstream_reader.cancel() - if session.upstream_reader is not asyncio.current_task(): - try: - await session.upstream_reader - except asyncio.CancelledError: - pass - try: - await session.upstream.close() - except Exception: - logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) - await self._delete_http_bridge_lease(getattr(session, "bridge_session_id", None)) if lease_lock is not None: async with lease_lock: - await _close_session() + await _claim_close_cleanup() else: - await _close_session() + await _claim_close_cleanup() + + await self._stop_http_bridge_lease_keepalive(session) + if fail_pending_requests: + await self._fail_pending_http_bridge_requests( + session, + error_code=error_code, + error_message=error_message, + ) + if turn_state_lock_held: + self._unregister_http_bridge_turn_states_locked(session) + else: + await self._unregister_http_bridge_turn_states(session) + if session.upstream_reader is not None: + session.upstream_reader.cancel() + if session.upstream_reader is not asyncio.current_task(): + try: + await session.upstream_reader + except asyncio.CancelledError: + pass + try: + await session.upstream.close() + except Exception: + logger.debug("Failed to close HTTP bridge upstream websocket", exc_info=True) + if lease_lock is not None: + async with lease_lock: + await self._delete_http_bridge_lease(getattr(session, "bridge_session_id", None)) + else: + await self._delete_http_bridge_lease(getattr(session, "bridge_session_id", None)) _log_http_bridge_event( "close", session.key,