From 1ad7f43c5ee2ab375a914cb139174db5d0eb820f Mon Sep 17 00:00:00 2001 From: XavLim Date: Sun, 29 Mar 2026 08:41:43 +0000 Subject: [PATCH] fix: detect stale usage data after account rate-limit/quota reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an account's usage window resets (e.g. daily/weekly limit rollover), the load balancer could continue treating the account as exhausted because: 1. `_latest_usage_is_fresh()` only checked record age, not whether the `reset_at` timestamp had passed — stale 100% data persisted until the next refresh interval even though the upstream limit had already reset. 2. `_state_from_account()` passed stale 100% usage into the balancer and sort keys even when `reset_at` was in the past, causing the account to be blocked or deprioritised. 3. `select_account()` cleared the blocked status when `reset_at` expired but did not reset `used_percent` (RATE_LIMITED) or `secondary_used_percent` (QUOTA_EXCEEDED), so the account remained last in usage-weighted selection. This commit fixes all three layers: - Freshness check now returns false when `reset_at` has passed, forcing an immediate re-fetch from the upstream API. - A new `_account_needs_post_reset_refresh()` helper forces refresh even when primary data looks fresh but the account's persisted reset time has expired (covers secondary-only resets). - `_state_from_account()` zeros out stale exhausted percentages when the window has rolled over. - `select_account()` resets both `used_percent` and `secondary_used_percent` when clearing expired statuses. Co-Authored-By: Claude Opus 4.6 --- app/core/balancer/logic.py | 2 + app/modules/proxy/load_balancer.py | 14 ++++ app/modules/usage/updater.py | 34 ++++++++- tests/unit/test_load_balancer.py | 61 ++++++++++++++++ tests/unit/test_usage_updater.py | 113 ++++++++++++++++++++++++++++- 5 files changed, 221 insertions(+), 3 deletions(-) diff --git a/app/core/balancer/logic.py b/app/core/balancer/logic.py index d7350bd2..a09b4caa 100644 --- a/app/core/balancer/logic.py +++ b/app/core/balancer/logic.py @@ -63,6 +63,7 @@ def select_account( if state.status == AccountStatus.RATE_LIMITED: if state.reset_at and current >= state.reset_at: state.status = AccountStatus.ACTIVE + state.used_percent = 0.0 state.error_count = 0 state.reset_at = None else: @@ -71,6 +72,7 @@ def select_account( if state.reset_at and current >= state.reset_at: state.status = AccountStatus.ACTIVE state.used_percent = 0.0 + state.secondary_used_percent = 0.0 state.reset_at = None else: continue diff --git a/app/modules/proxy/load_balancer.py b/app/modules/proxy/load_balancer.py index bf0e5e5e..741a1664 100644 --- a/app/modules/proxy/load_balancer.py +++ b/app/modules/proxy/load_balancer.py @@ -809,6 +809,20 @@ def _state_from_account( secondary_used = effective_secondary_entry.used_percent if effective_secondary_entry else None secondary_reset = effective_secondary_entry.reset_at if effective_secondary_entry else None + # If the usage window has reset (reset_at is in the past) but the last + # recorded sample still shows 100 % usage, the data is stale. Zero it + # out so the account is not incorrectly blocked or deprioritised while + # waiting for the next usage refresh to fetch fresh numbers. + now_epoch = int(time.time()) + if primary_used is not None and primary_used >= 100.0: + if primary_reset is not None and primary_reset <= now_epoch: + primary_used = 0.0 + primary_reset = None + if secondary_used is not None and secondary_used >= 100.0: + if secondary_reset is not None and secondary_reset <= now_epoch: + secondary_used = 0.0 + secondary_reset = None + # Use account.reset_at from DB as the authoritative source for runtime reset # and to survive process restarts. db_reset_at = float(account.reset_at) if account.reset_at else None diff --git a/app/modules/usage/updater.py b/app/modules/usage/updater.py index 204d9313..edcb485f 100644 --- a/app/modules/usage/updater.py +++ b/app/modules/usage/updater.py @@ -188,7 +188,11 @@ async def refresh_accounts( continue latest = latest_usage.get(account.id) if _latest_usage_is_fresh(latest, now=now, interval_seconds=interval): - continue + # Still force a refresh when the account is blocked with an + # expired reset — the fresh primary entry does not capture + # that the upstream window has turned over. + if not _account_needs_post_reset_refresh(account, now): + continue # Additional-only accounts have no main UsageHistory entry. # Check DB-backed freshness (works across workers/restarts) # with process-local cache as a fast path. @@ -636,7 +640,18 @@ def _latest_usage_is_fresh( now: datetime, interval_seconds: int, ) -> bool: - return latest is not None and (now - latest.recorded_at).total_seconds() < interval_seconds + if latest is None: + return False + if (now - latest.recorded_at).total_seconds() >= interval_seconds: + return False + # If the usage window has reset since this entry was recorded, the + # recorded percentage is stale — force a re-fetch so the scheduler + # picks up the new (lower) usage from the upstream API promptly. + if latest.reset_at is not None: + now_epoch = int(now.replace(tzinfo=timezone.utc).timestamp()) + if now_epoch >= latest.reset_at: + return False + return True def _parse_credits_balance(value: str | int | float | None) -> float | None: @@ -652,6 +667,21 @@ def _parse_credits_balance(value: str | int | float | None) -> float | None: return None +def _account_needs_post_reset_refresh(account: Account, now: datetime) -> bool: + """Return True when the account is blocked with an expired reset time. + + This catches the case where only the *secondary* window has reset (the + primary entry may still look fresh) but the account is still persisted as + RATE_LIMITED or QUOTA_EXCEEDED from the previous cycle. + """ + if account.status not in (AccountStatus.RATE_LIMITED, AccountStatus.QUOTA_EXCEEDED): + return False + if account.reset_at is None: + return False + now_epoch = int(now.replace(tzinfo=timezone.utc).timestamp()) + return now_epoch >= account.reset_at + + def _window_minutes(limit_seconds: int | None) -> int | None: if not limit_seconds or limit_seconds <= 0: return None diff --git a/tests/unit/test_load_balancer.py b/tests/unit/test_load_balancer.py index 7c5ac7c1..6371cea7 100644 --- a/tests/unit/test_load_balancer.py +++ b/tests/unit/test_load_balancer.py @@ -377,3 +377,64 @@ def test_error_backoff_expired_account_does_not_immediately_relock(): result2 = select_account([state], now=now + 2) assert result2.account is not None assert result2.account.account_id == "a" + + +def test_select_account_resets_used_percent_when_rate_limit_expires(): + """When a rate-limited account's reset_at expires, used_percent must be + cleared so the account is not deprioritised in the usage-weighted sort.""" + now = 1_700_000_000.0 + state = AccountState( + "a", + AccountStatus.RATE_LIMITED, + used_percent=100.0, + reset_at=now - 10, + ) + result = select_account([state], now=now) + assert result.account is not None + assert state.status == AccountStatus.ACTIVE + assert state.used_percent == 0.0 + assert state.reset_at is None + + +def test_select_account_resets_secondary_used_percent_when_quota_exceeded_expires(): + """When a quota-exceeded account's reset_at expires, secondary_used_percent + must be cleared so the account is not deprioritised in the sort key.""" + now = 1_700_000_000.0 + state = AccountState( + "a", + AccountStatus.QUOTA_EXCEEDED, + used_percent=100.0, + secondary_used_percent=100.0, + reset_at=now - 10, + ) + result = select_account([state], now=now) + assert result.account is not None + assert state.status == AccountStatus.ACTIVE + assert state.used_percent == 0.0 + assert state.secondary_used_percent == 0.0 + assert state.reset_at is None + + +def test_select_account_prefers_reset_account_over_high_usage(): + """An account whose quota just reset (via the selection-time clear) should + be preferred over an active account with moderate usage.""" + now = 1_700_000_000.0 + states = [ + AccountState( + "a", + AccountStatus.QUOTA_EXCEEDED, + used_percent=100.0, + secondary_used_percent=100.0, + reset_at=now - 10, + ), + AccountState( + "b", + AccountStatus.ACTIVE, + used_percent=60.0, + secondary_used_percent=60.0, + ), + ] + result = select_account(states, now=now) + assert result.account is not None + # Account "a" should be preferred because after reset its usage is 0% + assert result.account.account_id == "a" diff --git a/tests/unit/test_usage_updater.py b/tests/unit/test_usage_updater.py index 3e2ce0fc..c345dbb1 100644 --- a/tests/unit/test_usage_updater.py +++ b/tests/unit/test_usage_updater.py @@ -13,7 +13,12 @@ from app.core.usage.models import UsagePayload from app.db.models import Account, AccountStatus, UsageHistory from app.modules.usage.additional_quota_keys import canonicalize_additional_quota_key -from app.modules.usage.updater import UsageUpdater, _last_successful_refresh +from app.modules.usage.updater import ( + UsageUpdater, + _account_needs_post_reset_refresh, + _last_successful_refresh, + _latest_usage_is_fresh, +) pytestmark = pytest.mark.unit @@ -1521,3 +1526,109 @@ async def stub_fetch_usage(**_: Any) -> UsagePayload: assert not hasattr(entry, "credits_has") assert not hasattr(entry, "credits_unlimited") assert not hasattr(entry, "credits_balance") + + +# --------------------------------------------------------------------------- +# _latest_usage_is_fresh — reset boundary awareness +# --------------------------------------------------------------------------- + + +def test_latest_usage_is_fresh_returns_true_when_recent_no_reset(): + now = datetime(2024, 6, 1, 12, 0, 30) + entry = UsageHistory( + id=1, + account_id="a", + used_percent=50.0, + recorded_at=datetime(2024, 6, 1, 12, 0, 0), + window="primary", + reset_at=None, + window_minutes=300, + ) + assert _latest_usage_is_fresh(entry, now=now, interval_seconds=60) is True + + +def test_latest_usage_is_fresh_returns_false_when_old(): + now = datetime(2024, 6, 1, 12, 1, 30) + entry = UsageHistory( + id=1, + account_id="a", + used_percent=50.0, + recorded_at=datetime(2024, 6, 1, 12, 0, 0), + window="primary", + reset_at=None, + window_minutes=300, + ) + assert _latest_usage_is_fresh(entry, now=now, interval_seconds=60) is False + + +def test_latest_usage_is_fresh_returns_false_when_reset_at_has_passed(): + """Data recorded recently but the usage window has already reset.""" + # recorded_at is 10s ago → within interval. But reset_at was 5s ago. + reset_epoch = int(datetime(2024, 6, 1, 12, 0, 25).replace(tzinfo=timezone.utc).timestamp()) + now = datetime(2024, 6, 1, 12, 0, 30) + entry = UsageHistory( + id=1, + account_id="a", + used_percent=100.0, + recorded_at=datetime(2024, 6, 1, 12, 0, 20), + window="primary", + reset_at=reset_epoch, + window_minutes=300, + ) + assert _latest_usage_is_fresh(entry, now=now, interval_seconds=60) is False + + +def test_latest_usage_is_fresh_returns_true_when_reset_at_is_in_future(): + """Data recorded recently and the reset hasn't happened yet.""" + reset_epoch = int(datetime(2024, 6, 1, 12, 5, 0).replace(tzinfo=timezone.utc).timestamp()) + now = datetime(2024, 6, 1, 12, 0, 30) + entry = UsageHistory( + id=1, + account_id="a", + used_percent=100.0, + recorded_at=datetime(2024, 6, 1, 12, 0, 20), + window="primary", + reset_at=reset_epoch, + window_minutes=300, + ) + assert _latest_usage_is_fresh(entry, now=now, interval_seconds=60) is True + + +# --------------------------------------------------------------------------- +# _account_needs_post_reset_refresh +# --------------------------------------------------------------------------- + + +def test_account_needs_refresh_active_account_returns_false(): + acc = _make_account("a", "ws_a") + acc.status = AccountStatus.ACTIVE + acc.reset_at = int(datetime(2024, 6, 1, 11, 0, 0).replace(tzinfo=timezone.utc).timestamp()) + assert _account_needs_post_reset_refresh(acc, datetime(2024, 6, 1, 12, 0, 0)) is False + + +def test_account_needs_refresh_quota_exceeded_future_reset_returns_false(): + acc = _make_account("a", "ws_a") + acc.status = AccountStatus.QUOTA_EXCEEDED + acc.reset_at = int(datetime(2024, 6, 1, 13, 0, 0).replace(tzinfo=timezone.utc).timestamp()) + assert _account_needs_post_reset_refresh(acc, datetime(2024, 6, 1, 12, 0, 0)) is False + + +def test_account_needs_refresh_quota_exceeded_past_reset_returns_true(): + acc = _make_account("a", "ws_a") + acc.status = AccountStatus.QUOTA_EXCEEDED + acc.reset_at = int(datetime(2024, 6, 1, 11, 0, 0).replace(tzinfo=timezone.utc).timestamp()) + assert _account_needs_post_reset_refresh(acc, datetime(2024, 6, 1, 12, 0, 0)) is True + + +def test_account_needs_refresh_rate_limited_past_reset_returns_true(): + acc = _make_account("a", "ws_a") + acc.status = AccountStatus.RATE_LIMITED + acc.reset_at = int(datetime(2024, 6, 1, 11, 0, 0).replace(tzinfo=timezone.utc).timestamp()) + assert _account_needs_post_reset_refresh(acc, datetime(2024, 6, 1, 12, 0, 0)) is True + + +def test_account_needs_refresh_no_reset_at_returns_false(): + acc = _make_account("a", "ws_a") + acc.status = AccountStatus.QUOTA_EXCEEDED + acc.reset_at = None + assert _account_needs_post_reset_refresh(acc, datetime(2024, 6, 1, 12, 0, 0)) is False